linux/fs/btrfs/extent_io.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2
   3#include <linux/bitops.h>
   4#include <linux/slab.h>
   5#include <linux/bio.h>
   6#include <linux/mm.h>
   7#include <linux/pagemap.h>
   8#include <linux/page-flags.h>
   9#include <linux/spinlock.h>
  10#include <linux/blkdev.h>
  11#include <linux/swap.h>
  12#include <linux/writeback.h>
  13#include <linux/pagevec.h>
  14#include <linux/prefetch.h>
  15#include <linux/cleancache.h>
  16#include "extent_io.h"
  17#include "extent_map.h"
  18#include "ctree.h"
  19#include "btrfs_inode.h"
  20#include "volumes.h"
  21#include "check-integrity.h"
  22#include "locking.h"
  23#include "rcu-string.h"
  24#include "backref.h"
  25#include "disk-io.h"
  26
  27static struct kmem_cache *extent_state_cache;
  28static struct kmem_cache *extent_buffer_cache;
  29static struct bio_set btrfs_bioset;
  30
  31static inline bool extent_state_in_tree(const struct extent_state *state)
  32{
  33        return !RB_EMPTY_NODE(&state->rb_node);
  34}
  35
  36#ifdef CONFIG_BTRFS_DEBUG
  37static LIST_HEAD(buffers);
  38static LIST_HEAD(states);
  39
  40static DEFINE_SPINLOCK(leak_lock);
  41
  42static inline
  43void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
  44{
  45        unsigned long flags;
  46
  47        spin_lock_irqsave(&leak_lock, flags);
  48        list_add(new, head);
  49        spin_unlock_irqrestore(&leak_lock, flags);
  50}
  51
  52static inline
  53void btrfs_leak_debug_del(struct list_head *entry)
  54{
  55        unsigned long flags;
  56
  57        spin_lock_irqsave(&leak_lock, flags);
  58        list_del(entry);
  59        spin_unlock_irqrestore(&leak_lock, flags);
  60}
  61
  62static inline
  63void btrfs_leak_debug_check(void)
  64{
  65        struct extent_state *state;
  66        struct extent_buffer *eb;
  67
  68        while (!list_empty(&states)) {
  69                state = list_entry(states.next, struct extent_state, leak_list);
  70                pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
  71                       state->start, state->end, state->state,
  72                       extent_state_in_tree(state),
  73                       refcount_read(&state->refs));
  74                list_del(&state->leak_list);
  75                kmem_cache_free(extent_state_cache, state);
  76        }
  77
  78        while (!list_empty(&buffers)) {
  79                eb = list_entry(buffers.next, struct extent_buffer, leak_list);
  80                pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
  81                       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
  82                list_del(&eb->leak_list);
  83                kmem_cache_free(extent_buffer_cache, eb);
  84        }
  85}
  86
  87#define btrfs_debug_check_extent_io_range(tree, start, end)             \
  88        __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
  89static inline void __btrfs_debug_check_extent_io_range(const char *caller,
  90                struct extent_io_tree *tree, u64 start, u64 end)
  91{
  92        if (tree->ops && tree->ops->check_extent_io_range)
  93                tree->ops->check_extent_io_range(tree->private_data, caller,
  94                                                 start, end);
  95}
  96#else
  97#define btrfs_leak_debug_add(new, head) do {} while (0)
  98#define btrfs_leak_debug_del(entry)     do {} while (0)
  99#define btrfs_leak_debug_check()        do {} while (0)
 100#define btrfs_debug_check_extent_io_range(c, s, e)      do {} while (0)
 101#endif
 102
 103#define BUFFER_LRU_MAX 64
 104
 105struct tree_entry {
 106        u64 start;
 107        u64 end;
 108        struct rb_node rb_node;
 109};
 110
 111struct extent_page_data {
 112        struct bio *bio;
 113        struct extent_io_tree *tree;
 114        /* tells writepage not to lock the state bits for this range
 115         * it still does the unlocking
 116         */
 117        unsigned int extent_locked:1;
 118
 119        /* tells the submit_bio code to use REQ_SYNC */
 120        unsigned int sync_io:1;
 121};
 122
 123static int add_extent_changeset(struct extent_state *state, unsigned bits,
 124                                 struct extent_changeset *changeset,
 125                                 int set)
 126{
 127        int ret;
 128
 129        if (!changeset)
 130                return 0;
 131        if (set && (state->state & bits) == bits)
 132                return 0;
 133        if (!set && (state->state & bits) == 0)
 134                return 0;
 135        changeset->bytes_changed += state->end - state->start + 1;
 136        ret = ulist_add(&changeset->range_changed, state->start, state->end,
 137                        GFP_ATOMIC);
 138        return ret;
 139}
 140
 141static void flush_write_bio(struct extent_page_data *epd);
 142
 143int __init extent_io_init(void)
 144{
 145        extent_state_cache = kmem_cache_create("btrfs_extent_state",
 146                        sizeof(struct extent_state), 0,
 147                        SLAB_MEM_SPREAD, NULL);
 148        if (!extent_state_cache)
 149                return -ENOMEM;
 150
 151        extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
 152                        sizeof(struct extent_buffer), 0,
 153                        SLAB_MEM_SPREAD, NULL);
 154        if (!extent_buffer_cache)
 155                goto free_state_cache;
 156
 157        if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
 158                        offsetof(struct btrfs_io_bio, bio),
 159                        BIOSET_NEED_BVECS))
 160                goto free_buffer_cache;
 161
 162        if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
 163                goto free_bioset;
 164
 165        return 0;
 166
 167free_bioset:
 168        bioset_exit(&btrfs_bioset);
 169
 170free_buffer_cache:
 171        kmem_cache_destroy(extent_buffer_cache);
 172        extent_buffer_cache = NULL;
 173
 174free_state_cache:
 175        kmem_cache_destroy(extent_state_cache);
 176        extent_state_cache = NULL;
 177        return -ENOMEM;
 178}
 179
 180void __cold extent_io_exit(void)
 181{
 182        btrfs_leak_debug_check();
 183
 184        /*
 185         * Make sure all delayed rcu free are flushed before we
 186         * destroy caches.
 187         */
 188        rcu_barrier();
 189        kmem_cache_destroy(extent_state_cache);
 190        kmem_cache_destroy(extent_buffer_cache);
 191        bioset_exit(&btrfs_bioset);
 192}
 193
 194void extent_io_tree_init(struct extent_io_tree *tree,
 195                         void *private_data)
 196{
 197        tree->state = RB_ROOT;
 198        tree->ops = NULL;
 199        tree->dirty_bytes = 0;
 200        spin_lock_init(&tree->lock);
 201        tree->private_data = private_data;
 202}
 203
 204static struct extent_state *alloc_extent_state(gfp_t mask)
 205{
 206        struct extent_state *state;
 207
 208        /*
 209         * The given mask might be not appropriate for the slab allocator,
 210         * drop the unsupported bits
 211         */
 212        mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
 213        state = kmem_cache_alloc(extent_state_cache, mask);
 214        if (!state)
 215                return state;
 216        state->state = 0;
 217        state->failrec = NULL;
 218        RB_CLEAR_NODE(&state->rb_node);
 219        btrfs_leak_debug_add(&state->leak_list, &states);
 220        refcount_set(&state->refs, 1);
 221        init_waitqueue_head(&state->wq);
 222        trace_alloc_extent_state(state, mask, _RET_IP_);
 223        return state;
 224}
 225
 226void free_extent_state(struct extent_state *state)
 227{
 228        if (!state)
 229                return;
 230        if (refcount_dec_and_test(&state->refs)) {
 231                WARN_ON(extent_state_in_tree(state));
 232                btrfs_leak_debug_del(&state->leak_list);
 233                trace_free_extent_state(state, _RET_IP_);
 234                kmem_cache_free(extent_state_cache, state);
 235        }
 236}
 237
 238static struct rb_node *tree_insert(struct rb_root *root,
 239                                   struct rb_node *search_start,
 240                                   u64 offset,
 241                                   struct rb_node *node,
 242                                   struct rb_node ***p_in,
 243                                   struct rb_node **parent_in)
 244{
 245        struct rb_node **p;
 246        struct rb_node *parent = NULL;
 247        struct tree_entry *entry;
 248
 249        if (p_in && parent_in) {
 250                p = *p_in;
 251                parent = *parent_in;
 252                goto do_insert;
 253        }
 254
 255        p = search_start ? &search_start : &root->rb_node;
 256        while (*p) {
 257                parent = *p;
 258                entry = rb_entry(parent, struct tree_entry, rb_node);
 259
 260                if (offset < entry->start)
 261                        p = &(*p)->rb_left;
 262                else if (offset > entry->end)
 263                        p = &(*p)->rb_right;
 264                else
 265                        return parent;
 266        }
 267
 268do_insert:
 269        rb_link_node(node, parent, p);
 270        rb_insert_color(node, root);
 271        return NULL;
 272}
 273
 274static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 275                                      struct rb_node **prev_ret,
 276                                      struct rb_node **next_ret,
 277                                      struct rb_node ***p_ret,
 278                                      struct rb_node **parent_ret)
 279{
 280        struct rb_root *root = &tree->state;
 281        struct rb_node **n = &root->rb_node;
 282        struct rb_node *prev = NULL;
 283        struct rb_node *orig_prev = NULL;
 284        struct tree_entry *entry;
 285        struct tree_entry *prev_entry = NULL;
 286
 287        while (*n) {
 288                prev = *n;
 289                entry = rb_entry(prev, struct tree_entry, rb_node);
 290                prev_entry = entry;
 291
 292                if (offset < entry->start)
 293                        n = &(*n)->rb_left;
 294                else if (offset > entry->end)
 295                        n = &(*n)->rb_right;
 296                else
 297                        return *n;
 298        }
 299
 300        if (p_ret)
 301                *p_ret = n;
 302        if (parent_ret)
 303                *parent_ret = prev;
 304
 305        if (prev_ret) {
 306                orig_prev = prev;
 307                while (prev && offset > prev_entry->end) {
 308                        prev = rb_next(prev);
 309                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 310                }
 311                *prev_ret = prev;
 312                prev = orig_prev;
 313        }
 314
 315        if (next_ret) {
 316                prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 317                while (prev && offset < prev_entry->start) {
 318                        prev = rb_prev(prev);
 319                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 320                }
 321                *next_ret = prev;
 322        }
 323        return NULL;
 324}
 325
 326static inline struct rb_node *
 327tree_search_for_insert(struct extent_io_tree *tree,
 328                       u64 offset,
 329                       struct rb_node ***p_ret,
 330                       struct rb_node **parent_ret)
 331{
 332        struct rb_node *prev = NULL;
 333        struct rb_node *ret;
 334
 335        ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
 336        if (!ret)
 337                return prev;
 338        return ret;
 339}
 340
 341static inline struct rb_node *tree_search(struct extent_io_tree *tree,
 342                                          u64 offset)
 343{
 344        return tree_search_for_insert(tree, offset, NULL, NULL);
 345}
 346
 347static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
 348                     struct extent_state *other)
 349{
 350        if (tree->ops && tree->ops->merge_extent_hook)
 351                tree->ops->merge_extent_hook(tree->private_data, new, other);
 352}
 353
 354/*
 355 * utility function to look for merge candidates inside a given range.
 356 * Any extents with matching state are merged together into a single
 357 * extent in the tree.  Extents with EXTENT_IO in their state field
 358 * are not merged because the end_io handlers need to be able to do
 359 * operations on them without sleeping (or doing allocations/splits).
 360 *
 361 * This should be called with the tree lock held.
 362 */
 363static void merge_state(struct extent_io_tree *tree,
 364                        struct extent_state *state)
 365{
 366        struct extent_state *other;
 367        struct rb_node *other_node;
 368
 369        if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
 370                return;
 371
 372        other_node = rb_prev(&state->rb_node);
 373        if (other_node) {
 374                other = rb_entry(other_node, struct extent_state, rb_node);
 375                if (other->end == state->start - 1 &&
 376                    other->state == state->state) {
 377                        merge_cb(tree, state, other);
 378                        state->start = other->start;
 379                        rb_erase(&other->rb_node, &tree->state);
 380                        RB_CLEAR_NODE(&other->rb_node);
 381                        free_extent_state(other);
 382                }
 383        }
 384        other_node = rb_next(&state->rb_node);
 385        if (other_node) {
 386                other = rb_entry(other_node, struct extent_state, rb_node);
 387                if (other->start == state->end + 1 &&
 388                    other->state == state->state) {
 389                        merge_cb(tree, state, other);
 390                        state->end = other->end;
 391                        rb_erase(&other->rb_node, &tree->state);
 392                        RB_CLEAR_NODE(&other->rb_node);
 393                        free_extent_state(other);
 394                }
 395        }
 396}
 397
 398static void set_state_cb(struct extent_io_tree *tree,
 399                         struct extent_state *state, unsigned *bits)
 400{
 401        if (tree->ops && tree->ops->set_bit_hook)
 402                tree->ops->set_bit_hook(tree->private_data, state, bits);
 403}
 404
 405static void clear_state_cb(struct extent_io_tree *tree,
 406                           struct extent_state *state, unsigned *bits)
 407{
 408        if (tree->ops && tree->ops->clear_bit_hook)
 409                tree->ops->clear_bit_hook(tree->private_data, state, bits);
 410}
 411
 412static void set_state_bits(struct extent_io_tree *tree,
 413                           struct extent_state *state, unsigned *bits,
 414                           struct extent_changeset *changeset);
 415
 416/*
 417 * insert an extent_state struct into the tree.  'bits' are set on the
 418 * struct before it is inserted.
 419 *
 420 * This may return -EEXIST if the extent is already there, in which case the
 421 * state struct is freed.
 422 *
 423 * The tree lock is not taken internally.  This is a utility function and
 424 * probably isn't what you want to call (see set/clear_extent_bit).
 425 */
 426static int insert_state(struct extent_io_tree *tree,
 427                        struct extent_state *state, u64 start, u64 end,
 428                        struct rb_node ***p,
 429                        struct rb_node **parent,
 430                        unsigned *bits, struct extent_changeset *changeset)
 431{
 432        struct rb_node *node;
 433
 434        if (end < start)
 435                WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n",
 436                       end, start);
 437        state->start = start;
 438        state->end = end;
 439
 440        set_state_bits(tree, state, bits, changeset);
 441
 442        node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
 443        if (node) {
 444                struct extent_state *found;
 445                found = rb_entry(node, struct extent_state, rb_node);
 446                pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n",
 447                       found->start, found->end, start, end);
 448                return -EEXIST;
 449        }
 450        merge_state(tree, state);
 451        return 0;
 452}
 453
 454static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
 455                     u64 split)
 456{
 457        if (tree->ops && tree->ops->split_extent_hook)
 458                tree->ops->split_extent_hook(tree->private_data, orig, split);
 459}
 460
 461/*
 462 * split a given extent state struct in two, inserting the preallocated
 463 * struct 'prealloc' as the newly created second half.  'split' indicates an
 464 * offset inside 'orig' where it should be split.
 465 *
 466 * Before calling,
 467 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 468 * are two extent state structs in the tree:
 469 * prealloc: [orig->start, split - 1]
 470 * orig: [ split, orig->end ]
 471 *
 472 * The tree locks are not taken by this function. They need to be held
 473 * by the caller.
 474 */
 475static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 476                       struct extent_state *prealloc, u64 split)
 477{
 478        struct rb_node *node;
 479
 480        split_cb(tree, orig, split);
 481
 482        prealloc->start = orig->start;
 483        prealloc->end = split - 1;
 484        prealloc->state = orig->state;
 485        orig->start = split;
 486
 487        node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
 488                           &prealloc->rb_node, NULL, NULL);
 489        if (node) {
 490                free_extent_state(prealloc);
 491                return -EEXIST;
 492        }
 493        return 0;
 494}
 495
 496static struct extent_state *next_state(struct extent_state *state)
 497{
 498        struct rb_node *next = rb_next(&state->rb_node);
 499        if (next)
 500                return rb_entry(next, struct extent_state, rb_node);
 501        else
 502                return NULL;
 503}
 504
 505/*
 506 * utility function to clear some bits in an extent state struct.
 507 * it will optionally wake up any one waiting on this state (wake == 1).
 508 *
 509 * If no bits are set on the state struct after clearing things, the
 510 * struct is freed and removed from the tree
 511 */
 512static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
 513                                            struct extent_state *state,
 514                                            unsigned *bits, int wake,
 515                                            struct extent_changeset *changeset)
 516{
 517        struct extent_state *next;
 518        unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
 519        int ret;
 520
 521        if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
 522                u64 range = state->end - state->start + 1;
 523                WARN_ON(range > tree->dirty_bytes);
 524                tree->dirty_bytes -= range;
 525        }
 526        clear_state_cb(tree, state, bits);
 527        ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
 528        BUG_ON(ret < 0);
 529        state->state &= ~bits_to_clear;
 530        if (wake)
 531                wake_up(&state->wq);
 532        if (state->state == 0) {
 533                next = next_state(state);
 534                if (extent_state_in_tree(state)) {
 535                        rb_erase(&state->rb_node, &tree->state);
 536                        RB_CLEAR_NODE(&state->rb_node);
 537                        free_extent_state(state);
 538                } else {
 539                        WARN_ON(1);
 540                }
 541        } else {
 542                merge_state(tree, state);
 543                next = next_state(state);
 544        }
 545        return next;
 546}
 547
 548static struct extent_state *
 549alloc_extent_state_atomic(struct extent_state *prealloc)
 550{
 551        if (!prealloc)
 552                prealloc = alloc_extent_state(GFP_ATOMIC);
 553
 554        return prealloc;
 555}
 556
 557static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
 558{
 559        struct inode *inode = tree->private_data;
 560
 561        btrfs_panic(btrfs_sb(inode->i_sb), err,
 562        "locking error: extent tree was modified by another thread while locked");
 563}
 564
 565/*
 566 * clear some bits on a range in the tree.  This may require splitting
 567 * or inserting elements in the tree, so the gfp mask is used to
 568 * indicate which allocations or sleeping are allowed.
 569 *
 570 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 571 * the given range from the tree regardless of state (ie for truncate).
 572 *
 573 * the range [start, end] is inclusive.
 574 *
 575 * This takes the tree lock, and returns 0 on success and < 0 on error.
 576 */
 577int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 578                              unsigned bits, int wake, int delete,
 579                              struct extent_state **cached_state,
 580                              gfp_t mask, struct extent_changeset *changeset)
 581{
 582        struct extent_state *state;
 583        struct extent_state *cached;
 584        struct extent_state *prealloc = NULL;
 585        struct rb_node *node;
 586        u64 last_end;
 587        int err;
 588        int clear = 0;
 589
 590        btrfs_debug_check_extent_io_range(tree, start, end);
 591
 592        if (bits & EXTENT_DELALLOC)
 593                bits |= EXTENT_NORESERVE;
 594
 595        if (delete)
 596                bits |= ~EXTENT_CTLBITS;
 597        bits |= EXTENT_FIRST_DELALLOC;
 598
 599        if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
 600                clear = 1;
 601again:
 602        if (!prealloc && gfpflags_allow_blocking(mask)) {
 603                /*
 604                 * Don't care for allocation failure here because we might end
 605                 * up not needing the pre-allocated extent state at all, which
 606                 * is the case if we only have in the tree extent states that
 607                 * cover our input range and don't cover too any other range.
 608                 * If we end up needing a new extent state we allocate it later.
 609                 */
 610                prealloc = alloc_extent_state(mask);
 611        }
 612
 613        spin_lock(&tree->lock);
 614        if (cached_state) {
 615                cached = *cached_state;
 616
 617                if (clear) {
 618                        *cached_state = NULL;
 619                        cached_state = NULL;
 620                }
 621
 622                if (cached && extent_state_in_tree(cached) &&
 623                    cached->start <= start && cached->end > start) {
 624                        if (clear)
 625                                refcount_dec(&cached->refs);
 626                        state = cached;
 627                        goto hit_next;
 628                }
 629                if (clear)
 630                        free_extent_state(cached);
 631        }
 632        /*
 633         * this search will find the extents that end after
 634         * our range starts
 635         */
 636        node = tree_search(tree, start);
 637        if (!node)
 638                goto out;
 639        state = rb_entry(node, struct extent_state, rb_node);
 640hit_next:
 641        if (state->start > end)
 642                goto out;
 643        WARN_ON(state->end < start);
 644        last_end = state->end;
 645
 646        /* the state doesn't have the wanted bits, go ahead */
 647        if (!(state->state & bits)) {
 648                state = next_state(state);
 649                goto next;
 650        }
 651
 652        /*
 653         *     | ---- desired range ---- |
 654         *  | state | or
 655         *  | ------------- state -------------- |
 656         *
 657         * We need to split the extent we found, and may flip
 658         * bits on second half.
 659         *
 660         * If the extent we found extends past our range, we
 661         * just split and search again.  It'll get split again
 662         * the next time though.
 663         *
 664         * If the extent we found is inside our range, we clear
 665         * the desired bit on it.
 666         */
 667
 668        if (state->start < start) {
 669                prealloc = alloc_extent_state_atomic(prealloc);
 670                BUG_ON(!prealloc);
 671                err = split_state(tree, state, prealloc, start);
 672                if (err)
 673                        extent_io_tree_panic(tree, err);
 674
 675                prealloc = NULL;
 676                if (err)
 677                        goto out;
 678                if (state->end <= end) {
 679                        state = clear_state_bit(tree, state, &bits, wake,
 680                                                changeset);
 681                        goto next;
 682                }
 683                goto search_again;
 684        }
 685        /*
 686         * | ---- desired range ---- |
 687         *                        | state |
 688         * We need to split the extent, and clear the bit
 689         * on the first half
 690         */
 691        if (state->start <= end && state->end > end) {
 692                prealloc = alloc_extent_state_atomic(prealloc);
 693                BUG_ON(!prealloc);
 694                err = split_state(tree, state, prealloc, end + 1);
 695                if (err)
 696                        extent_io_tree_panic(tree, err);
 697
 698                if (wake)
 699                        wake_up(&state->wq);
 700
 701                clear_state_bit(tree, prealloc, &bits, wake, changeset);
 702
 703                prealloc = NULL;
 704                goto out;
 705        }
 706
 707        state = clear_state_bit(tree, state, &bits, wake, changeset);
 708next:
 709        if (last_end == (u64)-1)
 710                goto out;
 711        start = last_end + 1;
 712        if (start <= end && state && !need_resched())
 713                goto hit_next;
 714
 715search_again:
 716        if (start > end)
 717                goto out;
 718        spin_unlock(&tree->lock);
 719        if (gfpflags_allow_blocking(mask))
 720                cond_resched();
 721        goto again;
 722
 723out:
 724        spin_unlock(&tree->lock);
 725        if (prealloc)
 726                free_extent_state(prealloc);
 727
 728        return 0;
 729
 730}
 731
 732static void wait_on_state(struct extent_io_tree *tree,
 733                          struct extent_state *state)
 734                __releases(tree->lock)
 735                __acquires(tree->lock)
 736{
 737        DEFINE_WAIT(wait);
 738        prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
 739        spin_unlock(&tree->lock);
 740        schedule();
 741        spin_lock(&tree->lock);
 742        finish_wait(&state->wq, &wait);
 743}
 744
 745/*
 746 * waits for one or more bits to clear on a range in the state tree.
 747 * The range [start, end] is inclusive.
 748 * The tree lock is taken by this function
 749 */
 750static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 751                            unsigned long bits)
 752{
 753        struct extent_state *state;
 754        struct rb_node *node;
 755
 756        btrfs_debug_check_extent_io_range(tree, start, end);
 757
 758        spin_lock(&tree->lock);
 759again:
 760        while (1) {
 761                /*
 762                 * this search will find all the extents that end after
 763                 * our range starts
 764                 */
 765                node = tree_search(tree, start);
 766process_node:
 767                if (!node)
 768                        break;
 769
 770                state = rb_entry(node, struct extent_state, rb_node);
 771
 772                if (state->start > end)
 773                        goto out;
 774
 775                if (state->state & bits) {
 776                        start = state->start;
 777                        refcount_inc(&state->refs);
 778                        wait_on_state(tree, state);
 779                        free_extent_state(state);
 780                        goto again;
 781                }
 782                start = state->end + 1;
 783
 784                if (start > end)
 785                        break;
 786
 787                if (!cond_resched_lock(&tree->lock)) {
 788                        node = rb_next(node);
 789                        goto process_node;
 790                }
 791        }
 792out:
 793        spin_unlock(&tree->lock);
 794}
 795
 796static void set_state_bits(struct extent_io_tree *tree,
 797                           struct extent_state *state,
 798                           unsigned *bits, struct extent_changeset *changeset)
 799{
 800        unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
 801        int ret;
 802
 803        set_state_cb(tree, state, bits);
 804        if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
 805                u64 range = state->end - state->start + 1;
 806                tree->dirty_bytes += range;
 807        }
 808        ret = add_extent_changeset(state, bits_to_set, changeset, 1);
 809        BUG_ON(ret < 0);
 810        state->state |= bits_to_set;
 811}
 812
 813static void cache_state_if_flags(struct extent_state *state,
 814                                 struct extent_state **cached_ptr,
 815                                 unsigned flags)
 816{
 817        if (cached_ptr && !(*cached_ptr)) {
 818                if (!flags || (state->state & flags)) {
 819                        *cached_ptr = state;
 820                        refcount_inc(&state->refs);
 821                }
 822        }
 823}
 824
 825static void cache_state(struct extent_state *state,
 826                        struct extent_state **cached_ptr)
 827{
 828        return cache_state_if_flags(state, cached_ptr,
 829                                    EXTENT_IOBITS | EXTENT_BOUNDARY);
 830}
 831
 832/*
 833 * set some bits on a range in the tree.  This may require allocations or
 834 * sleeping, so the gfp mask is used to indicate what is allowed.
 835 *
 836 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 837 * part of the range already has the desired bits set.  The start of the
 838 * existing range is returned in failed_start in this case.
 839 *
 840 * [start, end] is inclusive This takes the tree lock.
 841 */
 842
 843static int __must_check
 844__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 845                 unsigned bits, unsigned exclusive_bits,
 846                 u64 *failed_start, struct extent_state **cached_state,
 847                 gfp_t mask, struct extent_changeset *changeset)
 848{
 849        struct extent_state *state;
 850        struct extent_state *prealloc = NULL;
 851        struct rb_node *node;
 852        struct rb_node **p;
 853        struct rb_node *parent;
 854        int err = 0;
 855        u64 last_start;
 856        u64 last_end;
 857
 858        btrfs_debug_check_extent_io_range(tree, start, end);
 859
 860        bits |= EXTENT_FIRST_DELALLOC;
 861again:
 862        if (!prealloc && gfpflags_allow_blocking(mask)) {
 863                /*
 864                 * Don't care for allocation failure here because we might end
 865                 * up not needing the pre-allocated extent state at all, which
 866                 * is the case if we only have in the tree extent states that
 867                 * cover our input range and don't cover too any other range.
 868                 * If we end up needing a new extent state we allocate it later.
 869                 */
 870                prealloc = alloc_extent_state(mask);
 871        }
 872
 873        spin_lock(&tree->lock);
 874        if (cached_state && *cached_state) {
 875                state = *cached_state;
 876                if (state->start <= start && state->end > start &&
 877                    extent_state_in_tree(state)) {
 878                        node = &state->rb_node;
 879                        goto hit_next;
 880                }
 881        }
 882        /*
 883         * this search will find all the extents that end after
 884         * our range starts.
 885         */
 886        node = tree_search_for_insert(tree, start, &p, &parent);
 887        if (!node) {
 888                prealloc = alloc_extent_state_atomic(prealloc);
 889                BUG_ON(!prealloc);
 890                err = insert_state(tree, prealloc, start, end,
 891                                   &p, &parent, &bits, changeset);
 892                if (err)
 893                        extent_io_tree_panic(tree, err);
 894
 895                cache_state(prealloc, cached_state);
 896                prealloc = NULL;
 897                goto out;
 898        }
 899        state = rb_entry(node, struct extent_state, rb_node);
 900hit_next:
 901        last_start = state->start;
 902        last_end = state->end;
 903
 904        /*
 905         * | ---- desired range ---- |
 906         * | state |
 907         *
 908         * Just lock what we found and keep going
 909         */
 910        if (state->start == start && state->end <= end) {
 911                if (state->state & exclusive_bits) {
 912                        *failed_start = state->start;
 913                        err = -EEXIST;
 914                        goto out;
 915                }
 916
 917                set_state_bits(tree, state, &bits, changeset);
 918                cache_state(state, cached_state);
 919                merge_state(tree, state);
 920                if (last_end == (u64)-1)
 921                        goto out;
 922                start = last_end + 1;
 923                state = next_state(state);
 924                if (start < end && state && state->start == start &&
 925                    !need_resched())
 926                        goto hit_next;
 927                goto search_again;
 928        }
 929
 930        /*
 931         *     | ---- desired range ---- |
 932         * | state |
 933         *   or
 934         * | ------------- state -------------- |
 935         *
 936         * We need to split the extent we found, and may flip bits on
 937         * second half.
 938         *
 939         * If the extent we found extends past our
 940         * range, we just split and search again.  It'll get split
 941         * again the next time though.
 942         *
 943         * If the extent we found is inside our range, we set the
 944         * desired bit on it.
 945         */
 946        if (state->start < start) {
 947                if (state->state & exclusive_bits) {
 948                        *failed_start = start;
 949                        err = -EEXIST;
 950                        goto out;
 951                }
 952
 953                prealloc = alloc_extent_state_atomic(prealloc);
 954                BUG_ON(!prealloc);
 955                err = split_state(tree, state, prealloc, start);
 956                if (err)
 957                        extent_io_tree_panic(tree, err);
 958
 959                prealloc = NULL;
 960                if (err)
 961                        goto out;
 962                if (state->end <= end) {
 963                        set_state_bits(tree, state, &bits, changeset);
 964                        cache_state(state, cached_state);
 965                        merge_state(tree, state);
 966                        if (last_end == (u64)-1)
 967                                goto out;
 968                        start = last_end + 1;
 969                        state = next_state(state);
 970                        if (start < end && state && state->start == start &&
 971                            !need_resched())
 972                                goto hit_next;
 973                }
 974                goto search_again;
 975        }
 976        /*
 977         * | ---- desired range ---- |
 978         *     | state | or               | state |
 979         *
 980         * There's a hole, we need to insert something in it and
 981         * ignore the extent we found.
 982         */
 983        if (state->start > start) {
 984                u64 this_end;
 985                if (end < last_start)
 986                        this_end = end;
 987                else
 988                        this_end = last_start - 1;
 989
 990                prealloc = alloc_extent_state_atomic(prealloc);
 991                BUG_ON(!prealloc);
 992
 993                /*
 994                 * Avoid to free 'prealloc' if it can be merged with
 995                 * the later extent.
 996                 */
 997                err = insert_state(tree, prealloc, start, this_end,
 998                                   NULL, NULL, &bits, changeset);
 999                if (err)
1000                        extent_io_tree_panic(tree, err);
1001
1002                cache_state(prealloc, cached_state);
1003                prealloc = NULL;
1004                start = this_end + 1;
1005                goto search_again;
1006        }
1007        /*
1008         * | ---- desired range ---- |
1009         *                        | state |
1010         * We need to split the extent, and set the bit
1011         * on the first half
1012         */
1013        if (state->start <= end && state->end > end) {
1014                if (state->state & exclusive_bits) {
1015                        *failed_start = start;
1016                        err = -EEXIST;
1017                        goto out;
1018                }
1019
1020                prealloc = alloc_extent_state_atomic(prealloc);
1021                BUG_ON(!prealloc);
1022                err = split_state(tree, state, prealloc, end + 1);
1023                if (err)
1024                        extent_io_tree_panic(tree, err);
1025
1026                set_state_bits(tree, prealloc, &bits, changeset);
1027                cache_state(prealloc, cached_state);
1028                merge_state(tree, prealloc);
1029                prealloc = NULL;
1030                goto out;
1031        }
1032
1033search_again:
1034        if (start > end)
1035                goto out;
1036        spin_unlock(&tree->lock);
1037        if (gfpflags_allow_blocking(mask))
1038                cond_resched();
1039        goto again;
1040
1041out:
1042        spin_unlock(&tree->lock);
1043        if (prealloc)
1044                free_extent_state(prealloc);
1045
1046        return err;
1047
1048}
1049
1050int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1051                   unsigned bits, u64 * failed_start,
1052                   struct extent_state **cached_state, gfp_t mask)
1053{
1054        return __set_extent_bit(tree, start, end, bits, 0, failed_start,
1055                                cached_state, mask, NULL);
1056}
1057
1058
1059/**
1060 * convert_extent_bit - convert all bits in a given range from one bit to
1061 *                      another
1062 * @tree:       the io tree to search
1063 * @start:      the start offset in bytes
1064 * @end:        the end offset in bytes (inclusive)
1065 * @bits:       the bits to set in this range
1066 * @clear_bits: the bits to clear in this range
1067 * @cached_state:       state that we're going to cache
1068 *
1069 * This will go through and set bits for the given range.  If any states exist
1070 * already in this range they are set with the given bit and cleared of the
1071 * clear_bits.  This is only meant to be used by things that are mergeable, ie
1072 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
1073 * boundary bits like LOCK.
1074 *
1075 * All allocations are done with GFP_NOFS.
1076 */
1077int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1078                       unsigned bits, unsigned clear_bits,
1079                       struct extent_state **cached_state)
1080{
1081        struct extent_state *state;
1082        struct extent_state *prealloc = NULL;
1083        struct rb_node *node;
1084        struct rb_node **p;
1085        struct rb_node *parent;
1086        int err = 0;
1087        u64 last_start;
1088        u64 last_end;
1089        bool first_iteration = true;
1090
1091        btrfs_debug_check_extent_io_range(tree, start, end);
1092
1093again:
1094        if (!prealloc) {
1095                /*
1096                 * Best effort, don't worry if extent state allocation fails
1097                 * here for the first iteration. We might have a cached state
1098                 * that matches exactly the target range, in which case no
1099                 * extent state allocations are needed. We'll only know this
1100                 * after locking the tree.
1101                 */
1102                prealloc = alloc_extent_state(GFP_NOFS);
1103                if (!prealloc && !first_iteration)
1104                        return -ENOMEM;
1105        }
1106
1107        spin_lock(&tree->lock);
1108        if (cached_state && *cached_state) {
1109                state = *cached_state;
1110                if (state->start <= start && state->end > start &&
1111                    extent_state_in_tree(state)) {
1112                        node = &state->rb_node;
1113                        goto hit_next;
1114                }
1115        }
1116
1117        /*
1118         * this search will find all the extents that end after
1119         * our range starts.
1120         */
1121        node = tree_search_for_insert(tree, start, &p, &parent);
1122        if (!node) {
1123                prealloc = alloc_extent_state_atomic(prealloc);
1124                if (!prealloc) {
1125                        err = -ENOMEM;
1126                        goto out;
1127                }
1128                err = insert_state(tree, prealloc, start, end,
1129                                   &p, &parent, &bits, NULL);
1130                if (err)
1131                        extent_io_tree_panic(tree, err);
1132                cache_state(prealloc, cached_state);
1133                prealloc = NULL;
1134                goto out;
1135        }
1136        state = rb_entry(node, struct extent_state, rb_node);
1137hit_next:
1138        last_start = state->start;
1139        last_end = state->end;
1140
1141        /*
1142         * | ---- desired range ---- |
1143         * | state |
1144         *
1145         * Just lock what we found and keep going
1146         */
1147        if (state->start == start && state->end <= end) {
1148                set_state_bits(tree, state, &bits, NULL);
1149                cache_state(state, cached_state);
1150                state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
1151                if (last_end == (u64)-1)
1152                        goto out;
1153                start = last_end + 1;
1154                if (start < end && state && state->start == start &&
1155                    !need_resched())
1156                        goto hit_next;
1157                goto search_again;
1158        }
1159
1160        /*
1161         *     | ---- desired range ---- |
1162         * | state |
1163         *   or
1164         * | ------------- state -------------- |
1165         *
1166         * We need to split the extent we found, and may flip bits on
1167         * second half.
1168         *
1169         * If the extent we found extends past our
1170         * range, we just split and search again.  It'll get split
1171         * again the next time though.
1172         *
1173         * If the extent we found is inside our range, we set the
1174         * desired bit on it.
1175         */
1176        if (state->start < start) {
1177                prealloc = alloc_extent_state_atomic(prealloc);
1178                if (!prealloc) {
1179                        err = -ENOMEM;
1180                        goto out;
1181                }
1182                err = split_state(tree, state, prealloc, start);
1183                if (err)
1184                        extent_io_tree_panic(tree, err);
1185                prealloc = NULL;
1186                if (err)
1187                        goto out;
1188                if (state->end <= end) {
1189                        set_state_bits(tree, state, &bits, NULL);
1190                        cache_state(state, cached_state);
1191                        state = clear_state_bit(tree, state, &clear_bits, 0,
1192                                                NULL);
1193                        if (last_end == (u64)-1)
1194                                goto out;
1195                        start = last_end + 1;
1196                        if (start < end && state && state->start == start &&
1197                            !need_resched())
1198                                goto hit_next;
1199                }
1200                goto search_again;
1201        }
1202        /*
1203         * | ---- desired range ---- |
1204         *     | state | or               | state |
1205         *
1206         * There's a hole, we need to insert something in it and
1207         * ignore the extent we found.
1208         */
1209        if (state->start > start) {
1210                u64 this_end;
1211                if (end < last_start)
1212                        this_end = end;
1213                else
1214                        this_end = last_start - 1;
1215
1216                prealloc = alloc_extent_state_atomic(prealloc);
1217                if (!prealloc) {
1218                        err = -ENOMEM;
1219                        goto out;
1220                }
1221
1222                /*
1223                 * Avoid to free 'prealloc' if it can be merged with
1224                 * the later extent.
1225                 */
1226                err = insert_state(tree, prealloc, start, this_end,
1227                                   NULL, NULL, &bits, NULL);
1228                if (err)
1229                        extent_io_tree_panic(tree, err);
1230                cache_state(prealloc, cached_state);
1231                prealloc = NULL;
1232                start = this_end + 1;
1233                goto search_again;
1234        }
1235        /*
1236         * | ---- desired range ---- |
1237         *                        | state |
1238         * We need to split the extent, and set the bit
1239         * on the first half
1240         */
1241        if (state->start <= end && state->end > end) {
1242                prealloc = alloc_extent_state_atomic(prealloc);
1243                if (!prealloc) {
1244                        err = -ENOMEM;
1245                        goto out;
1246                }
1247
1248                err = split_state(tree, state, prealloc, end + 1);
1249                if (err)
1250                        extent_io_tree_panic(tree, err);
1251
1252                set_state_bits(tree, prealloc, &bits, NULL);
1253                cache_state(prealloc, cached_state);
1254                clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
1255                prealloc = NULL;
1256                goto out;
1257        }
1258
1259search_again:
1260        if (start > end)
1261                goto out;
1262        spin_unlock(&tree->lock);
1263        cond_resched();
1264        first_iteration = false;
1265        goto again;
1266
1267out:
1268        spin_unlock(&tree->lock);
1269        if (prealloc)
1270                free_extent_state(prealloc);
1271
1272        return err;
1273}
1274
1275/* wrappers around set/clear extent bit */
1276int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1277                           unsigned bits, struct extent_changeset *changeset)
1278{
1279        /*
1280         * We don't support EXTENT_LOCKED yet, as current changeset will
1281         * record any bits changed, so for EXTENT_LOCKED case, it will
1282         * either fail with -EEXIST or changeset will record the whole
1283         * range.
1284         */
1285        BUG_ON(bits & EXTENT_LOCKED);
1286
1287        return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
1288                                changeset);
1289}
1290
1291int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1292                     unsigned bits, int wake, int delete,
1293                     struct extent_state **cached)
1294{
1295        return __clear_extent_bit(tree, start, end, bits, wake, delete,
1296                                  cached, GFP_NOFS, NULL);
1297}
1298
1299int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1300                unsigned bits, struct extent_changeset *changeset)
1301{
1302        /*
1303         * Don't support EXTENT_LOCKED case, same reason as
1304         * set_record_extent_bits().
1305         */
1306        BUG_ON(bits & EXTENT_LOCKED);
1307
1308        return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
1309                                  changeset);
1310}
1311
1312/*
1313 * either insert or lock state struct between start and end use mask to tell
1314 * us if waiting is desired.
1315 */
1316int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1317                     struct extent_state **cached_state)
1318{
1319        int err;
1320        u64 failed_start;
1321
1322        while (1) {
1323                err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
1324                                       EXTENT_LOCKED, &failed_start,
1325                                       cached_state, GFP_NOFS, NULL);
1326                if (err == -EEXIST) {
1327                        wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1328                        start = failed_start;
1329                } else
1330                        break;
1331                WARN_ON(start > end);
1332        }
1333        return err;
1334}
1335
1336int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1337{
1338        int err;
1339        u64 failed_start;
1340
1341        err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1342                               &failed_start, NULL, GFP_NOFS, NULL);
1343        if (err == -EEXIST) {
1344                if (failed_start > start)
1345                        clear_extent_bit(tree, start, failed_start - 1,
1346                                         EXTENT_LOCKED, 1, 0, NULL);
1347                return 0;
1348        }
1349        return 1;
1350}
1351
1352void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
1353{
1354        unsigned long index = start >> PAGE_SHIFT;
1355        unsigned long end_index = end >> PAGE_SHIFT;
1356        struct page *page;
1357
1358        while (index <= end_index) {
1359                page = find_get_page(inode->i_mapping, index);
1360                BUG_ON(!page); /* Pages should be in the extent_io_tree */
1361                clear_page_dirty_for_io(page);
1362                put_page(page);
1363                index++;
1364        }
1365}
1366
1367void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1368{
1369        unsigned long index = start >> PAGE_SHIFT;
1370        unsigned long end_index = end >> PAGE_SHIFT;
1371        struct page *page;
1372
1373        while (index <= end_index) {
1374                page = find_get_page(inode->i_mapping, index);
1375                BUG_ON(!page); /* Pages should be in the extent_io_tree */
1376                __set_page_dirty_nobuffers(page);
1377                account_page_redirty(page);
1378                put_page(page);
1379                index++;
1380        }
1381}
1382
1383/* find the first state struct with 'bits' set after 'start', and
1384 * return it.  tree->lock must be held.  NULL will returned if
1385 * nothing was found after 'start'
1386 */
1387static struct extent_state *
1388find_first_extent_bit_state(struct extent_io_tree *tree,
1389                            u64 start, unsigned bits)
1390{
1391        struct rb_node *node;
1392        struct extent_state *state;
1393
1394        /*
1395         * this search will find all the extents that end after
1396         * our range starts.
1397         */
1398        node = tree_search(tree, start);
1399        if (!node)
1400                goto out;
1401
1402        while (1) {
1403                state = rb_entry(node, struct extent_state, rb_node);
1404                if (state->end >= start && (state->state & bits))
1405                        return state;
1406
1407                node = rb_next(node);
1408                if (!node)
1409                        break;
1410        }
1411out:
1412        return NULL;
1413}
1414
1415/*
1416 * find the first offset in the io tree with 'bits' set. zero is
1417 * returned if we find something, and *start_ret and *end_ret are
1418 * set to reflect the state struct that was found.
1419 *
1420 * If nothing was found, 1 is returned. If found something, return 0.
1421 */
1422int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1423                          u64 *start_ret, u64 *end_ret, unsigned bits,
1424                          struct extent_state **cached_state)
1425{
1426        struct extent_state *state;
1427        struct rb_node *n;
1428        int ret = 1;
1429
1430        spin_lock(&tree->lock);
1431        if (cached_state && *cached_state) {
1432                state = *cached_state;
1433                if (state->end == start - 1 && extent_state_in_tree(state)) {
1434                        n = rb_next(&state->rb_node);
1435                        while (n) {
1436                                state = rb_entry(n, struct extent_state,
1437                                                 rb_node);
1438                                if (state->state & bits)
1439                                        goto got_it;
1440                                n = rb_next(n);
1441                        }
1442                        free_extent_state(*cached_state);
1443                        *cached_state = NULL;
1444                        goto out;
1445                }
1446                free_extent_state(*cached_state);
1447                *cached_state = NULL;
1448        }
1449
1450        state = find_first_extent_bit_state(tree, start, bits);
1451got_it:
1452        if (state) {
1453                cache_state_if_flags(state, cached_state, 0);
1454                *start_ret = state->start;
1455                *end_ret = state->end;
1456                ret = 0;
1457        }
1458out:
1459        spin_unlock(&tree->lock);
1460        return ret;
1461}
1462
1463/*
1464 * find a contiguous range of bytes in the file marked as delalloc, not
1465 * more than 'max_bytes'.  start and end are used to return the range,
1466 *
1467 * 1 is returned if we find something, 0 if nothing was in the tree
1468 */
1469static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1470                                        u64 *start, u64 *end, u64 max_bytes,
1471                                        struct extent_state **cached_state)
1472{
1473        struct rb_node *node;
1474        struct extent_state *state;
1475        u64 cur_start = *start;
1476        u64 found = 0;
1477        u64 total_bytes = 0;
1478
1479        spin_lock(&tree->lock);
1480
1481        /*
1482         * this search will find all the extents that end after
1483         * our range starts.
1484         */
1485        node = tree_search(tree, cur_start);
1486        if (!node) {
1487                if (!found)
1488                        *end = (u64)-1;
1489                goto out;
1490        }
1491
1492        while (1) {
1493                state = rb_entry(node, struct extent_state, rb_node);
1494                if (found && (state->start != cur_start ||
1495                              (state->state & EXTENT_BOUNDARY))) {
1496                        goto out;
1497                }
1498                if (!(state->state & EXTENT_DELALLOC)) {
1499                        if (!found)
1500                                *end = state->end;
1501                        goto out;
1502                }
1503                if (!found) {
1504                        *start = state->start;
1505                        *cached_state = state;
1506                        refcount_inc(&state->refs);
1507                }
1508                found++;
1509                *end = state->end;
1510                cur_start = state->end + 1;
1511                node = rb_next(node);
1512                total_bytes += state->end - state->start + 1;
1513                if (total_bytes >= max_bytes)
1514                        break;
1515                if (!node)
1516                        break;
1517        }
1518out:
1519        spin_unlock(&tree->lock);
1520        return found;
1521}
1522
1523static int __process_pages_contig(struct address_space *mapping,
1524                                  struct page *locked_page,
1525                                  pgoff_t start_index, pgoff_t end_index,
1526                                  unsigned long page_ops, pgoff_t *index_ret);
1527
1528static noinline void __unlock_for_delalloc(struct inode *inode,
1529                                           struct page *locked_page,
1530                                           u64 start, u64 end)
1531{
1532        unsigned long index = start >> PAGE_SHIFT;
1533        unsigned long end_index = end >> PAGE_SHIFT;
1534
1535        ASSERT(locked_page);
1536        if (index == locked_page->index && end_index == index)
1537                return;
1538
1539        __process_pages_contig(inode->i_mapping, locked_page, index, end_index,
1540                               PAGE_UNLOCK, NULL);
1541}
1542
1543static noinline int lock_delalloc_pages(struct inode *inode,
1544                                        struct page *locked_page,
1545                                        u64 delalloc_start,
1546                                        u64 delalloc_end)
1547{
1548        unsigned long index = delalloc_start >> PAGE_SHIFT;
1549        unsigned long index_ret = index;
1550        unsigned long end_index = delalloc_end >> PAGE_SHIFT;
1551        int ret;
1552
1553        ASSERT(locked_page);
1554        if (index == locked_page->index && index == end_index)
1555                return 0;
1556
1557        ret = __process_pages_contig(inode->i_mapping, locked_page, index,
1558                                     end_index, PAGE_LOCK, &index_ret);
1559        if (ret == -EAGAIN)
1560                __unlock_for_delalloc(inode, locked_page, delalloc_start,
1561                                      (u64)index_ret << PAGE_SHIFT);
1562        return ret;
1563}
1564
1565/*
1566 * find a contiguous range of bytes in the file marked as delalloc, not
1567 * more than 'max_bytes'.  start and end are used to return the range,
1568 *
1569 * 1 is returned if we find something, 0 if nothing was in the tree
1570 */
1571STATIC u64 find_lock_delalloc_range(struct inode *inode,
1572                                    struct extent_io_tree *tree,
1573                                    struct page *locked_page, u64 *start,
1574                                    u64 *end, u64 max_bytes)
1575{
1576        u64 delalloc_start;
1577        u64 delalloc_end;
1578        u64 found;
1579        struct extent_state *cached_state = NULL;
1580        int ret;
1581        int loops = 0;
1582
1583again:
1584        /* step one, find a bunch of delalloc bytes starting at start */
1585        delalloc_start = *start;
1586        delalloc_end = 0;
1587        found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1588                                    max_bytes, &cached_state);
1589        if (!found || delalloc_end <= *start) {
1590                *start = delalloc_start;
1591                *end = delalloc_end;
1592                free_extent_state(cached_state);
1593                return 0;
1594        }
1595
1596        /*
1597         * start comes from the offset of locked_page.  We have to lock
1598         * pages in order, so we can't process delalloc bytes before
1599         * locked_page
1600         */
1601        if (delalloc_start < *start)
1602                delalloc_start = *start;
1603
1604        /*
1605         * make sure to limit the number of pages we try to lock down
1606         */
1607        if (delalloc_end + 1 - delalloc_start > max_bytes)
1608                delalloc_end = delalloc_start + max_bytes - 1;
1609
1610        /* step two, lock all the pages after the page that has start */
1611        ret = lock_delalloc_pages(inode, locked_page,
1612                                  delalloc_start, delalloc_end);
1613        if (ret == -EAGAIN) {
1614                /* some of the pages are gone, lets avoid looping by
1615                 * shortening the size of the delalloc range we're searching
1616                 */
1617                free_extent_state(cached_state);
1618                cached_state = NULL;
1619                if (!loops) {
1620                        max_bytes = PAGE_SIZE;
1621                        loops = 1;
1622                        goto again;
1623                } else {
1624                        found = 0;
1625                        goto out_failed;
1626                }
1627        }
1628        BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
1629
1630        /* step three, lock the state bits for the whole range */
1631        lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
1632
1633        /* then test to make sure it is all still delalloc */
1634        ret = test_range_bit(tree, delalloc_start, delalloc_end,
1635                             EXTENT_DELALLOC, 1, cached_state);
1636        if (!ret) {
1637                unlock_extent_cached(tree, delalloc_start, delalloc_end,
1638                                     &cached_state);
1639                __unlock_for_delalloc(inode, locked_page,
1640                              delalloc_start, delalloc_end);
1641                cond_resched();
1642                goto again;
1643        }
1644        free_extent_state(cached_state);
1645        *start = delalloc_start;
1646        *end = delalloc_end;
1647out_failed:
1648        return found;
1649}
1650
1651static int __process_pages_contig(struct address_space *mapping,
1652                                  struct page *locked_page,
1653                                  pgoff_t start_index, pgoff_t end_index,
1654                                  unsigned long page_ops, pgoff_t *index_ret)
1655{
1656        unsigned long nr_pages = end_index - start_index + 1;
1657        unsigned long pages_locked = 0;
1658        pgoff_t index = start_index;
1659        struct page *pages[16];
1660        unsigned ret;
1661        int err = 0;
1662        int i;
1663
1664        if (page_ops & PAGE_LOCK) {
1665                ASSERT(page_ops == PAGE_LOCK);
1666                ASSERT(index_ret && *index_ret == start_index);
1667        }
1668
1669        if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
1670                mapping_set_error(mapping, -EIO);
1671
1672        while (nr_pages > 0) {
1673                ret = find_get_pages_contig(mapping, index,
1674                                     min_t(unsigned long,
1675                                     nr_pages, ARRAY_SIZE(pages)), pages);
1676                if (ret == 0) {
1677                        /*
1678                         * Only if we're going to lock these pages,
1679                         * can we find nothing at @index.
1680                         */
1681                        ASSERT(page_ops & PAGE_LOCK);
1682                        err = -EAGAIN;
1683                        goto out;
1684                }
1685
1686                for (i = 0; i < ret; i++) {
1687                        if (page_ops & PAGE_SET_PRIVATE2)
1688                                SetPagePrivate2(pages[i]);
1689
1690                        if (pages[i] == locked_page) {
1691                                put_page(pages[i]);
1692                                pages_locked++;
1693                                continue;
1694                        }
1695                        if (page_ops & PAGE_CLEAR_DIRTY)
1696                                clear_page_dirty_for_io(pages[i]);
1697                        if (page_ops & PAGE_SET_WRITEBACK)
1698                                set_page_writeback(pages[i]);
1699                        if (page_ops & PAGE_SET_ERROR)
1700                                SetPageError(pages[i]);
1701                        if (page_ops & PAGE_END_WRITEBACK)
1702                                end_page_writeback(pages[i]);
1703                        if (page_ops & PAGE_UNLOCK)
1704                                unlock_page(pages[i]);
1705                        if (page_ops & PAGE_LOCK) {
1706                                lock_page(pages[i]);
1707                                if (!PageDirty(pages[i]) ||
1708                                    pages[i]->mapping != mapping) {
1709                                        unlock_page(pages[i]);
1710                                        put_page(pages[i]);
1711                                        err = -EAGAIN;
1712                                        goto out;
1713                                }
1714                        }
1715                        put_page(pages[i]);
1716                        pages_locked++;
1717                }
1718                nr_pages -= ret;
1719                index += ret;
1720                cond_resched();
1721        }
1722out:
1723        if (err && index_ret)
1724                *index_ret = start_index + pages_locked - 1;
1725        return err;
1726}
1727
1728void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
1729                                 u64 delalloc_end, struct page *locked_page,
1730                                 unsigned clear_bits,
1731                                 unsigned long page_ops)
1732{
1733        clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
1734                         NULL);
1735
1736        __process_pages_contig(inode->i_mapping, locked_page,
1737                               start >> PAGE_SHIFT, end >> PAGE_SHIFT,
1738                               page_ops, NULL);
1739}
1740
1741/*
1742 * count the number of bytes in the tree that have a given bit(s)
1743 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1744 * cached.  The total number found is returned.
1745 */
1746u64 count_range_bits(struct extent_io_tree *tree,
1747                     u64 *start, u64 search_end, u64 max_bytes,
1748                     unsigned bits, int contig)
1749{
1750        struct rb_node *node;
1751        struct extent_state *state;
1752        u64 cur_start = *start;
1753        u64 total_bytes = 0;
1754        u64 last = 0;
1755        int found = 0;
1756
1757        if (WARN_ON(search_end <= cur_start))
1758                return 0;
1759
1760        spin_lock(&tree->lock);
1761        if (cur_start == 0 && bits == EXTENT_DIRTY) {
1762                total_bytes = tree->dirty_bytes;
1763                goto out;
1764        }
1765        /*
1766         * this search will find all the extents that end after
1767         * our range starts.
1768         */
1769        node = tree_search(tree, cur_start);
1770        if (!node)
1771                goto out;
1772
1773        while (1) {
1774                state = rb_entry(node, struct extent_state, rb_node);
1775                if (state->start > search_end)
1776                        break;
1777                if (contig && found && state->start > last + 1)
1778                        break;
1779                if (state->end >= cur_start && (state->state & bits) == bits) {
1780                        total_bytes += min(search_end, state->end) + 1 -
1781                                       max(cur_start, state->start);
1782                        if (total_bytes >= max_bytes)
1783                                break;
1784                        if (!found) {
1785                                *start = max(cur_start, state->start);
1786                                found = 1;
1787                        }
1788                        last = state->end;
1789                } else if (contig && found) {
1790                        break;
1791                }
1792                node = rb_next(node);
1793                if (!node)
1794                        break;
1795        }
1796out:
1797        spin_unlock(&tree->lock);
1798        return total_bytes;
1799}
1800
1801/*
1802 * set the private field for a given byte offset in the tree.  If there isn't
1803 * an extent_state there already, this does nothing.
1804 */
1805static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start,
1806                struct io_failure_record *failrec)
1807{
1808        struct rb_node *node;
1809        struct extent_state *state;
1810        int ret = 0;
1811
1812        spin_lock(&tree->lock);
1813        /*
1814         * this search will find all the extents that end after
1815         * our range starts.
1816         */
1817        node = tree_search(tree, start);
1818        if (!node) {
1819                ret = -ENOENT;
1820                goto out;
1821        }
1822        state = rb_entry(node, struct extent_state, rb_node);
1823        if (state->start != start) {
1824                ret = -ENOENT;
1825                goto out;
1826        }
1827        state->failrec = failrec;
1828out:
1829        spin_unlock(&tree->lock);
1830        return ret;
1831}
1832
1833static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start,
1834                struct io_failure_record **failrec)
1835{
1836        struct rb_node *node;
1837        struct extent_state *state;
1838        int ret = 0;
1839
1840        spin_lock(&tree->lock);
1841        /*
1842         * this search will find all the extents that end after
1843         * our range starts.
1844         */
1845        node = tree_search(tree, start);
1846        if (!node) {
1847                ret = -ENOENT;
1848                goto out;
1849        }
1850        state = rb_entry(node, struct extent_state, rb_node);
1851        if (state->start != start) {
1852                ret = -ENOENT;
1853                goto out;
1854        }
1855        *failrec = state->failrec;
1856out:
1857        spin_unlock(&tree->lock);
1858        return ret;
1859}
1860
1861/*
1862 * searches a range in the state tree for a given mask.
1863 * If 'filled' == 1, this returns 1 only if every extent in the tree
1864 * has the bits set.  Otherwise, 1 is returned if any bit in the
1865 * range is found set.
1866 */
1867int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1868                   unsigned bits, int filled, struct extent_state *cached)
1869{
1870        struct extent_state *state = NULL;
1871        struct rb_node *node;
1872        int bitset = 0;
1873
1874        spin_lock(&tree->lock);
1875        if (cached && extent_state_in_tree(cached) && cached->start <= start &&
1876            cached->end > start)
1877                node = &cached->rb_node;
1878        else
1879                node = tree_search(tree, start);
1880        while (node && start <= end) {
1881                state = rb_entry(node, struct extent_state, rb_node);
1882
1883                if (filled && state->start > start) {
1884                        bitset = 0;
1885                        break;
1886                }
1887
1888                if (state->start > end)
1889                        break;
1890
1891                if (state->state & bits) {
1892                        bitset = 1;
1893                        if (!filled)
1894                                break;
1895                } else if (filled) {
1896                        bitset = 0;
1897                        break;
1898                }
1899
1900                if (state->end == (u64)-1)
1901                        break;
1902
1903                start = state->end + 1;
1904                if (start > end)
1905                        break;
1906                node = rb_next(node);
1907                if (!node) {
1908                        if (filled)
1909                                bitset = 0;
1910                        break;
1911                }
1912        }
1913        spin_unlock(&tree->lock);
1914        return bitset;
1915}
1916
1917/*
1918 * helper function to set a given page up to date if all the
1919 * extents in the tree for that page are up to date
1920 */
1921static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
1922{
1923        u64 start = page_offset(page);
1924        u64 end = start + PAGE_SIZE - 1;
1925        if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1926                SetPageUptodate(page);
1927}
1928
1929int free_io_failure(struct extent_io_tree *failure_tree,
1930                    struct extent_io_tree *io_tree,
1931                    struct io_failure_record *rec)
1932{
1933        int ret;
1934        int err = 0;
1935
1936        set_state_failrec(failure_tree, rec->start, NULL);
1937        ret = clear_extent_bits(failure_tree, rec->start,
1938                                rec->start + rec->len - 1,
1939                                EXTENT_LOCKED | EXTENT_DIRTY);
1940        if (ret)
1941                err = ret;
1942
1943        ret = clear_extent_bits(io_tree, rec->start,
1944                                rec->start + rec->len - 1,
1945                                EXTENT_DAMAGED);
1946        if (ret && !err)
1947                err = ret;
1948
1949        kfree(rec);
1950        return err;
1951}
1952
1953/*
1954 * this bypasses the standard btrfs submit functions deliberately, as
1955 * the standard behavior is to write all copies in a raid setup. here we only
1956 * want to write the one bad copy. so we do the mapping for ourselves and issue
1957 * submit_bio directly.
1958 * to avoid any synchronization issues, wait for the data after writing, which
1959 * actually prevents the read that triggered the error from finishing.
1960 * currently, there can be no more than two copies of every data bit. thus,
1961 * exactly one rewrite is required.
1962 */
1963int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
1964                      u64 length, u64 logical, struct page *page,
1965                      unsigned int pg_offset, int mirror_num)
1966{
1967        struct bio *bio;
1968        struct btrfs_device *dev;
1969        u64 map_length = 0;
1970        u64 sector;
1971        struct btrfs_bio *bbio = NULL;
1972        int ret;
1973
1974        ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
1975        BUG_ON(!mirror_num);
1976
1977        bio = btrfs_io_bio_alloc(1);
1978        bio->bi_iter.bi_size = 0;
1979        map_length = length;
1980
1981        /*
1982         * Avoid races with device replace and make sure our bbio has devices
1983         * associated to its stripes that don't go away while we are doing the
1984         * read repair operation.
1985         */
1986        btrfs_bio_counter_inc_blocked(fs_info);
1987        if (btrfs_is_parity_mirror(fs_info, logical, length)) {
1988                /*
1989                 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
1990                 * to update all raid stripes, but here we just want to correct
1991                 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
1992                 * stripe's dev and sector.
1993                 */
1994                ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
1995                                      &map_length, &bbio, 0);
1996                if (ret) {
1997                        btrfs_bio_counter_dec(fs_info);
1998                        bio_put(bio);
1999                        return -EIO;
2000                }
2001                ASSERT(bbio->mirror_num == 1);
2002        } else {
2003                ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
2004                                      &map_length, &bbio, mirror_num);
2005                if (ret) {
2006                        btrfs_bio_counter_dec(fs_info);
2007                        bio_put(bio);
2008                        return -EIO;
2009                }
2010                BUG_ON(mirror_num != bbio->mirror_num);
2011        }
2012
2013        sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
2014        bio->bi_iter.bi_sector = sector;
2015        dev = bbio->stripes[bbio->mirror_num - 1].dev;
2016        btrfs_put_bbio(bbio);
2017        if (!dev || !dev->bdev ||
2018            !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
2019                btrfs_bio_counter_dec(fs_info);
2020                bio_put(bio);
2021                return -EIO;
2022        }
2023        bio_set_dev(bio, dev->bdev);
2024        bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
2025        bio_add_page(bio, page, length, pg_offset);
2026
2027        if (btrfsic_submit_bio_wait(bio)) {
2028                /* try to remap that extent elsewhere? */
2029                btrfs_bio_counter_dec(fs_info);
2030                bio_put(bio);
2031                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
2032                return -EIO;
2033        }
2034
2035        btrfs_info_rl_in_rcu(fs_info,
2036                "read error corrected: ino %llu off %llu (dev %s sector %llu)",
2037                                  ino, start,
2038                                  rcu_str_deref(dev->name), sector);
2039        btrfs_bio_counter_dec(fs_info);
2040        bio_put(bio);
2041        return 0;
2042}
2043
2044int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
2045                         struct extent_buffer *eb, int mirror_num)
2046{
2047        u64 start = eb->start;
2048        int i, num_pages = num_extent_pages(eb);
2049        int ret = 0;
2050
2051        if (sb_rdonly(fs_info->sb))
2052                return -EROFS;
2053
2054        for (i = 0; i < num_pages; i++) {
2055                struct page *p = eb->pages[i];
2056
2057                ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
2058                                        start - page_offset(p), mirror_num);
2059                if (ret)
2060                        break;
2061                start += PAGE_SIZE;
2062        }
2063
2064        return ret;
2065}
2066
2067/*
2068 * each time an IO finishes, we do a fast check in the IO failure tree
2069 * to see if we need to process or clean up an io_failure_record
2070 */
2071int clean_io_failure(struct btrfs_fs_info *fs_info,
2072                     struct extent_io_tree *failure_tree,
2073                     struct extent_io_tree *io_tree, u64 start,
2074                     struct page *page, u64 ino, unsigned int pg_offset)
2075{
2076        u64 private;
2077        struct io_failure_record *failrec;
2078        struct extent_state *state;
2079        int num_copies;
2080        int ret;
2081
2082        private = 0;
2083        ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
2084                               EXTENT_DIRTY, 0);
2085        if (!ret)
2086                return 0;
2087
2088        ret = get_state_failrec(failure_tree, start, &failrec);
2089        if (ret)
2090                return 0;
2091
2092        BUG_ON(!failrec->this_mirror);
2093
2094        if (failrec->in_validation) {
2095                /* there was no real error, just free the record */
2096                btrfs_debug(fs_info,
2097                        "clean_io_failure: freeing dummy error at %llu",
2098                        failrec->start);
2099                goto out;
2100        }
2101        if (sb_rdonly(fs_info->sb))
2102                goto out;
2103
2104        spin_lock(&io_tree->lock);
2105        state = find_first_extent_bit_state(io_tree,
2106                                            failrec->start,
2107                                            EXTENT_LOCKED);
2108        spin_unlock(&io_tree->lock);
2109
2110        if (state && state->start <= failrec->start &&
2111            state->end >= failrec->start + failrec->len - 1) {
2112                num_copies = btrfs_num_copies(fs_info, failrec->logical,
2113                                              failrec->len);
2114                if (num_copies > 1)  {
2115                        repair_io_failure(fs_info, ino, start, failrec->len,
2116                                          failrec->logical, page, pg_offset,
2117                                          failrec->failed_mirror);
2118                }
2119        }
2120
2121out:
2122        free_io_failure(failure_tree, io_tree, failrec);
2123
2124        return 0;
2125}
2126
2127/*
2128 * Can be called when
2129 * - hold extent lock
2130 * - under ordered extent
2131 * - the inode is freeing
2132 */
2133void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
2134{
2135        struct extent_io_tree *failure_tree = &inode->io_failure_tree;
2136        struct io_failure_record *failrec;
2137        struct extent_state *state, *next;
2138
2139        if (RB_EMPTY_ROOT(&failure_tree->state))
2140                return;
2141
2142        spin_lock(&failure_tree->lock);
2143        state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2144        while (state) {
2145                if (state->start > end)
2146                        break;
2147
2148                ASSERT(state->end <= end);
2149
2150                next = next_state(state);
2151
2152                failrec = state->failrec;
2153                free_extent_state(state);
2154                kfree(failrec);
2155
2156                state = next;
2157        }
2158        spin_unlock(&failure_tree->lock);
2159}
2160
2161int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
2162                struct io_failure_record **failrec_ret)
2163{
2164        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2165        struct io_failure_record *failrec;
2166        struct extent_map *em;
2167        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2168        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2169        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2170        int ret;
2171        u64 logical;
2172
2173        ret = get_state_failrec(failure_tree, start, &failrec);
2174        if (ret) {
2175                failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2176                if (!failrec)
2177                        return -ENOMEM;
2178
2179                failrec->start = start;
2180                failrec->len = end - start + 1;
2181                failrec->this_mirror = 0;
2182                failrec->bio_flags = 0;
2183                failrec->in_validation = 0;
2184
2185                read_lock(&em_tree->lock);
2186                em = lookup_extent_mapping(em_tree, start, failrec->len);
2187                if (!em) {
2188                        read_unlock(&em_tree->lock);
2189                        kfree(failrec);
2190                        return -EIO;
2191                }
2192
2193                if (em->start > start || em->start + em->len <= start) {
2194                        free_extent_map(em);
2195                        em = NULL;
2196                }
2197                read_unlock(&em_tree->lock);
2198                if (!em) {
2199                        kfree(failrec);
2200                        return -EIO;
2201                }
2202
2203                logical = start - em->start;
2204                logical = em->block_start + logical;
2205                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2206                        logical = em->block_start;
2207                        failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2208                        extent_set_compress_type(&failrec->bio_flags,
2209                                                 em->compress_type);
2210                }
2211
2212                btrfs_debug(fs_info,
2213                        "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
2214                        logical, start, failrec->len);
2215
2216                failrec->logical = logical;
2217                free_extent_map(em);
2218
2219                /* set the bits in the private failure tree */
2220                ret = set_extent_bits(failure_tree, start, end,
2221                                        EXTENT_LOCKED | EXTENT_DIRTY);
2222                if (ret >= 0)
2223                        ret = set_state_failrec(failure_tree, start, failrec);
2224                /* set the bits in the inode's tree */
2225                if (ret >= 0)
2226                        ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
2227                if (ret < 0) {
2228                        kfree(failrec);
2229                        return ret;
2230                }
2231        } else {
2232                btrfs_debug(fs_info,
2233                        "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
2234                        failrec->logical, failrec->start, failrec->len,
2235                        failrec->in_validation);
2236                /*
2237                 * when data can be on disk more than twice, add to failrec here
2238                 * (e.g. with a list for failed_mirror) to make
2239                 * clean_io_failure() clean all those errors at once.
2240                 */
2241        }
2242
2243        *failrec_ret = failrec;
2244
2245        return 0;
2246}
2247
2248bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
2249                           struct io_failure_record *failrec, int failed_mirror)
2250{
2251        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2252        int num_copies;
2253
2254        num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
2255        if (num_copies == 1) {
2256                /*
2257                 * we only have a single copy of the data, so don't bother with
2258                 * all the retry and error correction code that follows. no
2259                 * matter what the error is, it is very likely to persist.
2260                 */
2261                btrfs_debug(fs_info,
2262                        "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
2263                        num_copies, failrec->this_mirror, failed_mirror);
2264                return false;
2265        }
2266
2267        /*
2268         * there are two premises:
2269         *      a) deliver good data to the caller
2270         *      b) correct the bad sectors on disk
2271         */
2272        if (failed_bio_pages > 1) {
2273                /*
2274                 * to fulfill b), we need to know the exact failing sectors, as
2275                 * we don't want to rewrite any more than the failed ones. thus,
2276                 * we need separate read requests for the failed bio
2277                 *
2278                 * if the following BUG_ON triggers, our validation request got
2279                 * merged. we need separate requests for our algorithm to work.
2280                 */
2281                BUG_ON(failrec->in_validation);
2282                failrec->in_validation = 1;
2283                failrec->this_mirror = failed_mirror;
2284        } else {
2285                /*
2286                 * we're ready to fulfill a) and b) alongside. get a good copy
2287                 * of the failed sector and if we succeed, we have setup
2288                 * everything for repair_io_failure to do the rest for us.
2289                 */
2290                if (failrec->in_validation) {
2291                        BUG_ON(failrec->this_mirror != failed_mirror);
2292                        failrec->in_validation = 0;
2293                        failrec->this_mirror = 0;
2294                }
2295                failrec->failed_mirror = failed_mirror;
2296                failrec->this_mirror++;
2297                if (failrec->this_mirror == failed_mirror)
2298                        failrec->this_mirror++;
2299        }
2300
2301        if (failrec->this_mirror > num_copies) {
2302                btrfs_debug(fs_info,
2303                        "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
2304                        num_copies, failrec->this_mirror, failed_mirror);
2305                return false;
2306        }
2307
2308        return true;
2309}
2310
2311
2312struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
2313                                    struct io_failure_record *failrec,
2314                                    struct page *page, int pg_offset, int icsum,
2315                                    bio_end_io_t *endio_func, void *data)
2316{
2317        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2318        struct bio *bio;
2319        struct btrfs_io_bio *btrfs_failed_bio;
2320        struct btrfs_io_bio *btrfs_bio;
2321
2322        bio = btrfs_io_bio_alloc(1);
2323        bio->bi_end_io = endio_func;
2324        bio->bi_iter.bi_sector = failrec->logical >> 9;
2325        bio_set_dev(bio, fs_info->fs_devices->latest_bdev);
2326        bio->bi_iter.bi_size = 0;
2327        bio->bi_private = data;
2328
2329        btrfs_failed_bio = btrfs_io_bio(failed_bio);
2330        if (btrfs_failed_bio->csum) {
2331                u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
2332
2333                btrfs_bio = btrfs_io_bio(bio);
2334                btrfs_bio->csum = btrfs_bio->csum_inline;
2335                icsum *= csum_size;
2336                memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
2337                       csum_size);
2338        }
2339
2340        bio_add_page(bio, page, failrec->len, pg_offset);
2341
2342        return bio;
2343}
2344
2345/*
2346 * this is a generic handler for readpage errors (default
2347 * readpage_io_failed_hook). if other copies exist, read those and write back
2348 * good data to the failed position. does not investigate in remapping the
2349 * failed extent elsewhere, hoping the device will be smart enough to do this as
2350 * needed
2351 */
2352
2353static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2354                              struct page *page, u64 start, u64 end,
2355                              int failed_mirror)
2356{
2357        struct io_failure_record *failrec;
2358        struct inode *inode = page->mapping->host;
2359        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2360        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2361        struct bio *bio;
2362        int read_mode = 0;
2363        blk_status_t status;
2364        int ret;
2365        unsigned failed_bio_pages = bio_pages_all(failed_bio);
2366
2367        BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2368
2369        ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
2370        if (ret)
2371                return ret;
2372
2373        if (!btrfs_check_repairable(inode, failed_bio_pages, failrec,
2374                                    failed_mirror)) {
2375                free_io_failure(failure_tree, tree, failrec);
2376                return -EIO;
2377        }
2378
2379        if (failed_bio_pages > 1)
2380                read_mode |= REQ_FAILFAST_DEV;
2381
2382        phy_offset >>= inode->i_sb->s_blocksize_bits;
2383        bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
2384                                      start - page_offset(page),
2385                                      (int)phy_offset, failed_bio->bi_end_io,
2386                                      NULL);
2387        bio->bi_opf = REQ_OP_READ | read_mode;
2388
2389        btrfs_debug(btrfs_sb(inode->i_sb),
2390                "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d",
2391                read_mode, failrec->this_mirror, failrec->in_validation);
2392
2393        status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
2394                                         failrec->bio_flags, 0);
2395        if (status) {
2396                free_io_failure(failure_tree, tree, failrec);
2397                bio_put(bio);
2398                ret = blk_status_to_errno(status);
2399        }
2400
2401        return ret;
2402}
2403
2404/* lots and lots of room for performance fixes in the end_bio funcs */
2405
2406void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2407{
2408        int uptodate = (err == 0);
2409        struct extent_io_tree *tree;
2410        int ret = 0;
2411
2412        tree = &BTRFS_I(page->mapping->host)->io_tree;
2413
2414        if (tree->ops && tree->ops->writepage_end_io_hook)
2415                tree->ops->writepage_end_io_hook(page, start, end, NULL,
2416                                uptodate);
2417
2418        if (!uptodate) {
2419                ClearPageUptodate(page);
2420                SetPageError(page);
2421                ret = err < 0 ? err : -EIO;
2422                mapping_set_error(page->mapping, ret);
2423        }
2424}
2425
2426/*
2427 * after a writepage IO is done, we need to:
2428 * clear the uptodate bits on error
2429 * clear the writeback bits in the extent tree for this IO
2430 * end_page_writeback if the page has no more pending IO
2431 *
2432 * Scheduling is not allowed, so the extent state tree is expected
2433 * to have one and only one object corresponding to this IO.
2434 */
2435static void end_bio_extent_writepage(struct bio *bio)
2436{
2437        int error = blk_status_to_errno(bio->bi_status);
2438        struct bio_vec *bvec;
2439        u64 start;
2440        u64 end;
2441        int i;
2442
2443        ASSERT(!bio_flagged(bio, BIO_CLONED));
2444        bio_for_each_segment_all(bvec, bio, i) {
2445                struct page *page = bvec->bv_page;
2446                struct inode *inode = page->mapping->host;
2447                struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2448
2449                /* We always issue full-page reads, but if some block
2450                 * in a page fails to read, blk_update_request() will
2451                 * advance bv_offset and adjust bv_len to compensate.
2452                 * Print a warning for nonzero offsets, and an error
2453                 * if they don't add up to a full page.  */
2454                if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
2455                        if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
2456                                btrfs_err(fs_info,
2457                                   "partial page write in btrfs with offset %u and length %u",
2458                                        bvec->bv_offset, bvec->bv_len);
2459                        else
2460                                btrfs_info(fs_info,
2461                                   "incomplete page write in btrfs with offset %u and length %u",
2462                                        bvec->bv_offset, bvec->bv_len);
2463                }
2464
2465                start = page_offset(page);
2466                end = start + bvec->bv_offset + bvec->bv_len - 1;
2467
2468                end_extent_writepage(page, error, start, end);
2469                end_page_writeback(page);
2470        }
2471
2472        bio_put(bio);
2473}
2474
2475static void
2476endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
2477                              int uptodate)
2478{
2479        struct extent_state *cached = NULL;
2480        u64 end = start + len - 1;
2481
2482        if (uptodate && tree->track_uptodate)
2483                set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
2484        unlock_extent_cached_atomic(tree, start, end, &cached);
2485}
2486
2487/*
2488 * after a readpage IO is done, we need to:
2489 * clear the uptodate bits on error
2490 * set the uptodate bits if things worked
2491 * set the page up to date if all extents in the tree are uptodate
2492 * clear the lock bit in the extent tree
2493 * unlock the page if there are no other extents locked for it
2494 *
2495 * Scheduling is not allowed, so the extent state tree is expected
2496 * to have one and only one object corresponding to this IO.
2497 */
2498static void end_bio_extent_readpage(struct bio *bio)
2499{
2500        struct bio_vec *bvec;
2501        int uptodate = !bio->bi_status;
2502        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
2503        struct extent_io_tree *tree, *failure_tree;
2504        u64 offset = 0;
2505        u64 start;
2506        u64 end;
2507        u64 len;
2508        u64 extent_start = 0;
2509        u64 extent_len = 0;
2510        int mirror;
2511        int ret;
2512        int i;
2513
2514        ASSERT(!bio_flagged(bio, BIO_CLONED));
2515        bio_for_each_segment_all(bvec, bio, i) {
2516                struct page *page = bvec->bv_page;
2517                struct inode *inode = page->mapping->host;
2518                struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2519
2520                btrfs_debug(fs_info,
2521                        "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
2522                        (u64)bio->bi_iter.bi_sector, bio->bi_status,
2523                        io_bio->mirror_num);
2524                tree = &BTRFS_I(inode)->io_tree;
2525                failure_tree = &BTRFS_I(inode)->io_failure_tree;
2526
2527                /* We always issue full-page reads, but if some block
2528                 * in a page fails to read, blk_update_request() will
2529                 * advance bv_offset and adjust bv_len to compensate.
2530                 * Print a warning for nonzero offsets, and an error
2531                 * if they don't add up to a full page.  */
2532                if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
2533                        if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
2534                                btrfs_err(fs_info,
2535                                        "partial page read in btrfs with offset %u and length %u",
2536                                        bvec->bv_offset, bvec->bv_len);
2537                        else
2538                                btrfs_info(fs_info,
2539                                        "incomplete page read in btrfs with offset %u and length %u",
2540                                        bvec->bv_offset, bvec->bv_len);
2541                }
2542
2543                start = page_offset(page);
2544                end = start + bvec->bv_offset + bvec->bv_len - 1;
2545                len = bvec->bv_len;
2546
2547                mirror = io_bio->mirror_num;
2548                if (likely(uptodate && tree->ops)) {
2549                        ret = tree->ops->readpage_end_io_hook(io_bio, offset,
2550                                                              page, start, end,
2551                                                              mirror);
2552                        if (ret)
2553                                uptodate = 0;
2554                        else
2555                                clean_io_failure(BTRFS_I(inode)->root->fs_info,
2556                                                 failure_tree, tree, start,
2557                                                 page,
2558                                                 btrfs_ino(BTRFS_I(inode)), 0);
2559                }
2560
2561                if (likely(uptodate))
2562                        goto readpage_ok;
2563
2564                if (tree->ops) {
2565                        ret = tree->ops->readpage_io_failed_hook(page, mirror);
2566                        if (ret == -EAGAIN) {
2567                                /*
2568                                 * Data inode's readpage_io_failed_hook() always
2569                                 * returns -EAGAIN.
2570                                 *
2571                                 * The generic bio_readpage_error handles errors
2572                                 * the following way: If possible, new read
2573                                 * requests are created and submitted and will
2574                                 * end up in end_bio_extent_readpage as well (if
2575                                 * we're lucky, not in the !uptodate case). In
2576                                 * that case it returns 0 and we just go on with
2577                                 * the next page in our bio. If it can't handle
2578                                 * the error it will return -EIO and we remain
2579                                 * responsible for that page.
2580                                 */
2581                                ret = bio_readpage_error(bio, offset, page,
2582                                                         start, end, mirror);
2583                                if (ret == 0) {
2584                                        uptodate = !bio->bi_status;
2585                                        offset += len;
2586                                        continue;
2587                                }
2588                        }
2589
2590                        /*
2591                         * metadata's readpage_io_failed_hook() always returns
2592                         * -EIO and fixes nothing.  -EIO is also returned if
2593                         * data inode error could not be fixed.
2594                         */
2595                        ASSERT(ret == -EIO);
2596                }
2597readpage_ok:
2598                if (likely(uptodate)) {
2599                        loff_t i_size = i_size_read(inode);
2600                        pgoff_t end_index = i_size >> PAGE_SHIFT;
2601                        unsigned off;
2602
2603                        /* Zero out the end if this page straddles i_size */
2604                        off = i_size & (PAGE_SIZE-1);
2605                        if (page->index == end_index && off)
2606                                zero_user_segment(page, off, PAGE_SIZE);
2607                        SetPageUptodate(page);
2608                } else {
2609                        ClearPageUptodate(page);
2610                        SetPageError(page);
2611                }
2612                unlock_page(page);
2613                offset += len;
2614
2615                if (unlikely(!uptodate)) {
2616                        if (extent_len) {
2617                                endio_readpage_release_extent(tree,
2618                                                              extent_start,
2619                                                              extent_len, 1);
2620                                extent_start = 0;
2621                                extent_len = 0;
2622                        }
2623                        endio_readpage_release_extent(tree, start,
2624                                                      end - start + 1, 0);
2625                } else if (!extent_len) {
2626                        extent_start = start;
2627                        extent_len = end + 1 - start;
2628                } else if (extent_start + extent_len == start) {
2629                        extent_len += end + 1 - start;
2630                } else {
2631                        endio_readpage_release_extent(tree, extent_start,
2632                                                      extent_len, uptodate);
2633                        extent_start = start;
2634                        extent_len = end + 1 - start;
2635                }
2636        }
2637
2638        if (extent_len)
2639                endio_readpage_release_extent(tree, extent_start, extent_len,
2640                                              uptodate);
2641        if (io_bio->end_io)
2642                io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status));
2643        bio_put(bio);
2644}
2645
2646/*
2647 * Initialize the members up to but not including 'bio'. Use after allocating a
2648 * new bio by bio_alloc_bioset as it does not initialize the bytes outside of
2649 * 'bio' because use of __GFP_ZERO is not supported.
2650 */
2651static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
2652{
2653        memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
2654}
2655
2656/*
2657 * The following helpers allocate a bio. As it's backed by a bioset, it'll
2658 * never fail.  We're returning a bio right now but you can call btrfs_io_bio
2659 * for the appropriate container_of magic
2660 */
2661struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte)
2662{
2663        struct bio *bio;
2664
2665        bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset);
2666        bio_set_dev(bio, bdev);
2667        bio->bi_iter.bi_sector = first_byte >> 9;
2668        btrfs_io_bio_init(btrfs_io_bio(bio));
2669        return bio;
2670}
2671
2672struct bio *btrfs_bio_clone(struct bio *bio)
2673{
2674        struct btrfs_io_bio *btrfs_bio;
2675        struct bio *new;
2676
2677        /* Bio allocation backed by a bioset does not fail */
2678        new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
2679        btrfs_bio = btrfs_io_bio(new);
2680        btrfs_io_bio_init(btrfs_bio);
2681        btrfs_bio->iter = bio->bi_iter;
2682        return new;
2683}
2684
2685struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
2686{
2687        struct bio *bio;
2688
2689        /* Bio allocation backed by a bioset does not fail */
2690        bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
2691        btrfs_io_bio_init(btrfs_io_bio(bio));
2692        return bio;
2693}
2694
2695struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
2696{
2697        struct bio *bio;
2698        struct btrfs_io_bio *btrfs_bio;
2699
2700        /* this will never fail when it's backed by a bioset */
2701        bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
2702        ASSERT(bio);
2703
2704        btrfs_bio = btrfs_io_bio(bio);
2705        btrfs_io_bio_init(btrfs_bio);
2706
2707        bio_trim(bio, offset >> 9, size >> 9);
2708        btrfs_bio->iter = bio->bi_iter;
2709        return bio;
2710}
2711
2712static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
2713                                       unsigned long bio_flags)
2714{
2715        blk_status_t ret = 0;
2716        struct bio_vec *bvec = bio_last_bvec_all(bio);
2717        struct page *page = bvec->bv_page;
2718        struct extent_io_tree *tree = bio->bi_private;
2719        u64 start;
2720
2721        start = page_offset(page) + bvec->bv_offset;
2722
2723        bio->bi_private = NULL;
2724
2725        if (tree->ops)
2726                ret = tree->ops->submit_bio_hook(tree->private_data, bio,
2727                                           mirror_num, bio_flags, start);
2728        else
2729                btrfsic_submit_bio(bio);
2730
2731        return blk_status_to_errno(ret);
2732}
2733
2734/*
2735 * @opf:        bio REQ_OP_* and REQ_* flags as one value
2736 * @tree:       tree so we can call our merge_bio hook
2737 * @wbc:        optional writeback control for io accounting
2738 * @page:       page to add to the bio
2739 * @pg_offset:  offset of the new bio or to check whether we are adding
2740 *              a contiguous page to the previous one
2741 * @size:       portion of page that we want to write
2742 * @offset:     starting offset in the page
2743 * @bdev:       attach newly created bios to this bdev
2744 * @bio_ret:    must be valid pointer, newly allocated bio will be stored there
2745 * @end_io_func:     end_io callback for new bio
2746 * @mirror_num:      desired mirror to read/write
2747 * @prev_bio_flags:  flags of previous bio to see if we can merge the current one
2748 * @bio_flags:  flags of the current bio to see if we can merge them
2749 */
2750static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
2751                              struct writeback_control *wbc,
2752                              struct page *page, u64 offset,
2753                              size_t size, unsigned long pg_offset,
2754                              struct block_device *bdev,
2755                              struct bio **bio_ret,
2756                              bio_end_io_t end_io_func,
2757                              int mirror_num,
2758                              unsigned long prev_bio_flags,
2759                              unsigned long bio_flags,
2760                              bool force_bio_submit)
2761{
2762        int ret = 0;
2763        struct bio *bio;
2764        size_t page_size = min_t(size_t, size, PAGE_SIZE);
2765        sector_t sector = offset >> 9;
2766
2767        ASSERT(bio_ret);
2768
2769        if (*bio_ret) {
2770                bool contig;
2771                bool can_merge = true;
2772
2773                bio = *bio_ret;
2774                if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
2775                        contig = bio->bi_iter.bi_sector == sector;
2776                else
2777                        contig = bio_end_sector(bio) == sector;
2778
2779                if (tree->ops && btrfs_merge_bio_hook(page, offset, page_size,
2780                                                      bio, bio_flags))
2781                        can_merge = false;
2782
2783                if (prev_bio_flags != bio_flags || !contig || !can_merge ||
2784                    force_bio_submit ||
2785                    bio_add_page(bio, page, page_size, pg_offset) < page_size) {
2786                        ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
2787                        if (ret < 0) {
2788                                *bio_ret = NULL;
2789                                return ret;
2790                        }
2791                        bio = NULL;
2792                } else {
2793                        if (wbc)
2794                                wbc_account_io(wbc, page, page_size);
2795                        return 0;
2796                }
2797        }
2798
2799        bio = btrfs_bio_alloc(bdev, offset);
2800        bio_add_page(bio, page, page_size, pg_offset);
2801        bio->bi_end_io = end_io_func;
2802        bio->bi_private = tree;
2803        bio->bi_write_hint = page->mapping->host->i_write_hint;
2804        bio->bi_opf = opf;
2805        if (wbc) {
2806                wbc_init_bio(wbc, bio);
2807                wbc_account_io(wbc, page, page_size);
2808        }
2809
2810        *bio_ret = bio;
2811
2812        return ret;
2813}
2814
2815static void attach_extent_buffer_page(struct extent_buffer *eb,
2816                                      struct page *page)
2817{
2818        if (!PagePrivate(page)) {
2819                SetPagePrivate(page);
2820                get_page(page);
2821                set_page_private(page, (unsigned long)eb);
2822        } else {
2823                WARN_ON(page->private != (unsigned long)eb);
2824        }
2825}
2826
2827void set_page_extent_mapped(struct page *page)
2828{
2829        if (!PagePrivate(page)) {
2830                SetPagePrivate(page);
2831                get_page(page);
2832                set_page_private(page, EXTENT_PAGE_PRIVATE);
2833        }
2834}
2835
2836static struct extent_map *
2837__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
2838                 u64 start, u64 len, get_extent_t *get_extent,
2839                 struct extent_map **em_cached)
2840{
2841        struct extent_map *em;
2842
2843        if (em_cached && *em_cached) {
2844                em = *em_cached;
2845                if (extent_map_in_tree(em) && start >= em->start &&
2846                    start < extent_map_end(em)) {
2847                        refcount_inc(&em->refs);
2848                        return em;
2849                }
2850
2851                free_extent_map(em);
2852                *em_cached = NULL;
2853        }
2854
2855        em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0);
2856        if (em_cached && !IS_ERR_OR_NULL(em)) {
2857                BUG_ON(*em_cached);
2858                refcount_inc(&em->refs);
2859                *em_cached = em;
2860        }
2861        return em;
2862}
2863/*
2864 * basic readpage implementation.  Locked extent state structs are inserted
2865 * into the tree that are removed when the IO is done (by the end_io
2866 * handlers)
2867 * XXX JDM: This needs looking at to ensure proper page locking
2868 * return 0 on success, otherwise return error
2869 */
2870static int __do_readpage(struct extent_io_tree *tree,
2871                         struct page *page,
2872                         get_extent_t *get_extent,
2873                         struct extent_map **em_cached,
2874                         struct bio **bio, int mirror_num,
2875                         unsigned long *bio_flags, unsigned int read_flags,
2876                         u64 *prev_em_start)
2877{
2878        struct inode *inode = page->mapping->host;
2879        u64 start = page_offset(page);
2880        const u64 end = start + PAGE_SIZE - 1;
2881        u64 cur = start;
2882        u64 extent_offset;
2883        u64 last_byte = i_size_read(inode);
2884        u64 block_start;
2885        u64 cur_end;
2886        struct extent_map *em;
2887        struct block_device *bdev;
2888        int ret = 0;
2889        int nr = 0;
2890        size_t pg_offset = 0;
2891        size_t iosize;
2892        size_t disk_io_size;
2893        size_t blocksize = inode->i_sb->s_blocksize;
2894        unsigned long this_bio_flag = 0;
2895
2896        set_page_extent_mapped(page);
2897
2898        if (!PageUptodate(page)) {
2899                if (cleancache_get_page(page) == 0) {
2900                        BUG_ON(blocksize != PAGE_SIZE);
2901                        unlock_extent(tree, start, end);
2902                        goto out;
2903                }
2904        }
2905
2906        if (page->index == last_byte >> PAGE_SHIFT) {
2907                char *userpage;
2908                size_t zero_offset = last_byte & (PAGE_SIZE - 1);
2909
2910                if (zero_offset) {
2911                        iosize = PAGE_SIZE - zero_offset;
2912                        userpage = kmap_atomic(page);
2913                        memset(userpage + zero_offset, 0, iosize);
2914                        flush_dcache_page(page);
2915                        kunmap_atomic(userpage);
2916                }
2917        }
2918        while (cur <= end) {
2919                bool force_bio_submit = false;
2920                u64 offset;
2921
2922                if (cur >= last_byte) {
2923                        char *userpage;
2924                        struct extent_state *cached = NULL;
2925
2926                        iosize = PAGE_SIZE - pg_offset;
2927                        userpage = kmap_atomic(page);
2928                        memset(userpage + pg_offset, 0, iosize);
2929                        flush_dcache_page(page);
2930                        kunmap_atomic(userpage);
2931                        set_extent_uptodate(tree, cur, cur + iosize - 1,
2932                                            &cached, GFP_NOFS);
2933                        unlock_extent_cached(tree, cur,
2934                                             cur + iosize - 1, &cached);
2935                        break;
2936                }
2937                em = __get_extent_map(inode, page, pg_offset, cur,
2938                                      end - cur + 1, get_extent, em_cached);
2939                if (IS_ERR_OR_NULL(em)) {
2940                        SetPageError(page);
2941                        unlock_extent(tree, cur, end);
2942                        break;
2943                }
2944                extent_offset = cur - em->start;
2945                BUG_ON(extent_map_end(em) <= cur);
2946                BUG_ON(end < cur);
2947
2948                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2949                        this_bio_flag |= EXTENT_BIO_COMPRESSED;
2950                        extent_set_compress_type(&this_bio_flag,
2951                                                 em->compress_type);
2952                }
2953
2954                iosize = min(extent_map_end(em) - cur, end - cur + 1);
2955                cur_end = min(extent_map_end(em) - 1, end);
2956                iosize = ALIGN(iosize, blocksize);
2957                if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2958                        disk_io_size = em->block_len;
2959                        offset = em->block_start;
2960                } else {
2961                        offset = em->block_start + extent_offset;
2962                        disk_io_size = iosize;
2963                }
2964                bdev = em->bdev;
2965                block_start = em->block_start;
2966                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2967                        block_start = EXTENT_MAP_HOLE;
2968
2969                /*
2970                 * If we have a file range that points to a compressed extent
2971                 * and it's followed by a consecutive file range that points to
2972                 * to the same compressed extent (possibly with a different
2973                 * offset and/or length, so it either points to the whole extent
2974                 * or only part of it), we must make sure we do not submit a
2975                 * single bio to populate the pages for the 2 ranges because
2976                 * this makes the compressed extent read zero out the pages
2977                 * belonging to the 2nd range. Imagine the following scenario:
2978                 *
2979                 *  File layout
2980                 *  [0 - 8K]                     [8K - 24K]
2981                 *    |                               |
2982                 *    |                               |
2983                 * points to extent X,         points to extent X,
2984                 * offset 4K, length of 8K     offset 0, length 16K
2985                 *
2986                 * [extent X, compressed length = 4K uncompressed length = 16K]
2987                 *
2988                 * If the bio to read the compressed extent covers both ranges,
2989                 * it will decompress extent X into the pages belonging to the
2990                 * first range and then it will stop, zeroing out the remaining
2991                 * pages that belong to the other range that points to extent X.
2992                 * So here we make sure we submit 2 bios, one for the first
2993                 * range and another one for the third range. Both will target
2994                 * the same physical extent from disk, but we can't currently
2995                 * make the compressed bio endio callback populate the pages
2996                 * for both ranges because each compressed bio is tightly
2997                 * coupled with a single extent map, and each range can have
2998                 * an extent map with a different offset value relative to the
2999                 * uncompressed data of our extent and different lengths. This
3000                 * is a corner case so we prioritize correctness over
3001                 * non-optimal behavior (submitting 2 bios for the same extent).
3002                 */
3003                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
3004                    prev_em_start && *prev_em_start != (u64)-1 &&
3005                    *prev_em_start != em->orig_start)
3006                        force_bio_submit = true;
3007
3008                if (prev_em_start)
3009                        *prev_em_start = em->orig_start;
3010
3011                free_extent_map(em);
3012                em = NULL;
3013
3014                /* we've found a hole, just zero and go on */
3015                if (block_start == EXTENT_MAP_HOLE) {
3016                        char *userpage;
3017                        struct extent_state *cached = NULL;
3018
3019                        userpage = kmap_atomic(page);
3020                        memset(userpage + pg_offset, 0, iosize);
3021                        flush_dcache_page(page);
3022                        kunmap_atomic(userpage);
3023
3024                        set_extent_uptodate(tree, cur, cur + iosize - 1,
3025                                            &cached, GFP_NOFS);
3026                        unlock_extent_cached(tree, cur,
3027                                             cur + iosize - 1, &cached);
3028                        cur = cur + iosize;
3029                        pg_offset += iosize;
3030                        continue;
3031                }
3032                /* the get_extent function already copied into the page */
3033                if (test_range_bit(tree, cur, cur_end,
3034                                   EXTENT_UPTODATE, 1, NULL)) {
3035                        check_page_uptodate(tree, page);
3036                        unlock_extent(tree, cur, cur + iosize - 1);
3037                        cur = cur + iosize;
3038                        pg_offset += iosize;
3039                        continue;
3040                }
3041                /* we have an inline extent but it didn't get marked up
3042                 * to date.  Error out
3043                 */
3044                if (block_start == EXTENT_MAP_INLINE) {
3045                        SetPageError(page);
3046                        unlock_extent(tree, cur, cur + iosize - 1);
3047                        cur = cur + iosize;
3048                        pg_offset += iosize;
3049                        continue;
3050                }
3051
3052                ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL,
3053                                         page, offset, disk_io_size,
3054                                         pg_offset, bdev, bio,
3055                                         end_bio_extent_readpage, mirror_num,
3056                                         *bio_flags,
3057                                         this_bio_flag,
3058                                         force_bio_submit);
3059                if (!ret) {
3060                        nr++;
3061                        *bio_flags = this_bio_flag;
3062                } else {
3063                        SetPageError(page);
3064                        unlock_extent(tree, cur, cur + iosize - 1);
3065                        goto out;
3066                }
3067                cur = cur + iosize;
3068                pg_offset += iosize;
3069        }
3070out:
3071        if (!nr) {
3072                if (!PageError(page))
3073                        SetPageUptodate(page);
3074                unlock_page(page);
3075        }
3076        return ret;
3077}
3078
3079static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
3080                                             struct page *pages[], int nr_pages,
3081                                             u64 start, u64 end,
3082                                             struct extent_map **em_cached,
3083                                             struct bio **bio,
3084                                             unsigned long *bio_flags,
3085                                             u64 *prev_em_start)
3086{
3087        struct inode *inode;
3088        struct btrfs_ordered_extent *ordered;
3089        int index;
3090
3091        inode = pages[0]->mapping->host;
3092        while (1) {
3093                lock_extent(tree, start, end);
3094                ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
3095                                                     end - start + 1);
3096                if (!ordered)
3097                        break;
3098                unlock_extent(tree, start, end);
3099                btrfs_start_ordered_extent(inode, ordered, 1);
3100                btrfs_put_ordered_extent(ordered);
3101        }
3102
3103        for (index = 0; index < nr_pages; index++) {
3104                __do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
3105                                bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);
3106                put_page(pages[index]);
3107        }
3108}
3109
3110static void __extent_readpages(struct extent_io_tree *tree,
3111                               struct page *pages[],
3112                               int nr_pages,
3113                               struct extent_map **em_cached,
3114                               struct bio **bio, unsigned long *bio_flags,
3115                               u64 *prev_em_start)
3116{
3117        u64 start = 0;
3118        u64 end = 0;
3119        u64 page_start;
3120        int index;
3121        int first_index = 0;
3122
3123        for (index = 0; index < nr_pages; index++) {
3124                page_start = page_offset(pages[index]);
3125                if (!end) {
3126                        start = page_start;
3127                        end = start + PAGE_SIZE - 1;
3128                        first_index = index;
3129                } else if (end + 1 == page_start) {
3130                        end += PAGE_SIZE;
3131                } else {
3132                        __do_contiguous_readpages(tree, &pages[first_index],
3133                                                  index - first_index, start,
3134                                                  end, em_cached,
3135                                                  bio, bio_flags,
3136                                                  prev_em_start);
3137                        start = page_start;
3138                        end = start + PAGE_SIZE - 1;
3139                        first_index = index;
3140                }
3141        }
3142
3143        if (end)
3144                __do_contiguous_readpages(tree, &pages[first_index],
3145                                          index - first_index, start,
3146                                          end, em_cached, bio,
3147                                          bio_flags, prev_em_start);
3148}
3149
3150static int __extent_read_full_page(struct extent_io_tree *tree,
3151                                   struct page *page,
3152                                   get_extent_t *get_extent,
3153                                   struct bio **bio, int mirror_num,
3154                                   unsigned long *bio_flags,
3155                                   unsigned int read_flags)
3156{
3157        struct inode *inode = page->mapping->host;
3158        struct btrfs_ordered_extent *ordered;
3159        u64 start = page_offset(page);
3160        u64 end = start + PAGE_SIZE - 1;
3161        int ret;
3162
3163        while (1) {
3164                lock_extent(tree, start, end);
3165                ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
3166                                                PAGE_SIZE);
3167                if (!ordered)
3168                        break;
3169                unlock_extent(tree, start, end);
3170                btrfs_start_ordered_extent(inode, ordered, 1);
3171                btrfs_put_ordered_extent(ordered);
3172        }
3173
3174        ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
3175                            bio_flags, read_flags, NULL);
3176        return ret;
3177}
3178
3179int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
3180                            get_extent_t *get_extent, int mirror_num)
3181{
3182        struct bio *bio = NULL;
3183        unsigned long bio_flags = 0;
3184        int ret;
3185
3186        ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
3187                                      &bio_flags, 0);
3188        if (bio)
3189                ret = submit_one_bio(bio, mirror_num, bio_flags);
3190        return ret;
3191}
3192
3193static void update_nr_written(struct writeback_control *wbc,
3194                              unsigned long nr_written)
3195{
3196        wbc->nr_to_write -= nr_written;
3197}
3198
3199/*
3200 * helper for __extent_writepage, doing all of the delayed allocation setup.
3201 *
3202 * This returns 1 if our fill_delalloc function did all the work required
3203 * to write the page (copy into inline extent).  In this case the IO has
3204 * been started and the page is already unlocked.
3205 *
3206 * This returns 0 if all went well (page still locked)
3207 * This returns < 0 if there were errors (page still locked)
3208 */
3209static noinline_for_stack int writepage_delalloc(struct inode *inode,
3210                              struct page *page, struct writeback_control *wbc,
3211                              struct extent_page_data *epd,
3212                              u64 delalloc_start,
3213                              unsigned long *nr_written)
3214{
3215        struct extent_io_tree *tree = epd->tree;
3216        u64 page_end = delalloc_start + PAGE_SIZE - 1;
3217        u64 nr_delalloc;
3218        u64 delalloc_to_write = 0;
3219        u64 delalloc_end = 0;
3220        int ret;
3221        int page_started = 0;
3222
3223        if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc)
3224                return 0;
3225
3226        while (delalloc_end < page_end) {
3227                nr_delalloc = find_lock_delalloc_range(inode, tree,
3228                                               page,
3229                                               &delalloc_start,
3230                                               &delalloc_end,
3231                                               BTRFS_MAX_EXTENT_SIZE);
3232                if (nr_delalloc == 0) {
3233                        delalloc_start = delalloc_end + 1;
3234                        continue;
3235                }
3236                ret = tree->ops->fill_delalloc(inode, page,
3237                                               delalloc_start,
3238                                               delalloc_end,
3239                                               &page_started,
3240                                               nr_written, wbc);
3241                /* File system has been set read-only */
3242                if (ret) {
3243                        SetPageError(page);
3244                        /* fill_delalloc should be return < 0 for error
3245                         * but just in case, we use > 0 here meaning the
3246                         * IO is started, so we don't want to return > 0
3247                         * unless things are going well.
3248                         */
3249                        ret = ret < 0 ? ret : -EIO;
3250                        goto done;
3251                }
3252                /*
3253                 * delalloc_end is already one less than the total length, so
3254                 * we don't subtract one from PAGE_SIZE
3255                 */
3256                delalloc_to_write += (delalloc_end - delalloc_start +
3257                                      PAGE_SIZE) >> PAGE_SHIFT;
3258                delalloc_start = delalloc_end + 1;
3259        }
3260        if (wbc->nr_to_write < delalloc_to_write) {
3261                int thresh = 8192;
3262
3263                if (delalloc_to_write < thresh * 2)
3264                        thresh = delalloc_to_write;
3265                wbc->nr_to_write = min_t(u64, delalloc_to_write,
3266                                         thresh);
3267        }
3268
3269        /* did the fill delalloc function already unlock and start
3270         * the IO?
3271         */
3272        if (page_started) {
3273                /*
3274                 * we've unlocked the page, so we can't update
3275                 * the mapping's writeback index, just update
3276                 * nr_to_write.
3277                 */
3278                wbc->nr_to_write -= *nr_written;
3279                return 1;
3280        }
3281
3282        ret = 0;
3283
3284done:
3285        return ret;
3286}
3287
3288/*
3289 * helper for __extent_writepage.  This calls the writepage start hooks,
3290 * and does the loop to map the page into extents and bios.
3291 *
3292 * We return 1 if the IO is started and the page is unlocked,
3293 * 0 if all went well (page still locked)
3294 * < 0 if there were errors (page still locked)
3295 */
3296static noinline_for_stack int __extent_writepage_io(struct inode *inode,
3297                                 struct page *page,
3298                                 struct writeback_control *wbc,
3299                                 struct extent_page_data *epd,
3300                                 loff_t i_size,
3301                                 unsigned long nr_written,
3302                                 unsigned int write_flags, int *nr_ret)
3303{
3304        struct extent_io_tree *tree = epd->tree;
3305        u64 start = page_offset(page);
3306        u64 page_end = start + PAGE_SIZE - 1;
3307        u64 end;
3308        u64 cur = start;
3309        u64 extent_offset;
3310        u64 block_start;
3311        u64 iosize;
3312        struct extent_map *em;
3313        struct block_device *bdev;
3314        size_t pg_offset = 0;
3315        size_t blocksize;
3316        int ret = 0;
3317        int nr = 0;
3318        bool compressed;
3319
3320        if (tree->ops && tree->ops->writepage_start_hook) {
3321                ret = tree->ops->writepage_start_hook(page, start,
3322                                                      page_end);
3323                if (ret) {
3324                        /* Fixup worker will requeue */
3325                        if (ret == -EBUSY)
3326                                wbc->pages_skipped++;
3327                        else
3328                                redirty_page_for_writepage(wbc, page);
3329
3330                        update_nr_written(wbc, nr_written);
3331                        unlock_page(page);
3332                        return 1;
3333                }
3334        }
3335
3336        /*
3337         * we don't want to touch the inode after unlocking the page,
3338         * so we update the mapping writeback index now
3339         */
3340        update_nr_written(wbc, nr_written + 1);
3341
3342        end = page_end;
3343        if (i_size <= start) {
3344                if (tree->ops && tree->ops->writepage_end_io_hook)
3345                        tree->ops->writepage_end_io_hook(page, start,
3346                                                         page_end, NULL, 1);
3347                goto done;
3348        }
3349
3350        blocksize = inode->i_sb->s_blocksize;
3351
3352        while (cur <= end) {
3353                u64 em_end;
3354                u64 offset;
3355
3356                if (cur >= i_size) {
3357                        if (tree->ops && tree->ops->writepage_end_io_hook)
3358                                tree->ops->writepage_end_io_hook(page, cur,
3359                                                         page_end, NULL, 1);
3360                        break;
3361                }
3362                em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur,
3363                                     end - cur + 1, 1);
3364                if (IS_ERR_OR_NULL(em)) {
3365                        SetPageError(page);
3366                        ret = PTR_ERR_OR_ZERO(em);
3367                        break;
3368                }
3369
3370                extent_offset = cur - em->start;
3371                em_end = extent_map_end(em);
3372                BUG_ON(em_end <= cur);
3373                BUG_ON(end < cur);
3374                iosize = min(em_end - cur, end - cur + 1);
3375                iosize = ALIGN(iosize, blocksize);
3376                offset = em->block_start + extent_offset;
3377                bdev = em->bdev;
3378                block_start = em->block_start;
3379                compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
3380                free_extent_map(em);
3381                em = NULL;
3382
3383                /*
3384                 * compressed and inline extents are written through other
3385                 * paths in the FS
3386                 */
3387                if (compressed || block_start == EXTENT_MAP_HOLE ||
3388                    block_start == EXTENT_MAP_INLINE) {
3389                        /*
3390                         * end_io notification does not happen here for
3391                         * compressed extents
3392                         */
3393                        if (!compressed && tree->ops &&
3394                            tree->ops->writepage_end_io_hook)
3395                                tree->ops->writepage_end_io_hook(page, cur,
3396                                                         cur + iosize - 1,
3397                                                         NULL, 1);
3398                        else if (compressed) {
3399                                /* we don't want to end_page_writeback on
3400                                 * a compressed extent.  this happens
3401                                 * elsewhere
3402                                 */
3403                                nr++;
3404                        }
3405
3406                        cur += iosize;
3407                        pg_offset += iosize;
3408                        continue;
3409                }
3410
3411                btrfs_set_range_writeback(tree, cur, cur + iosize - 1);
3412                if (!PageWriteback(page)) {
3413                        btrfs_err(BTRFS_I(inode)->root->fs_info,
3414                                   "page %lu not writeback, cur %llu end %llu",
3415                               page->index, cur, end);
3416                }
3417
3418                ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
3419                                         page, offset, iosize, pg_offset,
3420                                         bdev, &epd->bio,
3421                                         end_bio_extent_writepage,
3422                                         0, 0, 0, false);
3423                if (ret) {
3424                        SetPageError(page);
3425                        if (PageWriteback(page))
3426                                end_page_writeback(page);
3427                }
3428
3429                cur = cur + iosize;
3430                pg_offset += iosize;
3431                nr++;
3432        }
3433done:
3434        *nr_ret = nr;
3435        return ret;
3436}
3437
3438/*
3439 * the writepage semantics are similar to regular writepage.  extent
3440 * records are inserted to lock ranges in the tree, and as dirty areas
3441 * are found, they are marked writeback.  Then the lock bits are removed
3442 * and the end_io handler clears the writeback ranges
3443 */
3444static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3445                              struct extent_page_data *epd)
3446{
3447        struct inode *inode = page->mapping->host;
3448        u64 start = page_offset(page);
3449        u64 page_end = start + PAGE_SIZE - 1;
3450        int ret;
3451        int nr = 0;
3452        size_t pg_offset = 0;
3453        loff_t i_size = i_size_read(inode);
3454        unsigned long end_index = i_size >> PAGE_SHIFT;
3455        unsigned int write_flags = 0;
3456        unsigned long nr_written = 0;
3457
3458        write_flags = wbc_to_write_flags(wbc);
3459
3460        trace___extent_writepage(page, inode, wbc);
3461
3462        WARN_ON(!PageLocked(page));
3463
3464        ClearPageError(page);
3465
3466        pg_offset = i_size & (PAGE_SIZE - 1);
3467        if (page->index > end_index ||
3468           (page->index == end_index && !pg_offset)) {
3469                page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
3470                unlock_page(page);
3471                return 0;
3472        }
3473
3474        if (page->index == end_index) {
3475                char *userpage;
3476
3477                userpage = kmap_atomic(page);
3478                memset(userpage + pg_offset, 0,
3479                       PAGE_SIZE - pg_offset);
3480                kunmap_atomic(userpage);
3481                flush_dcache_page(page);
3482        }
3483
3484        pg_offset = 0;
3485
3486        set_page_extent_mapped(page);
3487
3488        ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written);
3489        if (ret == 1)
3490                goto done_unlocked;
3491        if (ret)
3492                goto done;
3493
3494        ret = __extent_writepage_io(inode, page, wbc, epd,
3495                                    i_size, nr_written, write_flags, &nr);
3496        if (ret == 1)
3497                goto done_unlocked;
3498
3499done:
3500        if (nr == 0) {
3501                /* make sure the mapping tag for page dirty gets cleared */
3502                set_page_writeback(page);
3503                end_page_writeback(page);
3504        }
3505        if (PageError(page)) {
3506                ret = ret < 0 ? ret : -EIO;
3507                end_extent_writepage(page, ret, start, page_end);
3508        }
3509        unlock_page(page);
3510        return ret;
3511
3512done_unlocked:
3513        return 0;
3514}
3515
3516void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
3517{
3518        wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
3519                       TASK_UNINTERRUPTIBLE);
3520}
3521
3522static noinline_for_stack int
3523lock_extent_buffer_for_io(struct extent_buffer *eb,
3524                          struct btrfs_fs_info *fs_info,
3525                          struct extent_page_data *epd)
3526{
3527        int i, num_pages;
3528        int flush = 0;
3529        int ret = 0;
3530
3531        if (!btrfs_try_tree_write_lock(eb)) {
3532                flush = 1;
3533                flush_write_bio(epd);
3534                btrfs_tree_lock(eb);
3535        }
3536
3537        if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
3538                btrfs_tree_unlock(eb);
3539                if (!epd->sync_io)
3540                        return 0;
3541                if (!flush) {
3542                        flush_write_bio(epd);
3543                        flush = 1;
3544                }
3545                while (1) {
3546                        wait_on_extent_buffer_writeback(eb);
3547                        btrfs_tree_lock(eb);
3548                        if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
3549                                break;
3550                        btrfs_tree_unlock(eb);
3551                }
3552        }
3553
3554        /*
3555         * We need to do this to prevent races in people who check if the eb is
3556         * under IO since we can end up having no IO bits set for a short period
3557         * of time.
3558         */
3559        spin_lock(&eb->refs_lock);
3560        if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3561                set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3562                spin_unlock(&eb->refs_lock);
3563                btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3564                percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
3565                                         -eb->len,
3566                                         fs_info->dirty_metadata_batch);
3567                ret = 1;
3568        } else {
3569                spin_unlock(&eb->refs_lock);
3570        }
3571
3572        btrfs_tree_unlock(eb);
3573
3574        if (!ret)
3575                return ret;
3576
3577        num_pages = num_extent_pages(eb);
3578        for (i = 0; i < num_pages; i++) {
3579                struct page *p = eb->pages[i];
3580
3581                if (!trylock_page(p)) {
3582                        if (!flush) {
3583                                flush_write_bio(epd);
3584                                flush = 1;
3585                        }
3586                        lock_page(p);
3587                }
3588        }
3589
3590        return ret;
3591}
3592
3593static void end_extent_buffer_writeback(struct extent_buffer *eb)
3594{
3595        clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3596        smp_mb__after_atomic();
3597        wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3598}
3599
3600static void set_btree_ioerr(struct page *page)
3601{
3602        struct extent_buffer *eb = (struct extent_buffer *)page->private;
3603
3604        SetPageError(page);
3605        if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
3606                return;
3607
3608        /*
3609         * If writeback for a btree extent that doesn't belong to a log tree
3610         * failed, increment the counter transaction->eb_write_errors.
3611         * We do this because while the transaction is running and before it's
3612         * committing (when we call filemap_fdata[write|wait]_range against
3613         * the btree inode), we might have
3614         * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
3615         * returns an error or an error happens during writeback, when we're
3616         * committing the transaction we wouldn't know about it, since the pages
3617         * can be no longer dirty nor marked anymore for writeback (if a
3618         * subsequent modification to the extent buffer didn't happen before the
3619         * transaction commit), which makes filemap_fdata[write|wait]_range not
3620         * able to find the pages tagged with SetPageError at transaction
3621         * commit time. So if this happens we must abort the transaction,
3622         * otherwise we commit a super block with btree roots that point to
3623         * btree nodes/leafs whose content on disk is invalid - either garbage
3624         * or the content of some node/leaf from a past generation that got
3625         * cowed or deleted and is no longer valid.
3626         *
3627         * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
3628         * not be enough - we need to distinguish between log tree extents vs
3629         * non-log tree extents, and the next filemap_fdatawait_range() call
3630         * will catch and clear such errors in the mapping - and that call might
3631         * be from a log sync and not from a transaction commit. Also, checking
3632         * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
3633         * not done and would not be reliable - the eb might have been released
3634         * from memory and reading it back again means that flag would not be
3635         * set (since it's a runtime flag, not persisted on disk).
3636         *
3637         * Using the flags below in the btree inode also makes us achieve the
3638         * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
3639         * writeback for all dirty pages and before filemap_fdatawait_range()
3640         * is called, the writeback for all dirty pages had already finished
3641         * with errors - because we were not using AS_EIO/AS_ENOSPC,
3642         * filemap_fdatawait_range() would return success, as it could not know
3643         * that writeback errors happened (the pages were no longer tagged for
3644         * writeback).
3645         */
3646        switch (eb->log_index) {
3647        case -1:
3648                set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
3649                break;
3650        case 0:
3651                set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
3652                break;
3653        case 1:
3654                set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
3655                break;
3656        default:
3657                BUG(); /* unexpected, logic error */
3658        }
3659}
3660
3661static void end_bio_extent_buffer_writepage(struct bio *bio)
3662{
3663        struct bio_vec *bvec;
3664        struct extent_buffer *eb;
3665        int i, done;
3666
3667        ASSERT(!bio_flagged(bio, BIO_CLONED));
3668        bio_for_each_segment_all(bvec, bio, i) {
3669                struct page *page = bvec->bv_page;
3670
3671                eb = (struct extent_buffer *)page->private;
3672                BUG_ON(!eb);
3673                done = atomic_dec_and_test(&eb->io_pages);
3674
3675                if (bio->bi_status ||
3676                    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
3677                        ClearPageUptodate(page);
3678                        set_btree_ioerr(page);
3679                }
3680
3681                end_page_writeback(page);
3682
3683                if (!done)
3684                        continue;
3685
3686                end_extent_buffer_writeback(eb);
3687        }
3688
3689        bio_put(bio);
3690}
3691
3692static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
3693                        struct btrfs_fs_info *fs_info,
3694                        struct writeback_control *wbc,
3695                        struct extent_page_data *epd)
3696{
3697        struct block_device *bdev = fs_info->fs_devices->latest_bdev;
3698        struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
3699        u64 offset = eb->start;
3700        u32 nritems;
3701        int i, num_pages;
3702        unsigned long start, end;
3703        unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
3704        int ret = 0;
3705
3706        clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
3707        num_pages = num_extent_pages(eb);
3708        atomic_set(&eb->io_pages, num_pages);
3709
3710        /* set btree blocks beyond nritems with 0 to avoid stale content. */
3711        nritems = btrfs_header_nritems(eb);
3712        if (btrfs_header_level(eb) > 0) {
3713                end = btrfs_node_key_ptr_offset(nritems);
3714
3715                memzero_extent_buffer(eb, end, eb->len - end);
3716        } else {
3717                /*
3718                 * leaf:
3719                 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
3720                 */
3721                start = btrfs_item_nr_offset(nritems);
3722                end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, eb);
3723                memzero_extent_buffer(eb, start, end - start);
3724        }
3725
3726        for (i = 0; i < num_pages; i++) {
3727                struct page *p = eb->pages[i];
3728
3729                clear_page_dirty_for_io(p);
3730                set_page_writeback(p);
3731                ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
3732                                         p, offset, PAGE_SIZE, 0, bdev,
3733                                         &epd->bio,
3734                                         end_bio_extent_buffer_writepage,
3735                                         0, 0, 0, false);
3736                if (ret) {
3737                        set_btree_ioerr(p);
3738                        if (PageWriteback(p))
3739                                end_page_writeback(p);
3740                        if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
3741                                end_extent_buffer_writeback(eb);
3742                        ret = -EIO;
3743                        break;
3744                }
3745                offset += PAGE_SIZE;
3746                update_nr_written(wbc, 1);
3747                unlock_page(p);
3748        }
3749
3750        if (unlikely(ret)) {
3751                for (; i < num_pages; i++) {
3752                        struct page *p = eb->pages[i];
3753                        clear_page_dirty_for_io(p);
3754                        unlock_page(p);
3755                }
3756        }
3757
3758        return ret;
3759}
3760
3761int btree_write_cache_pages(struct address_space *mapping,
3762                                   struct writeback_control *wbc)
3763{
3764        struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
3765        struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
3766        struct extent_buffer *eb, *prev_eb = NULL;
3767        struct extent_page_data epd = {
3768                .bio = NULL,
3769                .tree = tree,
3770                .extent_locked = 0,
3771                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3772        };
3773        int ret = 0;
3774        int done = 0;
3775        int nr_to_write_done = 0;
3776        struct pagevec pvec;
3777        int nr_pages;
3778        pgoff_t index;
3779        pgoff_t end;            /* Inclusive */
3780        int scanned = 0;
3781        int tag;
3782
3783        pagevec_init(&pvec);
3784        if (wbc->range_cyclic) {
3785                index = mapping->writeback_index; /* Start from prev offset */
3786                end = -1;
3787        } else {
3788                index = wbc->range_start >> PAGE_SHIFT;
3789                end = wbc->range_end >> PAGE_SHIFT;
3790                scanned = 1;
3791        }
3792        if (wbc->sync_mode == WB_SYNC_ALL)
3793                tag = PAGECACHE_TAG_TOWRITE;
3794        else
3795                tag = PAGECACHE_TAG_DIRTY;
3796retry:
3797        if (wbc->sync_mode == WB_SYNC_ALL)
3798                tag_pages_for_writeback(mapping, index, end);
3799        while (!done && !nr_to_write_done && (index <= end) &&
3800               (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
3801                        tag))) {
3802                unsigned i;
3803
3804                scanned = 1;
3805                for (i = 0; i < nr_pages; i++) {
3806                        struct page *page = pvec.pages[i];
3807
3808                        if (!PagePrivate(page))
3809                                continue;
3810
3811                        spin_lock(&mapping->private_lock);
3812                        if (!PagePrivate(page)) {
3813                                spin_unlock(&mapping->private_lock);
3814                                continue;
3815                        }
3816
3817                        eb = (struct extent_buffer *)page->private;
3818
3819                        /*
3820                         * Shouldn't happen and normally this would be a BUG_ON
3821                         * but no sense in crashing the users box for something
3822                         * we can survive anyway.
3823                         */
3824                        if (WARN_ON(!eb)) {
3825                                spin_unlock(&mapping->private_lock);
3826                                continue;
3827                        }
3828
3829                        if (eb == prev_eb) {
3830                                spin_unlock(&mapping->private_lock);
3831                                continue;
3832                        }
3833
3834                        ret = atomic_inc_not_zero(&eb->refs);
3835                        spin_unlock(&mapping->private_lock);
3836                        if (!ret)
3837                                continue;
3838
3839                        prev_eb = eb;
3840                        ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
3841                        if (!ret) {
3842                                free_extent_buffer(eb);
3843                                continue;
3844                        }
3845
3846                        ret = write_one_eb(eb, fs_info, wbc, &epd);
3847                        if (ret) {
3848                                done = 1;
3849                                free_extent_buffer(eb);
3850                                break;
3851                        }
3852                        free_extent_buffer(eb);
3853
3854                        /*
3855                         * the filesystem may choose to bump up nr_to_write.
3856                         * We have to make sure to honor the new nr_to_write
3857                         * at any time
3858                         */
3859                        nr_to_write_done = wbc->nr_to_write <= 0;
3860                }
3861                pagevec_release(&pvec);
3862                cond_resched();
3863        }
3864        if (!scanned && !done) {
3865                /*
3866                 * We hit the last page and there is more work to be done: wrap
3867                 * back to the start of the file
3868                 */
3869                scanned = 1;
3870                index = 0;
3871                goto retry;
3872        }
3873        flush_write_bio(&epd);
3874        return ret;
3875}
3876
3877/**
3878 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
3879 * @mapping: address space structure to write
3880 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
3881 * @data: data passed to __extent_writepage function
3882 *
3883 * If a page is already under I/O, write_cache_pages() skips it, even
3884 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
3885 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
3886 * and msync() need to guarantee that all the data which was dirty at the time
3887 * the call was made get new I/O started against them.  If wbc->sync_mode is
3888 * WB_SYNC_ALL then we were called for data integrity and we must wait for
3889 * existing IO to complete.
3890 */
3891static int extent_write_cache_pages(struct address_space *mapping,
3892                             struct writeback_control *wbc,
3893                             struct extent_page_data *epd)
3894{
3895        struct inode *inode = mapping->host;
3896        int ret = 0;
3897        int done = 0;
3898        int nr_to_write_done = 0;
3899        struct pagevec pvec;
3900        int nr_pages;
3901        pgoff_t index;
3902        pgoff_t end;            /* Inclusive */
3903        pgoff_t done_index;
3904        int range_whole = 0;
3905        int scanned = 0;
3906        int tag;
3907
3908        /*
3909         * We have to hold onto the inode so that ordered extents can do their
3910         * work when the IO finishes.  The alternative to this is failing to add
3911         * an ordered extent if the igrab() fails there and that is a huge pain
3912         * to deal with, so instead just hold onto the inode throughout the
3913         * writepages operation.  If it fails here we are freeing up the inode
3914         * anyway and we'd rather not waste our time writing out stuff that is
3915         * going to be truncated anyway.
3916         */
3917        if (!igrab(inode))
3918                return 0;
3919
3920        pagevec_init(&pvec);
3921        if (wbc->range_cyclic) {
3922                index = mapping->writeback_index; /* Start from prev offset */
3923                end = -1;
3924        } else {
3925                index = wbc->range_start >> PAGE_SHIFT;
3926                end = wbc->range_end >> PAGE_SHIFT;
3927                if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
3928                        range_whole = 1;
3929                scanned = 1;
3930        }
3931        if (wbc->sync_mode == WB_SYNC_ALL)
3932                tag = PAGECACHE_TAG_TOWRITE;
3933        else
3934                tag = PAGECACHE_TAG_DIRTY;
3935retry:
3936        if (wbc->sync_mode == WB_SYNC_ALL)
3937                tag_pages_for_writeback(mapping, index, end);
3938        done_index = index;
3939        while (!done && !nr_to_write_done && (index <= end) &&
3940                        (nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
3941                                                &index, end, tag))) {
3942                unsigned i;
3943
3944                scanned = 1;
3945                for (i = 0; i < nr_pages; i++) {
3946                        struct page *page = pvec.pages[i];
3947
3948                        done_index = page->index;
3949                        /*
3950                         * At this point we hold neither the i_pages lock nor
3951                         * the page lock: the page may be truncated or
3952                         * invalidated (changing page->mapping to NULL),
3953                         * or even swizzled back from swapper_space to
3954                         * tmpfs file mapping
3955                         */
3956                        if (!trylock_page(page)) {
3957                                flush_write_bio(epd);
3958                                lock_page(page);
3959                        }
3960
3961                        if (unlikely(page->mapping != mapping)) {
3962                                unlock_page(page);
3963                                continue;
3964                        }
3965
3966                        if (wbc->sync_mode != WB_SYNC_NONE) {
3967                                if (PageWriteback(page))
3968                                        flush_write_bio(epd);
3969                                wait_on_page_writeback(page);
3970                        }
3971
3972                        if (PageWriteback(page) ||
3973                            !clear_page_dirty_for_io(page)) {
3974                                unlock_page(page);
3975                                continue;
3976                        }
3977
3978                        ret = __extent_writepage(page, wbc, epd);
3979
3980                        if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
3981                                unlock_page(page);
3982                                ret = 0;
3983                        }
3984                        if (ret < 0) {
3985                                /*
3986                                 * done_index is set past this page,
3987                                 * so media errors will not choke
3988                                 * background writeout for the entire
3989                                 * file. This has consequences for
3990                                 * range_cyclic semantics (ie. it may
3991                                 * not be suitable for data integrity
3992                                 * writeout).
3993                                 */
3994                                done_index = page->index + 1;
3995                                done = 1;
3996                                break;
3997                        }
3998
3999                        /*
4000                         * the filesystem may choose to bump up nr_to_write.
4001                         * We have to make sure to honor the new nr_to_write
4002                         * at any time
4003                         */
4004                        nr_to_write_done = wbc->nr_to_write <= 0;
4005                }
4006                pagevec_release(&pvec);
4007                cond_resched();
4008        }
4009        if (!scanned && !done) {
4010                /*
4011                 * We hit the last page and there is more work to be done: wrap
4012                 * back to the start of the file
4013                 */
4014                scanned = 1;
4015                index = 0;
4016                goto retry;
4017        }
4018
4019        if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
4020                mapping->writeback_index = done_index;
4021
4022        btrfs_add_delayed_iput(inode);
4023        return ret;
4024}
4025
4026static void flush_write_bio(struct extent_page_data *epd)
4027{
4028        if (epd->bio) {
4029                int ret;
4030
4031                ret = submit_one_bio(epd->bio, 0, 0);
4032                BUG_ON(ret < 0); /* -ENOMEM */
4033                epd->bio = NULL;
4034        }
4035}
4036
4037int extent_write_full_page(struct page *page, struct writeback_control *wbc)
4038{
4039        int ret;
4040        struct extent_page_data epd = {
4041                .bio = NULL,
4042                .tree = &BTRFS_I(page->mapping->host)->io_tree,
4043                .extent_locked = 0,
4044                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4045        };
4046
4047        ret = __extent_writepage(page, wbc, &epd);
4048
4049        flush_write_bio(&epd);
4050        return ret;
4051}
4052
4053int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
4054                              int mode)
4055{
4056        int ret = 0;
4057        struct address_space *mapping = inode->i_mapping;
4058        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
4059        struct page *page;
4060        unsigned long nr_pages = (end - start + PAGE_SIZE) >>
4061                PAGE_SHIFT;
4062
4063        struct extent_page_data epd = {
4064                .bio = NULL,
4065                .tree = tree,
4066                .extent_locked = 1,
4067                .sync_io = mode == WB_SYNC_ALL,
4068        };
4069        struct writeback_control wbc_writepages = {
4070                .sync_mode      = mode,
4071                .nr_to_write    = nr_pages * 2,
4072                .range_start    = start,
4073                .range_end      = end + 1,
4074        };
4075
4076        while (start <= end) {
4077                page = find_get_page(mapping, start >> PAGE_SHIFT);
4078                if (clear_page_dirty_for_io(page))
4079                        ret = __extent_writepage(page, &wbc_writepages, &epd);
4080                else {
4081                        if (tree->ops && tree->ops->writepage_end_io_hook)
4082                                tree->ops->writepage_end_io_hook(page, start,
4083                                                 start + PAGE_SIZE - 1,
4084                                                 NULL, 1);
4085                        unlock_page(page);
4086                }
4087                put_page(page);
4088                start += PAGE_SIZE;
4089        }
4090
4091        flush_write_bio(&epd);
4092        return ret;
4093}
4094
4095int extent_writepages(struct address_space *mapping,
4096                      struct writeback_control *wbc)
4097{
4098        int ret = 0;
4099        struct extent_page_data epd = {
4100                .bio = NULL,
4101                .tree = &BTRFS_I(mapping->host)->io_tree,
4102                .extent_locked = 0,
4103                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4104        };
4105
4106        ret = extent_write_cache_pages(mapping, wbc, &epd);
4107        flush_write_bio(&epd);
4108        return ret;
4109}
4110
4111int extent_readpages(struct address_space *mapping, struct list_head *pages,
4112                     unsigned nr_pages)
4113{
4114        struct bio *bio = NULL;
4115        unsigned page_idx;
4116        unsigned long bio_flags = 0;
4117        struct page *pagepool[16];
4118        struct page *page;
4119        struct extent_map *em_cached = NULL;
4120        struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
4121        int nr = 0;
4122        u64 prev_em_start = (u64)-1;
4123
4124        for (page_idx = 0; page_idx < nr_pages; page_idx++) {
4125                page = list_entry(pages->prev, struct page, lru);
4126
4127                prefetchw(&page->flags);
4128                list_del(&page->lru);
4129                if (add_to_page_cache_lru(page, mapping,
4130                                        page->index,
4131                                        readahead_gfp_mask(mapping))) {
4132                        put_page(page);
4133                        continue;
4134                }
4135
4136                pagepool[nr++] = page;
4137                if (nr < ARRAY_SIZE(pagepool))
4138                        continue;
4139                __extent_readpages(tree, pagepool, nr, &em_cached, &bio,
4140                                &bio_flags, &prev_em_start);
4141                nr = 0;
4142        }
4143        if (nr)
4144                __extent_readpages(tree, pagepool, nr, &em_cached, &bio,
4145                                &bio_flags, &prev_em_start);
4146
4147        if (em_cached)
4148                free_extent_map(em_cached);
4149
4150        BUG_ON(!list_empty(pages));
4151        if (bio)
4152                return submit_one_bio(bio, 0, bio_flags);
4153        return 0;
4154}
4155
4156/*
4157 * basic invalidatepage code, this waits on any locked or writeback
4158 * ranges corresponding to the page, and then deletes any extent state
4159 * records from the tree
4160 */
4161int extent_invalidatepage(struct extent_io_tree *tree,
4162                          struct page *page, unsigned long offset)
4163{
4164        struct extent_state *cached_state = NULL;
4165        u64 start = page_offset(page);
4166        u64 end = start + PAGE_SIZE - 1;
4167        size_t blocksize = page->mapping->host->i_sb->s_blocksize;
4168
4169        start += ALIGN(offset, blocksize);
4170        if (start > end)
4171                return 0;
4172
4173        lock_extent_bits(tree, start, end, &cached_state);
4174        wait_on_page_writeback(page);
4175        clear_extent_bit(tree, start, end,
4176                         EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4177                         EXTENT_DO_ACCOUNTING,
4178                         1, 1, &cached_state);
4179        return 0;
4180}
4181
4182/*
4183 * a helper for releasepage, this tests for areas of the page that
4184 * are locked or under IO and drops the related state bits if it is safe
4185 * to drop the page.
4186 */
4187static int try_release_extent_state(struct extent_io_tree *tree,
4188                                    struct page *page, gfp_t mask)
4189{
4190        u64 start = page_offset(page);
4191        u64 end = start + PAGE_SIZE - 1;
4192        int ret = 1;
4193
4194        if (test_range_bit(tree, start, end,
4195                           EXTENT_IOBITS, 0, NULL))
4196                ret = 0;
4197        else {
4198                /*
4199                 * at this point we can safely clear everything except the
4200                 * locked bit and the nodatasum bit
4201                 */
4202                ret = __clear_extent_bit(tree, start, end,
4203                                 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
4204                                 0, 0, NULL, mask, NULL);
4205
4206                /* if clear_extent_bit failed for enomem reasons,
4207                 * we can't allow the release to continue.
4208                 */
4209                if (ret < 0)
4210                        ret = 0;
4211                else
4212                        ret = 1;
4213        }
4214        return ret;
4215}
4216
4217/*
4218 * a helper for releasepage.  As long as there are no locked extents
4219 * in the range corresponding to the page, both state records and extent
4220 * map records are removed
4221 */
4222int try_release_extent_mapping(struct page *page, gfp_t mask)
4223{
4224        struct extent_map *em;
4225        u64 start = page_offset(page);
4226        u64 end = start + PAGE_SIZE - 1;
4227        struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
4228        struct extent_io_tree *tree = &btrfs_inode->io_tree;
4229        struct extent_map_tree *map = &btrfs_inode->extent_tree;
4230
4231        if (gfpflags_allow_blocking(mask) &&
4232            page->mapping->host->i_size > SZ_16M) {
4233                u64 len;
4234                while (start <= end) {
4235                        len = end - start + 1;
4236                        write_lock(&map->lock);
4237                        em = lookup_extent_mapping(map, start, len);
4238                        if (!em) {
4239                                write_unlock(&map->lock);
4240                                break;
4241                        }
4242                        if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
4243                            em->start != start) {
4244                                write_unlock(&map->lock);
4245                                free_extent_map(em);
4246                                break;
4247                        }
4248                        if (!test_range_bit(tree, em->start,
4249                                            extent_map_end(em) - 1,
4250                                            EXTENT_LOCKED | EXTENT_WRITEBACK,
4251                                            0, NULL)) {
4252                                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4253                                        &btrfs_inode->runtime_flags);
4254                                remove_extent_mapping(map, em);
4255                                /* once for the rb tree */
4256                                free_extent_map(em);
4257                        }
4258                        start = extent_map_end(em);
4259                        write_unlock(&map->lock);
4260
4261                        /* once for us */
4262                        free_extent_map(em);
4263                }
4264        }
4265        return try_release_extent_state(tree, page, mask);
4266}
4267
4268/*
4269 * helper function for fiemap, which doesn't want to see any holes.
4270 * This maps until we find something past 'last'
4271 */
4272static struct extent_map *get_extent_skip_holes(struct inode *inode,
4273                                                u64 offset, u64 last)
4274{
4275        u64 sectorsize = btrfs_inode_sectorsize(inode);
4276        struct extent_map *em;
4277        u64 len;
4278
4279        if (offset >= last)
4280                return NULL;
4281
4282        while (1) {
4283                len = last - offset;
4284                if (len == 0)
4285                        break;
4286                len = ALIGN(len, sectorsize);
4287                em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset,
4288                                len, 0);
4289                if (IS_ERR_OR_NULL(em))
4290                        return em;
4291
4292                /* if this isn't a hole return it */
4293                if (em->block_start != EXTENT_MAP_HOLE)
4294                        return em;
4295
4296                /* this is a hole, advance to the next extent */
4297                offset = extent_map_end(em);
4298                free_extent_map(em);
4299                if (offset >= last)
4300                        break;
4301        }
4302        return NULL;
4303}
4304
4305/*
4306 * To cache previous fiemap extent
4307 *
4308 * Will be used for merging fiemap extent
4309 */
4310struct fiemap_cache {
4311        u64 offset;
4312        u64 phys;
4313        u64 len;
4314        u32 flags;
4315        bool cached;
4316};
4317
4318/*
4319 * Helper to submit fiemap extent.
4320 *
4321 * Will try to merge current fiemap extent specified by @offset, @phys,
4322 * @len and @flags with cached one.
4323 * And only when we fails to merge, cached one will be submitted as
4324 * fiemap extent.
4325 *
4326 * Return value is the same as fiemap_fill_next_extent().
4327 */
4328static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
4329                                struct fiemap_cache *cache,
4330                                u64 offset, u64 phys, u64 len, u32 flags)
4331{
4332        int ret = 0;
4333
4334        if (!cache->cached)
4335                goto assign;
4336
4337        /*
4338         * Sanity check, extent_fiemap() should have ensured that new
4339         * fiemap extent won't overlap with cahced one.
4340         * Not recoverable.
4341         *
4342         * NOTE: Physical address can overlap, due to compression
4343         */
4344        if (cache->offset + cache->len > offset) {
4345                WARN_ON(1);
4346                return -EINVAL;
4347        }
4348
4349        /*
4350         * Only merges fiemap extents if
4351         * 1) Their logical addresses are continuous
4352         *
4353         * 2) Their physical addresses are continuous
4354         *    So truly compressed (physical size smaller than logical size)
4355         *    extents won't get merged with each other
4356         *
4357         * 3) Share same flags except FIEMAP_EXTENT_LAST
4358         *    So regular extent won't get merged with prealloc extent
4359         */
4360        if (cache->offset + cache->len  == offset &&
4361            cache->phys + cache->len == phys  &&
4362            (cache->flags & ~FIEMAP_EXTENT_LAST) ==
4363                        (flags & ~FIEMAP_EXTENT_LAST)) {
4364                cache->len += len;
4365                cache->flags |= flags;
4366                goto try_submit_last;
4367        }
4368
4369        /* Not mergeable, need to submit cached one */
4370        ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
4371                                      cache->len, cache->flags);
4372        cache->cached = false;
4373        if (ret)
4374                return ret;
4375assign:
4376        cache->cached = true;
4377        cache->offset = offset;
4378        cache->phys = phys;
4379        cache->len = len;
4380        cache->flags = flags;
4381try_submit_last:
4382        if (cache->flags & FIEMAP_EXTENT_LAST) {
4383                ret = fiemap_fill_next_extent(fieinfo, cache->offset,
4384                                cache->phys, cache->len, cache->flags);
4385                cache->cached = false;
4386        }
4387        return ret;
4388}
4389
4390/*
4391 * Emit last fiemap cache
4392 *
4393 * The last fiemap cache may still be cached in the following case:
4394 * 0                  4k                    8k
4395 * |<- Fiemap range ->|
4396 * |<------------  First extent ----------->|
4397 *
4398 * In this case, the first extent range will be cached but not emitted.
4399 * So we must emit it before ending extent_fiemap().
4400 */
4401static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info,
4402                                  struct fiemap_extent_info *fieinfo,
4403                                  struct fiemap_cache *cache)
4404{
4405        int ret;
4406
4407        if (!cache->cached)
4408                return 0;
4409
4410        ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
4411                                      cache->len, cache->flags);
4412        cache->cached = false;
4413        if (ret > 0)
4414                ret = 0;
4415        return ret;
4416}
4417
4418int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4419                __u64 start, __u64 len)
4420{
4421        int ret = 0;
4422        u64 off = start;
4423        u64 max = start + len;
4424        u32 flags = 0;
4425        u32 found_type;
4426        u64 last;
4427        u64 last_for_get_extent = 0;
4428        u64 disko = 0;
4429        u64 isize = i_size_read(inode);
4430        struct btrfs_key found_key;
4431        struct extent_map *em = NULL;
4432        struct extent_state *cached_state = NULL;
4433        struct btrfs_path *path;
4434        struct btrfs_root *root = BTRFS_I(inode)->root;
4435        struct fiemap_cache cache = { 0 };
4436        int end = 0;
4437        u64 em_start = 0;
4438        u64 em_len = 0;
4439        u64 em_end = 0;
4440
4441        if (len == 0)
4442                return -EINVAL;
4443
4444        path = btrfs_alloc_path();
4445        if (!path)
4446                return -ENOMEM;
4447        path->leave_spinning = 1;
4448
4449        start = round_down(start, btrfs_inode_sectorsize(inode));
4450        len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
4451
4452        /*
4453         * lookup the last file extent.  We're not using i_size here
4454         * because there might be preallocation past i_size
4455         */
4456        ret = btrfs_lookup_file_extent(NULL, root, path,
4457                        btrfs_ino(BTRFS_I(inode)), -1, 0);
4458        if (ret < 0) {
4459                btrfs_free_path(path);
4460                return ret;
4461        } else {
4462                WARN_ON(!ret);
4463                if (ret == 1)
4464                        ret = 0;
4465        }
4466
4467        path->slots[0]--;
4468        btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
4469        found_type = found_key.type;
4470
4471        /* No extents, but there might be delalloc bits */
4472        if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) ||
4473            found_type != BTRFS_EXTENT_DATA_KEY) {
4474                /* have to trust i_size as the end */
4475                last = (u64)-1;
4476                last_for_get_extent = isize;
4477        } else {
4478                /*
4479                 * remember the start of the last extent.  There are a
4480                 * bunch of different factors that go into the length of the
4481                 * extent, so its much less complex to remember where it started
4482                 */
4483                last = found_key.offset;
4484                last_for_get_extent = last + 1;
4485        }
4486        btrfs_release_path(path);
4487
4488        /*
4489         * we might have some extents allocated but more delalloc past those
4490         * extents.  so, we trust isize unless the start of the last extent is
4491         * beyond isize
4492         */
4493        if (last < isize) {
4494                last = (u64)-1;
4495                last_for_get_extent = isize;
4496        }
4497
4498        lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
4499                         &cached_state);
4500
4501        em = get_extent_skip_holes(inode, start, last_for_get_extent);
4502        if (!em)
4503                goto out;
4504        if (IS_ERR(em)) {
4505                ret = PTR_ERR(em);
4506                goto out;
4507        }
4508
4509        while (!end) {
4510                u64 offset_in_extent = 0;
4511
4512                /* break if the extent we found is outside the range */
4513                if (em->start >= max || extent_map_end(em) < off)
4514                        break;
4515
4516                /*
4517                 * get_extent may return an extent that starts before our
4518                 * requested range.  We have to make sure the ranges
4519                 * we return to fiemap always move forward and don't
4520                 * overlap, so adjust the offsets here
4521                 */
4522                em_start = max(em->start, off);
4523
4524                /*
4525                 * record the offset from the start of the extent
4526                 * for adjusting the disk offset below.  Only do this if the
4527                 * extent isn't compressed since our in ram offset may be past
4528                 * what we have actually allocated on disk.
4529                 */
4530                if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4531                        offset_in_extent = em_start - em->start;
4532                em_end = extent_map_end(em);
4533                em_len = em_end - em_start;
4534                flags = 0;
4535                if (em->block_start < EXTENT_MAP_LAST_BYTE)
4536                        disko = em->block_start + offset_in_extent;
4537                else
4538                        disko = 0;
4539
4540                /*
4541                 * bump off for our next call to get_extent
4542                 */
4543                off = extent_map_end(em);
4544                if (off >= max)
4545                        end = 1;
4546
4547                if (em->block_start == EXTENT_MAP_LAST_BYTE) {
4548                        end = 1;
4549                        flags |= FIEMAP_EXTENT_LAST;
4550                } else if (em->block_start == EXTENT_MAP_INLINE) {
4551                        flags |= (FIEMAP_EXTENT_DATA_INLINE |
4552                                  FIEMAP_EXTENT_NOT_ALIGNED);
4553                } else if (em->block_start == EXTENT_MAP_DELALLOC) {
4554                        flags |= (FIEMAP_EXTENT_DELALLOC |
4555                                  FIEMAP_EXTENT_UNKNOWN);
4556                } else if (fieinfo->fi_extents_max) {
4557                        u64 bytenr = em->block_start -
4558                                (em->start - em->orig_start);
4559
4560                        /*
4561                         * As btrfs supports shared space, this information
4562                         * can be exported to userspace tools via
4563                         * flag FIEMAP_EXTENT_SHARED.  If fi_extents_max == 0
4564                         * then we're just getting a count and we can skip the
4565                         * lookup stuff.
4566                         */
4567                        ret = btrfs_check_shared(root,
4568                                                 btrfs_ino(BTRFS_I(inode)),
4569                                                 bytenr);
4570                        if (ret < 0)
4571                                goto out_free;
4572                        if (ret)
4573                                flags |= FIEMAP_EXTENT_SHARED;
4574                        ret = 0;
4575                }
4576                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4577                        flags |= FIEMAP_EXTENT_ENCODED;
4578                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4579                        flags |= FIEMAP_EXTENT_UNWRITTEN;
4580
4581                free_extent_map(em);
4582                em = NULL;
4583                if ((em_start >= last) || em_len == (u64)-1 ||
4584                   (last == (u64)-1 && isize <= em_end)) {
4585                        flags |= FIEMAP_EXTENT_LAST;
4586                        end = 1;
4587                }
4588
4589                /* now scan forward to see if this is really the last extent. */
4590                em = get_extent_skip_holes(inode, off, last_for_get_extent);
4591                if (IS_ERR(em)) {
4592                        ret = PTR_ERR(em);
4593                        goto out;
4594                }
4595                if (!em) {
4596                        flags |= FIEMAP_EXTENT_LAST;
4597                        end = 1;
4598                }
4599                ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
4600                                           em_len, flags);
4601                if (ret) {
4602                        if (ret == 1)
4603                                ret = 0;
4604                        goto out_free;
4605                }
4606        }
4607out_free:
4608        if (!ret)
4609                ret = emit_last_fiemap_cache(root->fs_info, fieinfo, &cache);
4610        free_extent_map(em);
4611out:
4612        btrfs_free_path(path);
4613        unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
4614                             &cached_state);
4615        return ret;
4616}
4617
4618static void __free_extent_buffer(struct extent_buffer *eb)
4619{
4620        btrfs_leak_debug_del(&eb->leak_list);
4621        kmem_cache_free(extent_buffer_cache, eb);
4622}
4623
4624int extent_buffer_under_io(struct extent_buffer *eb)
4625{
4626        return (atomic_read(&eb->io_pages) ||
4627                test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
4628                test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4629}
4630
4631/*
4632 * Release all pages attached to the extent buffer.
4633 */
4634static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
4635{
4636        int i;
4637        int num_pages;
4638        int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
4639
4640        BUG_ON(extent_buffer_under_io(eb));
4641
4642        num_pages = num_extent_pages(eb);
4643        for (i = 0; i < num_pages; i++) {
4644                struct page *page = eb->pages[i];
4645
4646                if (!page)
4647                        continue;
4648                if (mapped)
4649                        spin_lock(&page->mapping->private_lock);
4650                /*
4651                 * We do this since we'll remove the pages after we've
4652                 * removed the eb from the radix tree, so we could race
4653                 * and have this page now attached to the new eb.  So
4654                 * only clear page_private if it's still connected to
4655                 * this eb.
4656                 */
4657                if (PagePrivate(page) &&
4658                    page->private == (unsigned long)eb) {
4659                        BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4660                        BUG_ON(PageDirty(page));
4661                        BUG_ON(PageWriteback(page));
4662                        /*
4663                         * We need to make sure we haven't be attached
4664                         * to a new eb.
4665                         */
4666                        ClearPagePrivate(page);
4667                        set_page_private(page, 0);
4668                        /* One for the page private */
4669                        put_page(page);
4670                }
4671
4672                if (mapped)
4673                        spin_unlock(&page->mapping->private_lock);
4674
4675                /* One for when we allocated the page */
4676                put_page(page);
4677        }
4678}
4679
4680/*
4681 * Helper for releasing the extent buffer.
4682 */
4683static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4684{
4685        btrfs_release_extent_buffer_pages(eb);
4686        __free_extent_buffer(eb);
4687}
4688
4689static struct extent_buffer *
4690__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
4691                      unsigned long len)
4692{
4693        struct extent_buffer *eb = NULL;
4694
4695        eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
4696        eb->start = start;
4697        eb->len = len;
4698        eb->fs_info = fs_info;
4699        eb->bflags = 0;
4700        rwlock_init(&eb->lock);
4701        atomic_set(&eb->write_locks, 0);
4702        atomic_set(&eb->read_locks, 0);
4703        atomic_set(&eb->blocking_readers, 0);
4704        atomic_set(&eb->blocking_writers, 0);
4705        atomic_set(&eb->spinning_readers, 0);
4706        atomic_set(&eb->spinning_writers, 0);
4707        eb->lock_nested = 0;
4708        init_waitqueue_head(&eb->write_lock_wq);
4709        init_waitqueue_head(&eb->read_lock_wq);
4710
4711        btrfs_leak_debug_add(&eb->leak_list, &buffers);
4712
4713        spin_lock_init(&eb->refs_lock);
4714        atomic_set(&eb->refs, 1);
4715        atomic_set(&eb->io_pages, 0);
4716
4717        /*
4718         * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
4719         */
4720        BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
4721                > MAX_INLINE_EXTENT_BUFFER_SIZE);
4722        BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
4723
4724        return eb;
4725}
4726
4727struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
4728{
4729        int i;
4730        struct page *p;
4731        struct extent_buffer *new;
4732        int num_pages = num_extent_pages(src);
4733
4734        new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
4735        if (new == NULL)
4736                return NULL;
4737
4738        for (i = 0; i < num_pages; i++) {
4739                p = alloc_page(GFP_NOFS);
4740                if (!p) {
4741                        btrfs_release_extent_buffer(new);
4742                        return NULL;
4743                }
4744                attach_extent_buffer_page(new, p);
4745                WARN_ON(PageDirty(p));
4746                SetPageUptodate(p);
4747                new->pages[i] = p;
4748                copy_page(page_address(p), page_address(src->pages[i]));
4749        }
4750
4751        set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
4752        set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
4753
4754        return new;
4755}
4756
4757struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
4758                                                  u64 start, unsigned long len)
4759{
4760        struct extent_buffer *eb;
4761        int num_pages;
4762        int i;
4763
4764        eb = __alloc_extent_buffer(fs_info, start, len);
4765        if (!eb)
4766                return NULL;
4767
4768        num_pages = num_extent_pages(eb);
4769        for (i = 0; i < num_pages; i++) {
4770                eb->pages[i] = alloc_page(GFP_NOFS);
4771                if (!eb->pages[i])
4772                        goto err;
4773        }
4774        set_extent_buffer_uptodate(eb);
4775        btrfs_set_header_nritems(eb, 0);
4776        set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
4777
4778        return eb;
4779err:
4780        for (; i > 0; i--)
4781                __free_page(eb->pages[i - 1]);
4782        __free_extent_buffer(eb);
4783        return NULL;
4784}
4785
4786struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
4787                                                u64 start)
4788{
4789        return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
4790}
4791
4792static void check_buffer_tree_ref(struct extent_buffer *eb)
4793{
4794        int refs;
4795        /* the ref bit is tricky.  We have to make sure it is set
4796         * if we have the buffer dirty.   Otherwise the
4797         * code to free a buffer can end up dropping a dirty
4798         * page
4799         *
4800         * Once the ref bit is set, it won't go away while the
4801         * buffer is dirty or in writeback, and it also won't
4802         * go away while we have the reference count on the
4803         * eb bumped.
4804         *
4805         * We can't just set the ref bit without bumping the
4806         * ref on the eb because free_extent_buffer might
4807         * see the ref bit and try to clear it.  If this happens
4808         * free_extent_buffer might end up dropping our original
4809         * ref by mistake and freeing the page before we are able
4810         * to add one more ref.
4811         *
4812         * So bump the ref count first, then set the bit.  If someone
4813         * beat us to it, drop the ref we added.
4814         */
4815        refs = atomic_read(&eb->refs);
4816        if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4817                return;
4818
4819        spin_lock(&eb->refs_lock);
4820        if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4821                atomic_inc(&eb->refs);
4822        spin_unlock(&eb->refs_lock);
4823}
4824
4825static void mark_extent_buffer_accessed(struct extent_buffer *eb,
4826                struct page *accessed)
4827{
4828        int num_pages, i;
4829
4830        check_buffer_tree_ref(eb);
4831
4832        num_pages = num_extent_pages(eb);
4833        for (i = 0; i < num_pages; i++) {
4834                struct page *p = eb->pages[i];
4835
4836                if (p != accessed)
4837                        mark_page_accessed(p);
4838        }
4839}
4840
4841struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
4842                                         u64 start)
4843{
4844        struct extent_buffer *eb;
4845
4846        rcu_read_lock();
4847        eb = radix_tree_lookup(&fs_info->buffer_radix,
4848                               start >> PAGE_SHIFT);
4849        if (eb && atomic_inc_not_zero(&eb->refs)) {
4850                rcu_read_unlock();
4851                /*
4852                 * Lock our eb's refs_lock to avoid races with
4853                 * free_extent_buffer. When we get our eb it might be flagged
4854                 * with EXTENT_BUFFER_STALE and another task running
4855                 * free_extent_buffer might have seen that flag set,
4856                 * eb->refs == 2, that the buffer isn't under IO (dirty and
4857                 * writeback flags not set) and it's still in the tree (flag
4858                 * EXTENT_BUFFER_TREE_REF set), therefore being in the process
4859                 * of decrementing the extent buffer's reference count twice.
4860                 * So here we could race and increment the eb's reference count,
4861                 * clear its stale flag, mark it as dirty and drop our reference
4862                 * before the other task finishes executing free_extent_buffer,
4863                 * which would later result in an attempt to free an extent
4864                 * buffer that is dirty.
4865                 */
4866                if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
4867                        spin_lock(&eb->refs_lock);
4868                        spin_unlock(&eb->refs_lock);
4869                }
4870                mark_extent_buffer_accessed(eb, NULL);
4871                return eb;
4872        }
4873        rcu_read_unlock();
4874
4875        return NULL;
4876}
4877
4878#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4879struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
4880                                        u64 start)
4881{
4882        struct extent_buffer *eb, *exists = NULL;
4883        int ret;
4884
4885        eb = find_extent_buffer(fs_info, start);
4886        if (eb)
4887                return eb;
4888        eb = alloc_dummy_extent_buffer(fs_info, start);
4889        if (!eb)
4890                return NULL;
4891        eb->fs_info = fs_info;
4892again:
4893        ret = radix_tree_preload(GFP_NOFS);
4894        if (ret)
4895                goto free_eb;
4896        spin_lock(&fs_info->buffer_lock);
4897        ret = radix_tree_insert(&fs_info->buffer_radix,
4898                                start >> PAGE_SHIFT, eb);
4899        spin_unlock(&fs_info->buffer_lock);
4900        radix_tree_preload_end();
4901        if (ret == -EEXIST) {
4902                exists = find_extent_buffer(fs_info, start);
4903                if (exists)
4904                        goto free_eb;
4905                else
4906                        goto again;
4907        }
4908        check_buffer_tree_ref(eb);
4909        set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
4910
4911        /*
4912         * We will free dummy extent buffer's if they come into
4913         * free_extent_buffer with a ref count of 2, but if we are using this we
4914         * want the buffers to stay in memory until we're done with them, so
4915         * bump the ref count again.
4916         */
4917        atomic_inc(&eb->refs);
4918        return eb;
4919free_eb:
4920        btrfs_release_extent_buffer(eb);
4921        return exists;
4922}
4923#endif
4924
4925struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
4926                                          u64 start)
4927{
4928        unsigned long len = fs_info->nodesize;
4929        int num_pages;
4930        int i;
4931        unsigned long index = start >> PAGE_SHIFT;
4932        struct extent_buffer *eb;
4933        struct extent_buffer *exists = NULL;
4934        struct page *p;
4935        struct address_space *mapping = fs_info->btree_inode->i_mapping;
4936        int uptodate = 1;
4937        int ret;
4938
4939        if (!IS_ALIGNED(start, fs_info->sectorsize)) {
4940                btrfs_err(fs_info, "bad tree block start %llu", start);
4941                return ERR_PTR(-EINVAL);
4942        }
4943
4944        eb = find_extent_buffer(fs_info, start);
4945        if (eb)
4946                return eb;
4947
4948        eb = __alloc_extent_buffer(fs_info, start, len);
4949        if (!eb)
4950                return ERR_PTR(-ENOMEM);
4951
4952        num_pages = num_extent_pages(eb);
4953        for (i = 0; i < num_pages; i++, index++) {
4954                p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
4955                if (!p) {
4956                        exists = ERR_PTR(-ENOMEM);
4957                        goto free_eb;
4958                }
4959
4960                spin_lock(&mapping->private_lock);
4961                if (PagePrivate(p)) {
4962                        /*
4963                         * We could have already allocated an eb for this page
4964                         * and attached one so lets see if we can get a ref on
4965                         * the existing eb, and if we can we know it's good and
4966                         * we can just return that one, else we know we can just
4967                         * overwrite page->private.
4968                         */
4969                        exists = (struct extent_buffer *)p->private;
4970                        if (atomic_inc_not_zero(&exists->refs)) {
4971                                spin_unlock(&mapping->private_lock);
4972                                unlock_page(p);
4973                                put_page(p);
4974                                mark_extent_buffer_accessed(exists, p);
4975                                goto free_eb;
4976                        }
4977                        exists = NULL;
4978
4979                        /*
4980                         * Do this so attach doesn't complain and we need to
4981                         * drop the ref the old guy had.
4982                         */
4983                        ClearPagePrivate(p);
4984                        WARN_ON(PageDirty(p));
4985                        put_page(p);
4986                }
4987                attach_extent_buffer_page(eb, p);
4988                spin_unlock(&mapping->private_lock);
4989                WARN_ON(PageDirty(p));
4990                eb->pages[i] = p;
4991                if (!PageUptodate(p))
4992                        uptodate = 0;
4993
4994                /*
4995                 * We can't unlock the pages just yet since the extent buffer
4996                 * hasn't been properly inserted in the radix tree, this
4997                 * opens a race with btree_releasepage which can free a page
4998                 * while we are still filling in all pages for the buffer and
4999                 * we could crash.
5000                 */
5001        }
5002        if (uptodate)
5003                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5004again:
5005        ret = radix_tree_preload(GFP_NOFS);
5006        if (ret) {
5007                exists = ERR_PTR(ret);
5008                goto free_eb;
5009        }
5010
5011        spin_lock(&fs_info->buffer_lock);
5012        ret = radix_tree_insert(&fs_info->buffer_radix,
5013                                start >> PAGE_SHIFT, eb);
5014        spin_unlock(&fs_info->buffer_lock);
5015        radix_tree_preload_end();
5016        if (ret == -EEXIST) {
5017                exists = find_extent_buffer(fs_info, start);
5018                if (exists)
5019                        goto free_eb;
5020                else
5021                        goto again;
5022        }
5023        /* add one reference for the tree */
5024        check_buffer_tree_ref(eb);
5025        set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
5026
5027        /*
5028         * Now it's safe to unlock the pages because any calls to
5029         * btree_releasepage will correctly detect that a page belongs to a
5030         * live buffer and won't free them prematurely.
5031         */
5032        for (i = 0; i < num_pages; i++)
5033                unlock_page(eb->pages[i]);
5034        return eb;
5035
5036free_eb:
5037        WARN_ON(!atomic_dec_and_test(&eb->refs));
5038        for (i = 0; i < num_pages; i++) {
5039                if (eb->pages[i])
5040                        unlock_page(eb->pages[i]);
5041        }
5042
5043        btrfs_release_extent_buffer(eb);
5044        return exists;
5045}
5046
5047static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
5048{
5049        struct extent_buffer *eb =
5050                        container_of(head, struct extent_buffer, rcu_head);
5051
5052        __free_extent_buffer(eb);
5053}
5054
5055static int release_extent_buffer(struct extent_buffer *eb)
5056{
5057        lockdep_assert_held(&eb->refs_lock);
5058
5059        WARN_ON(atomic_read(&eb->refs) == 0);
5060        if (atomic_dec_and_test(&eb->refs)) {
5061                if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
5062                        struct btrfs_fs_info *fs_info = eb->fs_info;
5063
5064                        spin_unlock(&eb->refs_lock);
5065
5066                        spin_lock(&fs_info->buffer_lock);
5067                        radix_tree_delete(&fs_info->buffer_radix,
5068                                          eb->start >> PAGE_SHIFT);
5069                        spin_unlock(&fs_info->buffer_lock);
5070                } else {
5071                        spin_unlock(&eb->refs_lock);
5072                }
5073
5074                /* Should be safe to release our pages at this point */
5075                btrfs_release_extent_buffer_pages(eb);
5076#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5077                if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
5078                        __free_extent_buffer(eb);
5079                        return 1;
5080                }
5081#endif
5082                call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
5083                return 1;
5084        }
5085        spin_unlock(&eb->refs_lock);
5086
5087        return 0;
5088}
5089
5090void free_extent_buffer(struct extent_buffer *eb)
5091{
5092        int refs;
5093        int old;
5094        if (!eb)
5095                return;
5096
5097        while (1) {
5098                refs = atomic_read(&eb->refs);
5099                if (refs <= 3)
5100                        break;
5101                old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
5102                if (old == refs)
5103                        return;
5104        }
5105
5106        spin_lock(&eb->refs_lock);
5107        if (atomic_read(&eb->refs) == 2 &&
5108            test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))
5109                atomic_dec(&eb->refs);
5110
5111        if (atomic_read(&eb->refs) == 2 &&
5112            test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
5113            !extent_buffer_under_io(eb) &&
5114            test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5115                atomic_dec(&eb->refs);
5116
5117        /*
5118         * I know this is terrible, but it's temporary until we stop tracking
5119         * the uptodate bits and such for the extent buffers.
5120         */
5121        release_extent_buffer(eb);
5122}
5123
5124void free_extent_buffer_stale(struct extent_buffer *eb)
5125{
5126        if (!eb)
5127                return;
5128
5129        spin_lock(&eb->refs_lock);
5130        set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
5131
5132        if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
5133            test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5134                atomic_dec(&eb->refs);
5135        release_extent_buffer(eb);
5136}
5137
5138void clear_extent_buffer_dirty(struct extent_buffer *eb)
5139{
5140        int i;
5141        int num_pages;
5142        struct page *page;
5143
5144        num_pages = num_extent_pages(eb);
5145
5146        for (i = 0; i < num_pages; i++) {
5147                page = eb->pages[i];
5148                if (!PageDirty(page))
5149                        continue;
5150
5151                lock_page(page);
5152                WARN_ON(!PagePrivate(page));
5153
5154                clear_page_dirty_for_io(page);
5155                xa_lock_irq(&page->mapping->i_pages);
5156                if (!PageDirty(page)) {
5157                        radix_tree_tag_clear(&page->mapping->i_pages,
5158                                                page_index(page),
5159                                                PAGECACHE_TAG_DIRTY);
5160                }
5161                xa_unlock_irq(&page->mapping->i_pages);
5162                ClearPageError(page);
5163                unlock_page(page);
5164        }
5165        WARN_ON(atomic_read(&eb->refs) == 0);
5166}
5167
5168int set_extent_buffer_dirty(struct extent_buffer *eb)
5169{
5170        int i;
5171        int num_pages;
5172        int was_dirty = 0;
5173
5174        check_buffer_tree_ref(eb);
5175
5176        was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
5177
5178        num_pages = num_extent_pages(eb);
5179        WARN_ON(atomic_read(&eb->refs) == 0);
5180        WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
5181
5182        for (i = 0; i < num_pages; i++)
5183                set_page_dirty(eb->pages[i]);
5184        return was_dirty;
5185}
5186
5187void clear_extent_buffer_uptodate(struct extent_buffer *eb)
5188{
5189        int i;
5190        struct page *page;
5191        int num_pages;
5192
5193        clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5194        num_pages = num_extent_pages(eb);
5195        for (i = 0; i < num_pages; i++) {
5196                page = eb->pages[i];
5197                if (page)
5198                        ClearPageUptodate(page);
5199        }
5200}
5201
5202void set_extent_buffer_uptodate(struct extent_buffer *eb)
5203{
5204        int i;
5205        struct page *page;
5206        int num_pages;
5207
5208        set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5209        num_pages = num_extent_pages(eb);
5210        for (i = 0; i < num_pages; i++) {
5211                page = eb->pages[i];
5212                SetPageUptodate(page);
5213        }
5214}
5215
5216int read_extent_buffer_pages(struct extent_io_tree *tree,
5217                             struct extent_buffer *eb, int wait, int mirror_num)
5218{
5219        int i;
5220        struct page *page;
5221        int err;
5222        int ret = 0;
5223        int locked_pages = 0;
5224        int all_uptodate = 1;
5225        int num_pages;
5226        unsigned long num_reads = 0;
5227        struct bio *bio = NULL;
5228        unsigned long bio_flags = 0;
5229
5230        if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
5231                return 0;
5232
5233        num_pages = num_extent_pages(eb);
5234        for (i = 0; i < num_pages; i++) {
5235                page = eb->pages[i];
5236                if (wait == WAIT_NONE) {
5237                        if (!trylock_page(page))
5238                                goto unlock_exit;
5239                } else {
5240                        lock_page(page);
5241                }
5242                locked_pages++;
5243        }
5244        /*
5245         * We need to firstly lock all pages to make sure that
5246         * the uptodate bit of our pages won't be affected by
5247         * clear_extent_buffer_uptodate().
5248         */
5249        for (i = 0; i < num_pages; i++) {
5250                page = eb->pages[i];
5251                if (!PageUptodate(page)) {
5252                        num_reads++;
5253                        all_uptodate = 0;
5254                }
5255        }
5256
5257        if (all_uptodate) {
5258                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5259                goto unlock_exit;
5260        }
5261
5262        clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
5263        eb->read_mirror = 0;
5264        atomic_set(&eb->io_pages, num_reads);
5265        for (i = 0; i < num_pages; i++) {
5266                page = eb->pages[i];
5267
5268                if (!PageUptodate(page)) {
5269                        if (ret) {
5270                                atomic_dec(&eb->io_pages);
5271                                unlock_page(page);
5272                                continue;
5273                        }
5274
5275                        ClearPageError(page);
5276                        err = __extent_read_full_page(tree, page,
5277                                                      btree_get_extent, &bio,
5278                                                      mirror_num, &bio_flags,
5279                                                      REQ_META);
5280                        if (err) {
5281                                ret = err;
5282                                /*
5283                                 * We use &bio in above __extent_read_full_page,
5284                                 * so we ensure that if it returns error, the
5285                                 * current page fails to add itself to bio and
5286                                 * it's been unlocked.
5287                                 *
5288                                 * We must dec io_pages by ourselves.
5289                                 */
5290                                atomic_dec(&eb->io_pages);
5291                        }
5292                } else {
5293                        unlock_page(page);
5294                }
5295        }
5296
5297        if (bio) {
5298                err = submit_one_bio(bio, mirror_num, bio_flags);
5299                if (err)
5300                        return err;
5301        }
5302
5303        if (ret || wait != WAIT_COMPLETE)
5304                return ret;
5305
5306        for (i = 0; i < num_pages; i++) {
5307                page = eb->pages[i];
5308                wait_on_page_locked(page);
5309                if (!PageUptodate(page))
5310                        ret = -EIO;
5311        }
5312
5313        return ret;
5314
5315unlock_exit:
5316        while (locked_pages > 0) {
5317                locked_pages--;
5318                page = eb->pages[locked_pages];
5319                unlock_page(page);
5320        }
5321        return ret;
5322}
5323
5324void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
5325                        unsigned long start, unsigned long len)
5326{
5327        size_t cur;
5328        size_t offset;
5329        struct page *page;
5330        char *kaddr;
5331        char *dst = (char *)dstv;
5332        size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
5333        unsigned long i = (start_offset + start) >> PAGE_SHIFT;
5334
5335        if (start + len > eb->len) {
5336                WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
5337                     eb->start, eb->len, start, len);
5338                memset(dst, 0, len);
5339                return;
5340        }
5341
5342        offset = (start_offset + start) & (PAGE_SIZE - 1);
5343
5344        while (len > 0) {
5345                page = eb->pages[i];
5346
5347                cur = min(len, (PAGE_SIZE - offset));
5348                kaddr = page_address(page);
5349                memcpy(dst, kaddr + offset, cur);
5350
5351                dst += cur;
5352                len -= cur;
5353                offset = 0;
5354                i++;
5355        }
5356}
5357
5358int read_extent_buffer_to_user(const struct extent_buffer *eb,
5359                               void __user *dstv,
5360                               unsigned long start, unsigned long len)
5361{
5362        size_t cur;
5363        size_t offset;
5364        struct page *page;
5365        char *kaddr;
5366        char __user *dst = (char __user *)dstv;
5367        size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
5368        unsigned long i = (start_offset + start) >> PAGE_SHIFT;
5369        int ret = 0;
5370
5371        WARN_ON(start > eb->len);
5372        WARN_ON(start + len > eb->start + eb->len);
5373
5374        offset = (start_offset + start) & (PAGE_SIZE - 1);
5375
5376        while (len > 0) {
5377                page = eb->pages[i];
5378
5379                cur = min(len, (PAGE_SIZE - offset));
5380                kaddr = page_address(page);
5381                if (copy_to_user(dst, kaddr + offset, cur)) {
5382                        ret = -EFAULT;
5383                        break;
5384                }
5385
5386                dst += cur;
5387                len -= cur;
5388                offset = 0;
5389                i++;
5390        }
5391
5392        return ret;
5393}
5394
5395/*
5396 * return 0 if the item is found within a page.
5397 * return 1 if the item spans two pages.
5398 * return -EINVAL otherwise.
5399 */
5400int map_private_extent_buffer(const struct extent_buffer *eb,
5401                              unsigned long start, unsigned long min_len,
5402                              char **map, unsigned long *map_start,
5403                              unsigned long *map_len)
5404{
5405        size_t offset = start & (PAGE_SIZE - 1);
5406        char *kaddr;
5407        struct page *p;
5408        size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
5409        unsigned long i = (start_offset + start) >> PAGE_SHIFT;
5410        unsigned long end_i = (start_offset + start + min_len - 1) >>
5411                PAGE_SHIFT;
5412
5413        if (start + min_len > eb->len) {
5414                WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
5415                       eb->start, eb->len, start, min_len);
5416                return -EINVAL;
5417        }
5418
5419        if (i != end_i)
5420                return 1;
5421
5422        if (i == 0) {
5423                offset = start_offset;
5424                *map_start = 0;
5425        } else {
5426                offset = 0;
5427                *map_start = ((u64)i << PAGE_SHIFT) - start_offset;
5428        }
5429
5430        p = eb->pages[i];
5431        kaddr = page_address(p);
5432        *map = kaddr + offset;
5433        *map_len = PAGE_SIZE - offset;
5434        return 0;
5435}
5436
5437int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
5438                         unsigned long start, unsigned long len)
5439{
5440        size_t cur;
5441        size_t offset;
5442        struct page *page;
5443        char *kaddr;
5444        char *ptr = (char *)ptrv;
5445        size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
5446        unsigned long i = (start_offset + start) >> PAGE_SHIFT;
5447        int ret = 0;
5448
5449        WARN_ON(start > eb->len);
5450        WARN_ON(start + len > eb->start + eb->len);
5451
5452        offset = (start_offset + start) & (PAGE_SIZE - 1);
5453
5454        while (len > 0) {
5455                page = eb->pages[i];
5456
5457                cur = min(len, (PAGE_SIZE - offset));
5458
5459                kaddr = page_address(page);
5460                ret = memcmp(ptr, kaddr + offset, cur);
5461                if (ret)
5462                        break;
5463
5464                ptr += cur;
5465                len -= cur;
5466                offset = 0;
5467                i++;
5468        }
5469        return ret;
5470}
5471
5472void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
5473                const void *srcv)
5474{
5475        char *kaddr;
5476
5477        WARN_ON(!PageUptodate(eb->pages[0]));
5478        kaddr = page_address(eb->pages[0]);
5479        memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
5480                        BTRFS_FSID_SIZE);
5481}
5482
5483void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv)
5484{
5485        char *kaddr;
5486
5487        WARN_ON(!PageUptodate(eb->pages[0]));
5488        kaddr = page_address(eb->pages[0]);
5489        memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
5490                        BTRFS_FSID_SIZE);
5491}
5492
5493void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
5494                         unsigned long start, unsigned long len)
5495{
5496        size_t cur;
5497        size_t offset;
5498        struct page *page;
5499        char *kaddr;
5500        char *src = (char *)srcv;
5501        size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
5502        unsigned long i = (start_offset + start) >> PAGE_SHIFT;
5503
5504        WARN_ON(start > eb->len);
5505        WARN_ON(start + len > eb->start + eb->len);
5506
5507        offset = (start_offset + start) & (PAGE_SIZE - 1);
5508
5509        while (len > 0) {
5510                page = eb->pages[i];
5511                WARN_ON(!PageUptodate(page));
5512
5513                cur = min(len, PAGE_SIZE - offset);
5514                kaddr = page_address(page);
5515                memcpy(kaddr + offset, src, cur);
5516
5517                src += cur;
5518                len -= cur;
5519                offset = 0;
5520                i++;
5521        }
5522}
5523
5524void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
5525                unsigned long len)
5526{
5527        size_t cur;
5528        size_t offset;
5529        struct page *page;
5530        char *kaddr;
5531        size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
5532        unsigned long i = (start_offset + start) >> PAGE_SHIFT;
5533
5534        WARN_ON(start > eb->len);
5535        WARN_ON(start + len > eb->start + eb->len);
5536
5537        offset = (start_offset + start) & (PAGE_SIZE - 1);
5538
5539        while (len > 0) {
5540                page = eb->pages[i];
5541                WARN_ON(!PageUptodate(page));
5542
5543                cur = min(len, PAGE_SIZE - offset);
5544                kaddr = page_address(page);
5545                memset(kaddr + offset, 0, cur);
5546
5547                len -= cur;
5548                offset = 0;
5549                i++;
5550        }
5551}
5552
5553void copy_extent_buffer_full(struct extent_buffer *dst,
5554                             struct extent_buffer *src)
5555{
5556        int i;
5557        int num_pages;
5558
5559        ASSERT(dst->len == src->len);
5560
5561        num_pages = num_extent_pages(dst);
5562        for (i = 0; i < num_pages; i++)
5563                copy_page(page_address(dst->pages[i]),
5564                                page_address(src->pages[i]));
5565}
5566
5567void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
5568                        unsigned long dst_offset, unsigned long src_offset,
5569                        unsigned long len)
5570{
5571        u64 dst_len = dst->len;
5572        size_t cur;
5573        size_t offset;
5574        struct page *page;
5575        char *kaddr;
5576        size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
5577        unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT;
5578
5579        WARN_ON(src->len != dst_len);
5580
5581        offset = (start_offset + dst_offset) &
5582                (PAGE_SIZE - 1);
5583
5584        while (len > 0) {
5585                page = dst->pages[i];
5586                WARN_ON(!PageUptodate(page));
5587
5588                cur = min(len, (unsigned long)(PAGE_SIZE - offset));
5589
5590                kaddr = page_address(page);
5591                read_extent_buffer(src, kaddr + offset, src_offset, cur);
5592
5593                src_offset += cur;
5594                len -= cur;
5595                offset = 0;
5596                i++;
5597        }
5598}
5599
5600/*
5601 * eb_bitmap_offset() - calculate the page and offset of the byte containing the
5602 * given bit number
5603 * @eb: the extent buffer
5604 * @start: offset of the bitmap item in the extent buffer
5605 * @nr: bit number
5606 * @page_index: return index of the page in the extent buffer that contains the
5607 * given bit number
5608 * @page_offset: return offset into the page given by page_index
5609 *
5610 * This helper hides the ugliness of finding the byte in an extent buffer which
5611 * contains a given bit.
5612 */
5613static inline void eb_bitmap_offset(struct extent_buffer *eb,
5614                                    unsigned long start, unsigned long nr,
5615                                    unsigned long *page_index,
5616                                    size_t *page_offset)
5617{
5618        size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
5619        size_t byte_offset = BIT_BYTE(nr);
5620        size_t offset;
5621
5622        /*
5623         * The byte we want is the offset of the extent buffer + the offset of
5624         * the bitmap item in the extent buffer + the offset of the byte in the
5625         * bitmap item.
5626         */
5627        offset = start_offset + start + byte_offset;
5628
5629        *page_index = offset >> PAGE_SHIFT;
5630        *page_offset = offset & (PAGE_SIZE - 1);
5631}
5632
5633/**
5634 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
5635 * @eb: the extent buffer
5636 * @start: offset of the bitmap item in the extent buffer
5637 * @nr: bit number to test
5638 */
5639int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
5640                           unsigned long nr)
5641{
5642        u8 *kaddr;
5643        struct page *page;
5644        unsigned long i;
5645        size_t offset;
5646
5647        eb_bitmap_offset(eb, start, nr, &i, &offset);
5648        page = eb->pages[i];
5649        WARN_ON(!PageUptodate(page));
5650        kaddr = page_address(page);
5651        return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
5652}
5653
5654/**
5655 * extent_buffer_bitmap_set - set an area of a bitmap
5656 * @eb: the extent buffer
5657 * @start: offset of the bitmap item in the extent buffer
5658 * @pos: bit number of the first bit
5659 * @len: number of bits to set
5660 */
5661void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
5662                              unsigned long pos, unsigned long len)
5663{
5664        u8 *kaddr;
5665        struct page *page;
5666        unsigned long i;
5667        size_t offset;
5668        const unsigned int size = pos + len;
5669        int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
5670        u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
5671
5672        eb_bitmap_offset(eb, start, pos, &i, &offset);
5673        page = eb->pages[i];
5674        WARN_ON(!PageUptodate(page));
5675        kaddr = page_address(page);
5676
5677        while (len >= bits_to_set) {
5678                kaddr[offset] |= mask_to_set;
5679                len -= bits_to_set;
5680                bits_to_set = BITS_PER_BYTE;
5681                mask_to_set = ~0;
5682                if (++offset >= PAGE_SIZE && len > 0) {
5683                        offset = 0;
5684                        page = eb->pages[++i];
5685                        WARN_ON(!PageUptodate(page));
5686                        kaddr = page_address(page);
5687                }
5688        }
5689        if (len) {
5690                mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
5691                kaddr[offset] |= mask_to_set;
5692        }
5693}
5694
5695
5696/**
5697 * extent_buffer_bitmap_clear - clear an area of a bitmap
5698 * @eb: the extent buffer
5699 * @start: offset of the bitmap item in the extent buffer
5700 * @pos: bit number of the first bit
5701 * @len: number of bits to clear
5702 */
5703void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
5704                                unsigned long pos, unsigned long len)
5705{
5706        u8 *kaddr;
5707        struct page *page;
5708        unsigned long i;
5709        size_t offset;
5710        const unsigned int size = pos + len;
5711        int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
5712        u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
5713
5714        eb_bitmap_offset(eb, start, pos, &i, &offset);
5715        page = eb->pages[i];
5716        WARN_ON(!PageUptodate(page));
5717        kaddr = page_address(page);
5718
5719        while (len >= bits_to_clear) {
5720                kaddr[offset] &= ~mask_to_clear;
5721                len -= bits_to_clear;
5722                bits_to_clear = BITS_PER_BYTE;
5723                mask_to_clear = ~0;
5724                if (++offset >= PAGE_SIZE && len > 0) {
5725                        offset = 0;
5726                        page = eb->pages[++i];
5727                        WARN_ON(!PageUptodate(page));
5728                        kaddr = page_address(page);
5729                }
5730        }
5731        if (len) {
5732                mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
5733                kaddr[offset] &= ~mask_to_clear;
5734        }
5735}
5736
5737static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
5738{
5739        unsigned long distance = (src > dst) ? src - dst : dst - src;
5740        return distance < len;
5741}
5742
5743static void copy_pages(struct page *dst_page, struct page *src_page,
5744                       unsigned long dst_off, unsigned long src_off,
5745                       unsigned long len)
5746{
5747        char *dst_kaddr = page_address(dst_page);
5748        char *src_kaddr;
5749        int must_memmove = 0;
5750
5751        if (dst_page != src_page) {
5752                src_kaddr = page_address(src_page);
5753        } else {
5754                src_kaddr = dst_kaddr;
5755                if (areas_overlap(src_off, dst_off, len))
5756                        must_memmove = 1;
5757        }
5758
5759        if (must_memmove)
5760                memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
5761        else
5762                memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
5763}
5764
5765void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5766                           unsigned long src_offset, unsigned long len)
5767{
5768        struct btrfs_fs_info *fs_info = dst->fs_info;
5769        size_t cur;
5770        size_t dst_off_in_page;
5771        size_t src_off_in_page;
5772        size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
5773        unsigned long dst_i;
5774        unsigned long src_i;
5775
5776        if (src_offset + len > dst->len) {
5777                btrfs_err(fs_info,
5778                        "memmove bogus src_offset %lu move len %lu dst len %lu",
5779                         src_offset, len, dst->len);
5780                BUG_ON(1);
5781        }
5782        if (dst_offset + len > dst->len) {
5783                btrfs_err(fs_info,
5784                        "memmove bogus dst_offset %lu move len %lu dst len %lu",
5785                         dst_offset, len, dst->len);
5786                BUG_ON(1);
5787        }
5788
5789        while (len > 0) {
5790                dst_off_in_page = (start_offset + dst_offset) &
5791                        (PAGE_SIZE - 1);
5792                src_off_in_page = (start_offset + src_offset) &
5793                        (PAGE_SIZE - 1);
5794
5795                dst_i = (start_offset + dst_offset) >> PAGE_SHIFT;
5796                src_i = (start_offset + src_offset) >> PAGE_SHIFT;
5797
5798                cur = min(len, (unsigned long)(PAGE_SIZE -
5799                                               src_off_in_page));
5800                cur = min_t(unsigned long, cur,
5801                        (unsigned long)(PAGE_SIZE - dst_off_in_page));
5802
5803                copy_pages(dst->pages[dst_i], dst->pages[src_i],
5804                           dst_off_in_page, src_off_in_page, cur);
5805
5806                src_offset += cur;
5807                dst_offset += cur;
5808                len -= cur;
5809        }
5810}
5811
5812void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5813                           unsigned long src_offset, unsigned long len)
5814{
5815        struct btrfs_fs_info *fs_info = dst->fs_info;
5816        size_t cur;
5817        size_t dst_off_in_page;
5818        size_t src_off_in_page;
5819        unsigned long dst_end = dst_offset + len - 1;
5820        unsigned long src_end = src_offset + len - 1;
5821        size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
5822        unsigned long dst_i;
5823        unsigned long src_i;
5824
5825        if (src_offset + len > dst->len) {
5826                btrfs_err(fs_info,
5827                          "memmove bogus src_offset %lu move len %lu len %lu",
5828                          src_offset, len, dst->len);
5829                BUG_ON(1);
5830        }
5831        if (dst_offset + len > dst->len) {
5832                btrfs_err(fs_info,
5833                          "memmove bogus dst_offset %lu move len %lu len %lu",
5834                          dst_offset, len, dst->len);
5835                BUG_ON(1);
5836        }
5837        if (dst_offset < src_offset) {
5838                memcpy_extent_buffer(dst, dst_offset, src_offset, len);
5839                return;
5840        }
5841        while (len > 0) {
5842                dst_i = (start_offset + dst_end) >> PAGE_SHIFT;
5843                src_i = (start_offset + src_end) >> PAGE_SHIFT;
5844
5845                dst_off_in_page = (start_offset + dst_end) &
5846                        (PAGE_SIZE - 1);
5847                src_off_in_page = (start_offset + src_end) &
5848                        (PAGE_SIZE - 1);
5849
5850                cur = min_t(unsigned long, len, src_off_in_page + 1);
5851                cur = min(cur, dst_off_in_page + 1);
5852                copy_pages(dst->pages[dst_i], dst->pages[src_i],
5853                           dst_off_in_page - cur + 1,
5854                           src_off_in_page - cur + 1, cur);
5855
5856                dst_end -= cur;
5857                src_end -= cur;
5858                len -= cur;
5859        }
5860}
5861
5862int try_release_extent_buffer(struct page *page)
5863{
5864        struct extent_buffer *eb;
5865
5866        /*
5867         * We need to make sure nobody is attaching this page to an eb right
5868         * now.
5869         */
5870        spin_lock(&page->mapping->private_lock);
5871        if (!PagePrivate(page)) {
5872                spin_unlock(&page->mapping->private_lock);
5873                return 1;
5874        }
5875
5876        eb = (struct extent_buffer *)page->private;
5877        BUG_ON(!eb);
5878
5879        /*
5880         * This is a little awful but should be ok, we need to make sure that
5881         * the eb doesn't disappear out from under us while we're looking at
5882         * this page.
5883         */
5884        spin_lock(&eb->refs_lock);
5885        if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
5886                spin_unlock(&eb->refs_lock);
5887                spin_unlock(&page->mapping->private_lock);
5888                return 0;
5889        }
5890        spin_unlock(&page->mapping->private_lock);
5891
5892        /*
5893         * If tree ref isn't set then we know the ref on this eb is a real ref,
5894         * so just return, this page will likely be freed soon anyway.
5895         */
5896        if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
5897                spin_unlock(&eb->refs_lock);
5898                return 0;
5899        }
5900
5901        return release_extent_buffer(eb);
5902}
5903