linux/fs/btrfs/extent_io.c
   1// SPDX-License-Identifier: GPL-2.0
   2
   3#include <linux/bitops.h>
   4#include <linux/slab.h>
   5#include <linux/bio.h>
   6#include <linux/mm.h>
   7#include <linux/pagemap.h>
   8#include <linux/page-flags.h>
   9#include <linux/spinlock.h>
  10#include <linux/blkdev.h>
  11#include <linux/swap.h>
  12#include <linux/writeback.h>
  13#include <linux/pagevec.h>
  14#include <linux/prefetch.h>
  15#include <linux/cleancache.h>
  16#include "extent_io.h"
  17#include "extent_map.h"
  18#include "ctree.h"
  19#include "btrfs_inode.h"
  20#include "volumes.h"
  21#include "check-integrity.h"
  22#include "locking.h"
  23#include "rcu-string.h"
  24#include "backref.h"
  25#include "disk-io.h"
  26
  27static struct kmem_cache *extent_state_cache;
  28static struct kmem_cache *extent_buffer_cache;
  29static struct bio_set *btrfs_bioset;
  30
  31static inline bool extent_state_in_tree(const struct extent_state *state)
  32{
  33        return !RB_EMPTY_NODE(&state->rb_node);
  34}
  35
  36#ifdef CONFIG_BTRFS_DEBUG
  37static LIST_HEAD(buffers);
  38static LIST_HEAD(states);
  39
  40static DEFINE_SPINLOCK(leak_lock);
  41
  42static inline
  43void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
  44{
  45        unsigned long flags;
  46
  47        spin_lock_irqsave(&leak_lock, flags);
  48        list_add(new, head);
  49        spin_unlock_irqrestore(&leak_lock, flags);
  50}
  51
  52static inline
  53void btrfs_leak_debug_del(struct list_head *entry)
  54{
  55        unsigned long flags;
  56
  57        spin_lock_irqsave(&leak_lock, flags);
  58        list_del(entry);
  59        spin_unlock_irqrestore(&leak_lock, flags);
  60}
  61
  62static inline
  63void btrfs_leak_debug_check(void)
  64{
  65        struct extent_state *state;
  66        struct extent_buffer *eb;
  67
  68        while (!list_empty(&states)) {
  69                state = list_entry(states.next, struct extent_state, leak_list);
  70                pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
  71                       state->start, state->end, state->state,
  72                       extent_state_in_tree(state),
  73                       refcount_read(&state->refs));
  74                list_del(&state->leak_list);
  75                kmem_cache_free(extent_state_cache, state);
  76        }
  77
  78        while (!list_empty(&buffers)) {
  79                eb = list_entry(buffers.next, struct extent_buffer, leak_list);
  80                pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
  81                       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
  82                list_del(&eb->leak_list);
  83                kmem_cache_free(extent_buffer_cache, eb);
  84        }
  85}
  86
  87#define btrfs_debug_check_extent_io_range(tree, start, end)             \
  88        __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
  89static inline void __btrfs_debug_check_extent_io_range(const char *caller,
  90                struct extent_io_tree *tree, u64 start, u64 end)
  91{
  92        if (tree->ops && tree->ops->check_extent_io_range)
  93                tree->ops->check_extent_io_range(tree->private_data, caller,
  94                                                 start, end);
  95}
  96#else
  97#define btrfs_leak_debug_add(new, head) do {} while (0)
  98#define btrfs_leak_debug_del(entry)     do {} while (0)
  99#define btrfs_leak_debug_check()        do {} while (0)
 100#define btrfs_debug_check_extent_io_range(c, s, e)      do {} while (0)
 101#endif
 102
 103#define BUFFER_LRU_MAX 64
 104
 105struct tree_entry {
 106        u64 start;
 107        u64 end;
 108        struct rb_node rb_node;
 109};
 110
 111struct extent_page_data {
 112        struct bio *bio;
 113        struct extent_io_tree *tree;
 114        /* tells writepage not to lock the state bits for this range
 115         * it still does the unlocking
 116         */
 117        unsigned int extent_locked:1;
 118
 119        /* tells the submit_bio code to use REQ_SYNC */
 120        unsigned int sync_io:1;
 121};
 122
 123static int add_extent_changeset(struct extent_state *state, unsigned bits,
 124                                 struct extent_changeset *changeset,
 125                                 int set)
 126{
 127        int ret;
 128
 129        if (!changeset)
 130                return 0;
 131        if (set && (state->state & bits) == bits)
 132                return 0;
 133        if (!set && (state->state & bits) == 0)
 134                return 0;
 135        changeset->bytes_changed += state->end - state->start + 1;
 136        ret = ulist_add(&changeset->range_changed, state->start, state->end,
 137                        GFP_ATOMIC);
 138        return ret;
 139}
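/*
 * Worked example (hypothetical numbers): clearing EXTENT_DELALLOC from a
 * state covering [0, 4095] that has the bit set adds 4096 bytes to
 * changeset->bytes_changed and records the range [0, 4095] in
 * changeset->range_changed; if the bit was already clear, nothing is
 * recorded.
 */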
 140
 141static void flush_write_bio(struct extent_page_data *epd);
 142
 143static inline struct btrfs_fs_info *
 144tree_fs_info(struct extent_io_tree *tree)
 145{
 146        if (tree->ops)
 147                return tree->ops->tree_fs_info(tree->private_data);
 148        return NULL;
 149}
 150
 151int __init extent_io_init(void)
 152{
 153        extent_state_cache = kmem_cache_create("btrfs_extent_state",
 154                        sizeof(struct extent_state), 0,
 155                        SLAB_MEM_SPREAD, NULL);
 156        if (!extent_state_cache)
 157                return -ENOMEM;
 158
 159        extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
 160                        sizeof(struct extent_buffer), 0,
 161                        SLAB_MEM_SPREAD, NULL);
 162        if (!extent_buffer_cache)
 163                goto free_state_cache;
 164
 165        btrfs_bioset = bioset_create(BIO_POOL_SIZE,
 166                                     offsetof(struct btrfs_io_bio, bio),
 167                                     BIOSET_NEED_BVECS);
 168        if (!btrfs_bioset)
 169                goto free_buffer_cache;
 170
 171        if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE))
 172                goto free_bioset;
 173
 174        return 0;
 175
 176free_bioset:
 177        bioset_free(btrfs_bioset);
 178        btrfs_bioset = NULL;
 179
 180free_buffer_cache:
 181        kmem_cache_destroy(extent_buffer_cache);
 182        extent_buffer_cache = NULL;
 183
 184free_state_cache:
 185        kmem_cache_destroy(extent_state_cache);
 186        extent_state_cache = NULL;
 187        return -ENOMEM;
 188}
 189
 190void __cold extent_io_exit(void)
 191{
 192        btrfs_leak_debug_check();
 193
 194        /*
 195         * Make sure all delayed rcu free are flushed before we
 196         * destroy caches.
 197         */
 198        rcu_barrier();
 199        kmem_cache_destroy(extent_state_cache);
 200        kmem_cache_destroy(extent_buffer_cache);
 201        if (btrfs_bioset)
 202                bioset_free(btrfs_bioset);
 203}
 204
 205void extent_io_tree_init(struct extent_io_tree *tree,
 206                         void *private_data)
 207{
 208        tree->state = RB_ROOT;
 209        tree->ops = NULL;
 210        tree->dirty_bytes = 0;
 211        spin_lock_init(&tree->lock);
 212        tree->private_data = private_data;
 213}
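/*
 * Typical initialization, as a minimal sketch: btrfs keeps one of these
 * trees per inode and passes the inode itself as private_data.  The
 * BTRFS_I(inode)->io_tree destination below reflects that convention and
 * is illustrative:
 *
 *	extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode);
 */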
 214
 215static struct extent_state *alloc_extent_state(gfp_t mask)
 216{
 217        struct extent_state *state;
 218
 219        /*
 220         * The given mask might be not appropriate for the slab allocator,
 221         * drop the unsupported bits
 222         */
 223        mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
 224        state = kmem_cache_alloc(extent_state_cache, mask);
 225        if (!state)
 226                return state;
 227        state->state = 0;
 228        state->failrec = NULL;
 229        RB_CLEAR_NODE(&state->rb_node);
 230        btrfs_leak_debug_add(&state->leak_list, &states);
 231        refcount_set(&state->refs, 1);
 232        init_waitqueue_head(&state->wq);
 233        trace_alloc_extent_state(state, mask, _RET_IP_);
 234        return state;
 235}
 236
 237void free_extent_state(struct extent_state *state)
 238{
 239        if (!state)
 240                return;
 241        if (refcount_dec_and_test(&state->refs)) {
 242                WARN_ON(extent_state_in_tree(state));
 243                btrfs_leak_debug_del(&state->leak_list);
 244                trace_free_extent_state(state, _RET_IP_);
 245                kmem_cache_free(extent_state_cache, state);
 246        }
 247}
 248
 249static struct rb_node *tree_insert(struct rb_root *root,
 250                                   struct rb_node *search_start,
 251                                   u64 offset,
 252                                   struct rb_node *node,
 253                                   struct rb_node ***p_in,
 254                                   struct rb_node **parent_in)
 255{
 256        struct rb_node **p;
 257        struct rb_node *parent = NULL;
 258        struct tree_entry *entry;
 259
 260        if (p_in && parent_in) {
 261                p = *p_in;
 262                parent = *parent_in;
 263                goto do_insert;
 264        }
 265
 266        p = search_start ? &search_start : &root->rb_node;
 267        while (*p) {
 268                parent = *p;
 269                entry = rb_entry(parent, struct tree_entry, rb_node);
 270
 271                if (offset < entry->start)
 272                        p = &(*p)->rb_left;
 273                else if (offset > entry->end)
 274                        p = &(*p)->rb_right;
 275                else
 276                        return parent;
 277        }
 278
 279do_insert:
 280        rb_link_node(node, parent, p);
 281        rb_insert_color(node, root);
 282        return NULL;
 283}
 284
 285static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 286                                      struct rb_node **prev_ret,
 287                                      struct rb_node **next_ret,
 288                                      struct rb_node ***p_ret,
 289                                      struct rb_node **parent_ret)
 290{
 291        struct rb_root *root = &tree->state;
 292        struct rb_node **n = &root->rb_node;
 293        struct rb_node *prev = NULL;
 294        struct rb_node *orig_prev = NULL;
 295        struct tree_entry *entry;
 296        struct tree_entry *prev_entry = NULL;
 297
 298        while (*n) {
 299                prev = *n;
 300                entry = rb_entry(prev, struct tree_entry, rb_node);
 301                prev_entry = entry;
 302
 303                if (offset < entry->start)
 304                        n = &(*n)->rb_left;
 305                else if (offset > entry->end)
 306                        n = &(*n)->rb_right;
 307                else
 308                        return *n;
 309        }
 310
 311        if (p_ret)
 312                *p_ret = n;
 313        if (parent_ret)
 314                *parent_ret = prev;
 315
 316        if (prev_ret) {
 317                orig_prev = prev;
 318                while (prev && offset > prev_entry->end) {
 319                        prev = rb_next(prev);
 320                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 321                }
 322                *prev_ret = prev;
 323                prev = orig_prev;
 324        }
 325
 326        if (next_ret) {
 327                prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 328                while (prev && offset < prev_entry->start) {
 329                        prev = rb_prev(prev);
 330                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 331                }
 332                *next_ret = prev;
 333        }
 334        return NULL;
 335}
 336
 337static inline struct rb_node *
 338tree_search_for_insert(struct extent_io_tree *tree,
 339                       u64 offset,
 340                       struct rb_node ***p_ret,
 341                       struct rb_node **parent_ret)
 342{
 343        struct rb_node *prev = NULL;
 344        struct rb_node *ret;
 345
 346        ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
 347        if (!ret)
 348                return prev;
 349        return ret;
 350}
 351
 352static inline struct rb_node *tree_search(struct extent_io_tree *tree,
 353                                          u64 offset)
 354{
 355        return tree_search_for_insert(tree, offset, NULL, NULL);
 356}
 357
 358static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
 359                     struct extent_state *other)
 360{
 361        if (tree->ops && tree->ops->merge_extent_hook)
 362                tree->ops->merge_extent_hook(tree->private_data, new, other);
 363}
 364
 365/*
 366 * utility function to look for merge candidates inside a given range.
 367 * Any extents with matching state are merged together into a single
  368 * extent in the tree.  Extents with EXTENT_IOBITS or EXTENT_BOUNDARY set in
  369 * their state field are not merged because the end_io handlers need to be
  370 * able to do operations on them without sleeping (or doing allocations/splits).
 371 *
 372 * This should be called with the tree lock held.
 373 */
 374static void merge_state(struct extent_io_tree *tree,
 375                        struct extent_state *state)
 376{
 377        struct extent_state *other;
 378        struct rb_node *other_node;
 379
 380        if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
 381                return;
 382
 383        other_node = rb_prev(&state->rb_node);
 384        if (other_node) {
 385                other = rb_entry(other_node, struct extent_state, rb_node);
 386                if (other->end == state->start - 1 &&
 387                    other->state == state->state) {
 388                        merge_cb(tree, state, other);
 389                        state->start = other->start;
 390                        rb_erase(&other->rb_node, &tree->state);
 391                        RB_CLEAR_NODE(&other->rb_node);
 392                        free_extent_state(other);
 393                }
 394        }
 395        other_node = rb_next(&state->rb_node);
 396        if (other_node) {
 397                other = rb_entry(other_node, struct extent_state, rb_node);
 398                if (other->start == state->end + 1 &&
 399                    other->state == state->state) {
 400                        merge_cb(tree, state, other);
 401                        state->end = other->end;
 402                        rb_erase(&other->rb_node, &tree->state);
 403                        RB_CLEAR_NODE(&other->rb_node);
 404                        free_extent_state(other);
 405                }
 406        }
 407}
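/*
 * Worked example (hypothetical offsets): if the tree holds [0, 4095] and
 * [4096, 8191] with identical state bits and neither has EXTENT_IOBITS or
 * EXTENT_BOUNDARY set, calling merge_state() on either one leaves a single
 * state covering [0, 8191] and frees the other.
 */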
 408
 409static void set_state_cb(struct extent_io_tree *tree,
 410                         struct extent_state *state, unsigned *bits)
 411{
 412        if (tree->ops && tree->ops->set_bit_hook)
 413                tree->ops->set_bit_hook(tree->private_data, state, bits);
 414}
 415
 416static void clear_state_cb(struct extent_io_tree *tree,
 417                           struct extent_state *state, unsigned *bits)
 418{
 419        if (tree->ops && tree->ops->clear_bit_hook)
 420                tree->ops->clear_bit_hook(tree->private_data, state, bits);
 421}
 422
 423static void set_state_bits(struct extent_io_tree *tree,
 424                           struct extent_state *state, unsigned *bits,
 425                           struct extent_changeset *changeset);
 426
 427/*
 428 * insert an extent_state struct into the tree.  'bits' are set on the
 429 * struct before it is inserted.
 430 *
 431 * This may return -EEXIST if the extent is already there, in which case the
 432 * state struct is freed.
 433 *
 434 * The tree lock is not taken internally.  This is a utility function and
 435 * probably isn't what you want to call (see set/clear_extent_bit).
 436 */
 437static int insert_state(struct extent_io_tree *tree,
 438                        struct extent_state *state, u64 start, u64 end,
 439                        struct rb_node ***p,
 440                        struct rb_node **parent,
 441                        unsigned *bits, struct extent_changeset *changeset)
 442{
 443        struct rb_node *node;
 444
 445        if (end < start)
 446                WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n",
 447                       end, start);
 448        state->start = start;
 449        state->end = end;
 450
 451        set_state_bits(tree, state, bits, changeset);
 452
 453        node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
 454        if (node) {
 455                struct extent_state *found;
 456                found = rb_entry(node, struct extent_state, rb_node);
 457                pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n",
 458                       found->start, found->end, start, end);
 459                return -EEXIST;
 460        }
 461        merge_state(tree, state);
 462        return 0;
 463}
 464
 465static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
 466                     u64 split)
 467{
 468        if (tree->ops && tree->ops->split_extent_hook)
 469                tree->ops->split_extent_hook(tree->private_data, orig, split);
 470}
 471
 472/*
 473 * split a given extent state struct in two, inserting the preallocated
 474 * struct 'prealloc' as the newly created second half.  'split' indicates an
 475 * offset inside 'orig' where it should be split.
 476 *
 477 * Before calling,
 478 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 479 * are two extent state structs in the tree:
 480 * prealloc: [orig->start, split - 1]
 481 * orig: [ split, orig->end ]
 482 *
 483 * The tree locks are not taken by this function. They need to be held
 484 * by the caller.
 485 */
 486static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 487                       struct extent_state *prealloc, u64 split)
 488{
 489        struct rb_node *node;
 490
 491        split_cb(tree, orig, split);
 492
 493        prealloc->start = orig->start;
 494        prealloc->end = split - 1;
 495        prealloc->state = orig->state;
 496        orig->start = split;
 497
 498        node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
 499                           &prealloc->rb_node, NULL, NULL);
 500        if (node) {
 501                free_extent_state(prealloc);
 502                return -EEXIST;
 503        }
 504        return 0;
 505}
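/*
 * Worked example (hypothetical offsets): splitting an 'orig' state covering
 * [0, 8191] at split == 4096 leaves 'prealloc' covering [0, 4095] and
 * 'orig' covering [4096, 8191], both carrying the original state bits.
 */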
 506
 507static struct extent_state *next_state(struct extent_state *state)
 508{
 509        struct rb_node *next = rb_next(&state->rb_node);
 510        if (next)
 511                return rb_entry(next, struct extent_state, rb_node);
 512        else
 513                return NULL;
 514}
 515
 516/*
 517 * utility function to clear some bits in an extent state struct.
  518 * it will optionally wake up anyone waiting on this state (wake == 1).
 519 *
 520 * If no bits are set on the state struct after clearing things, the
 521 * struct is freed and removed from the tree
 522 */
 523static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
 524                                            struct extent_state *state,
 525                                            unsigned *bits, int wake,
 526                                            struct extent_changeset *changeset)
 527{
 528        struct extent_state *next;
 529        unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
 530        int ret;
 531
 532        if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
 533                u64 range = state->end - state->start + 1;
 534                WARN_ON(range > tree->dirty_bytes);
 535                tree->dirty_bytes -= range;
 536        }
 537        clear_state_cb(tree, state, bits);
 538        ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
 539        BUG_ON(ret < 0);
 540        state->state &= ~bits_to_clear;
 541        if (wake)
 542                wake_up(&state->wq);
 543        if (state->state == 0) {
 544                next = next_state(state);
 545                if (extent_state_in_tree(state)) {
 546                        rb_erase(&state->rb_node, &tree->state);
 547                        RB_CLEAR_NODE(&state->rb_node);
 548                        free_extent_state(state);
 549                } else {
 550                        WARN_ON(1);
 551                }
 552        } else {
 553                merge_state(tree, state);
 554                next = next_state(state);
 555        }
 556        return next;
 557}
 558
 559static struct extent_state *
 560alloc_extent_state_atomic(struct extent_state *prealloc)
 561{
 562        if (!prealloc)
 563                prealloc = alloc_extent_state(GFP_ATOMIC);
 564
 565        return prealloc;
 566}
 567
 568static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
 569{
 570        btrfs_panic(tree_fs_info(tree), err,
 571                    "Locking error: Extent tree was modified by another thread while locked.");
 572}
 573
 574/*
 575 * clear some bits on a range in the tree.  This may require splitting
 576 * or inserting elements in the tree, so the gfp mask is used to
 577 * indicate which allocations or sleeping are allowed.
 578 *
 579 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 580 * the given range from the tree regardless of state (ie for truncate).
 581 *
 582 * the range [start, end] is inclusive.
 583 *
 584 * This takes the tree lock, and returns 0 on success and < 0 on error.
 585 */
 586int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 587                              unsigned bits, int wake, int delete,
 588                              struct extent_state **cached_state,
 589                              gfp_t mask, struct extent_changeset *changeset)
 590{
 591        struct extent_state *state;
 592        struct extent_state *cached;
 593        struct extent_state *prealloc = NULL;
 594        struct rb_node *node;
 595        u64 last_end;
 596        int err;
 597        int clear = 0;
 598
 599        btrfs_debug_check_extent_io_range(tree, start, end);
 600
 601        if (bits & EXTENT_DELALLOC)
 602                bits |= EXTENT_NORESERVE;
 603
 604        if (delete)
 605                bits |= ~EXTENT_CTLBITS;
 606        bits |= EXTENT_FIRST_DELALLOC;
 607
 608        if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
 609                clear = 1;
 610again:
 611        if (!prealloc && gfpflags_allow_blocking(mask)) {
 612                /*
 613                 * Don't care for allocation failure here because we might end
 614                 * up not needing the pre-allocated extent state at all, which
 615                 * is the case if we only have in the tree extent states that
  616                 * cover our input range and don't cover any other range.
 617                 * If we end up needing a new extent state we allocate it later.
 618                 */
 619                prealloc = alloc_extent_state(mask);
 620        }
 621
 622        spin_lock(&tree->lock);
 623        if (cached_state) {
 624                cached = *cached_state;
 625
 626                if (clear) {
 627                        *cached_state = NULL;
 628                        cached_state = NULL;
 629                }
 630
 631                if (cached && extent_state_in_tree(cached) &&
 632                    cached->start <= start && cached->end > start) {
 633                        if (clear)
 634                                refcount_dec(&cached->refs);
 635                        state = cached;
 636                        goto hit_next;
 637                }
 638                if (clear)
 639                        free_extent_state(cached);
 640        }
 641        /*
 642         * this search will find the extents that end after
 643         * our range starts
 644         */
 645        node = tree_search(tree, start);
 646        if (!node)
 647                goto out;
 648        state = rb_entry(node, struct extent_state, rb_node);
 649hit_next:
 650        if (state->start > end)
 651                goto out;
 652        WARN_ON(state->end < start);
 653        last_end = state->end;
 654
 655        /* the state doesn't have the wanted bits, go ahead */
 656        if (!(state->state & bits)) {
 657                state = next_state(state);
 658                goto next;
 659        }
 660
 661        /*
 662         *     | ---- desired range ---- |
 663         *  | state | or
 664         *  | ------------- state -------------- |
 665         *
 666         * We need to split the extent we found, and may flip
 667         * bits on second half.
 668         *
 669         * If the extent we found extends past our range, we
 670         * just split and search again.  It'll get split again
 671         * the next time though.
 672         *
 673         * If the extent we found is inside our range, we clear
 674         * the desired bit on it.
 675         */
 676
 677        if (state->start < start) {
 678                prealloc = alloc_extent_state_atomic(prealloc);
 679                BUG_ON(!prealloc);
 680                err = split_state(tree, state, prealloc, start);
 681                if (err)
 682                        extent_io_tree_panic(tree, err);
 683
 684                prealloc = NULL;
 685                if (err)
 686                        goto out;
 687                if (state->end <= end) {
 688                        state = clear_state_bit(tree, state, &bits, wake,
 689                                                changeset);
 690                        goto next;
 691                }
 692                goto search_again;
 693        }
 694        /*
 695         * | ---- desired range ---- |
 696         *                        | state |
 697         * We need to split the extent, and clear the bit
 698         * on the first half
 699         */
 700        if (state->start <= end && state->end > end) {
 701                prealloc = alloc_extent_state_atomic(prealloc);
 702                BUG_ON(!prealloc);
 703                err = split_state(tree, state, prealloc, end + 1);
 704                if (err)
 705                        extent_io_tree_panic(tree, err);
 706
 707                if (wake)
 708                        wake_up(&state->wq);
 709
 710                clear_state_bit(tree, prealloc, &bits, wake, changeset);
 711
 712                prealloc = NULL;
 713                goto out;
 714        }
 715
 716        state = clear_state_bit(tree, state, &bits, wake, changeset);
 717next:
 718        if (last_end == (u64)-1)
 719                goto out;
 720        start = last_end + 1;
 721        if (start <= end && state && !need_resched())
 722                goto hit_next;
 723
 724search_again:
 725        if (start > end)
 726                goto out;
 727        spin_unlock(&tree->lock);
 728        if (gfpflags_allow_blocking(mask))
 729                cond_resched();
 730        goto again;
 731
 732out:
 733        spin_unlock(&tree->lock);
 734        if (prealloc)
 735                free_extent_state(prealloc);
 736
 737        return 0;
 738
 739}
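/*
 * Minimal usage sketch (the tree and range are hypothetical): drop the
 * delalloc bit from a byte range without waking waiters or deleting other
 * bits.  The clear_extent_bit() wrapper further down does the same with
 * mask and changeset fixed to GFP_NOFS and NULL:
 *
 *	__clear_extent_bit(tree, start, end, EXTENT_DELALLOC, 0, 0,
 *			   NULL, GFP_NOFS, NULL);
 */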
 740
 741static void wait_on_state(struct extent_io_tree *tree,
 742                          struct extent_state *state)
 743                __releases(tree->lock)
 744                __acquires(tree->lock)
 745{
 746        DEFINE_WAIT(wait);
 747        prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
 748        spin_unlock(&tree->lock);
 749        schedule();
 750        spin_lock(&tree->lock);
 751        finish_wait(&state->wq, &wait);
 752}
 753
 754/*
 755 * waits for one or more bits to clear on a range in the state tree.
 756 * The range [start, end] is inclusive.
 757 * The tree lock is taken by this function
 758 */
 759static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 760                            unsigned long bits)
 761{
 762        struct extent_state *state;
 763        struct rb_node *node;
 764
 765        btrfs_debug_check_extent_io_range(tree, start, end);
 766
 767        spin_lock(&tree->lock);
 768again:
 769        while (1) {
 770                /*
 771                 * this search will find all the extents that end after
 772                 * our range starts
 773                 */
 774                node = tree_search(tree, start);
 775process_node:
 776                if (!node)
 777                        break;
 778
 779                state = rb_entry(node, struct extent_state, rb_node);
 780
 781                if (state->start > end)
 782                        goto out;
 783
 784                if (state->state & bits) {
 785                        start = state->start;
 786                        refcount_inc(&state->refs);
 787                        wait_on_state(tree, state);
 788                        free_extent_state(state);
 789                        goto again;
 790                }
 791                start = state->end + 1;
 792
 793                if (start > end)
 794                        break;
 795
 796                if (!cond_resched_lock(&tree->lock)) {
 797                        node = rb_next(node);
 798                        goto process_node;
 799                }
 800        }
 801out:
 802        spin_unlock(&tree->lock);
 803}
 804
 805static void set_state_bits(struct extent_io_tree *tree,
 806                           struct extent_state *state,
 807                           unsigned *bits, struct extent_changeset *changeset)
 808{
 809        unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
 810        int ret;
 811
 812        set_state_cb(tree, state, bits);
 813        if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
 814                u64 range = state->end - state->start + 1;
 815                tree->dirty_bytes += range;
 816        }
 817        ret = add_extent_changeset(state, bits_to_set, changeset, 1);
 818        BUG_ON(ret < 0);
 819        state->state |= bits_to_set;
 820}
 821
 822static void cache_state_if_flags(struct extent_state *state,
 823                                 struct extent_state **cached_ptr,
 824                                 unsigned flags)
 825{
 826        if (cached_ptr && !(*cached_ptr)) {
 827                if (!flags || (state->state & flags)) {
 828                        *cached_ptr = state;
 829                        refcount_inc(&state->refs);
 830                }
 831        }
 832}
 833
 834static void cache_state(struct extent_state *state,
 835                        struct extent_state **cached_ptr)
 836{
 837        return cache_state_if_flags(state, cached_ptr,
 838                                    EXTENT_IOBITS | EXTENT_BOUNDARY);
 839}
 840
 841/*
 842 * set some bits on a range in the tree.  This may require allocations or
 843 * sleeping, so the gfp mask is used to indicate what is allowed.
 844 *
 845 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 846 * part of the range already has the desired bits set.  The start of the
 847 * existing range is returned in failed_start in this case.
 848 *
  849 * [start, end] is inclusive.  This takes the tree lock.
 850 */
 851
 852static int __must_check
 853__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 854                 unsigned bits, unsigned exclusive_bits,
 855                 u64 *failed_start, struct extent_state **cached_state,
 856                 gfp_t mask, struct extent_changeset *changeset)
 857{
 858        struct extent_state *state;
 859        struct extent_state *prealloc = NULL;
 860        struct rb_node *node;
 861        struct rb_node **p;
 862        struct rb_node *parent;
 863        int err = 0;
 864        u64 last_start;
 865        u64 last_end;
 866
 867        btrfs_debug_check_extent_io_range(tree, start, end);
 868
 869        bits |= EXTENT_FIRST_DELALLOC;
 870again:
 871        if (!prealloc && gfpflags_allow_blocking(mask)) {
 872                /*
 873                 * Don't care for allocation failure here because we might end
 874                 * up not needing the pre-allocated extent state at all, which
 875                 * is the case if we only have in the tree extent states that
  876                 * cover our input range and don't cover any other range.
 877                 * If we end up needing a new extent state we allocate it later.
 878                 */
 879                prealloc = alloc_extent_state(mask);
 880        }
 881
 882        spin_lock(&tree->lock);
 883        if (cached_state && *cached_state) {
 884                state = *cached_state;
 885                if (state->start <= start && state->end > start &&
 886                    extent_state_in_tree(state)) {
 887                        node = &state->rb_node;
 888                        goto hit_next;
 889                }
 890        }
 891        /*
 892         * this search will find all the extents that end after
 893         * our range starts.
 894         */
 895        node = tree_search_for_insert(tree, start, &p, &parent);
 896        if (!node) {
 897                prealloc = alloc_extent_state_atomic(prealloc);
 898                BUG_ON(!prealloc);
 899                err = insert_state(tree, prealloc, start, end,
 900                                   &p, &parent, &bits, changeset);
 901                if (err)
 902                        extent_io_tree_panic(tree, err);
 903
 904                cache_state(prealloc, cached_state);
 905                prealloc = NULL;
 906                goto out;
 907        }
 908        state = rb_entry(node, struct extent_state, rb_node);
 909hit_next:
 910        last_start = state->start;
 911        last_end = state->end;
 912
 913        /*
 914         * | ---- desired range ---- |
 915         * | state |
 916         *
 917         * Just lock what we found and keep going
 918         */
 919        if (state->start == start && state->end <= end) {
 920                if (state->state & exclusive_bits) {
 921                        *failed_start = state->start;
 922                        err = -EEXIST;
 923                        goto out;
 924                }
 925
 926                set_state_bits(tree, state, &bits, changeset);
 927                cache_state(state, cached_state);
 928                merge_state(tree, state);
 929                if (last_end == (u64)-1)
 930                        goto out;
 931                start = last_end + 1;
 932                state = next_state(state);
 933                if (start < end && state && state->start == start &&
 934                    !need_resched())
 935                        goto hit_next;
 936                goto search_again;
 937        }
 938
 939        /*
 940         *     | ---- desired range ---- |
 941         * | state |
 942         *   or
 943         * | ------------- state -------------- |
 944         *
 945         * We need to split the extent we found, and may flip bits on
 946         * second half.
 947         *
 948         * If the extent we found extends past our
 949         * range, we just split and search again.  It'll get split
 950         * again the next time though.
 951         *
 952         * If the extent we found is inside our range, we set the
 953         * desired bit on it.
 954         */
 955        if (state->start < start) {
 956                if (state->state & exclusive_bits) {
 957                        *failed_start = start;
 958                        err = -EEXIST;
 959                        goto out;
 960                }
 961
 962                prealloc = alloc_extent_state_atomic(prealloc);
 963                BUG_ON(!prealloc);
 964                err = split_state(tree, state, prealloc, start);
 965                if (err)
 966                        extent_io_tree_panic(tree, err);
 967
 968                prealloc = NULL;
 969                if (err)
 970                        goto out;
 971                if (state->end <= end) {
 972                        set_state_bits(tree, state, &bits, changeset);
 973                        cache_state(state, cached_state);
 974                        merge_state(tree, state);
 975                        if (last_end == (u64)-1)
 976                                goto out;
 977                        start = last_end + 1;
 978                        state = next_state(state);
 979                        if (start < end && state && state->start == start &&
 980                            !need_resched())
 981                                goto hit_next;
 982                }
 983                goto search_again;
 984        }
 985        /*
 986         * | ---- desired range ---- |
 987         *     | state | or               | state |
 988         *
 989         * There's a hole, we need to insert something in it and
 990         * ignore the extent we found.
 991         */
 992        if (state->start > start) {
 993                u64 this_end;
 994                if (end < last_start)
 995                        this_end = end;
 996                else
 997                        this_end = last_start - 1;
 998
 999                prealloc = alloc_extent_state_atomic(prealloc);
1000                BUG_ON(!prealloc);
1001
1002                /*
 1003                 * Avoid freeing 'prealloc' if it can be merged with
 1004                 * the next extent.
1005                 */
1006                err = insert_state(tree, prealloc, start, this_end,
1007                                   NULL, NULL, &bits, changeset);
1008                if (err)
1009                        extent_io_tree_panic(tree, err);
1010
1011                cache_state(prealloc, cached_state);
1012                prealloc = NULL;
1013                start = this_end + 1;
1014                goto search_again;
1015        }
1016        /*
1017         * | ---- desired range ---- |
1018         *                        | state |
1019         * We need to split the extent, and set the bit
1020         * on the first half
1021         */
1022        if (state->start <= end && state->end > end) {
1023                if (state->state & exclusive_bits) {
1024                        *failed_start = start;
1025                        err = -EEXIST;
1026                        goto out;
1027                }
1028
1029                prealloc = alloc_extent_state_atomic(prealloc);
1030                BUG_ON(!prealloc);
1031                err = split_state(tree, state, prealloc, end + 1);
1032                if (err)
1033                        extent_io_tree_panic(tree, err);
1034
1035                set_state_bits(tree, prealloc, &bits, changeset);
1036                cache_state(prealloc, cached_state);
1037                merge_state(tree, prealloc);
1038                prealloc = NULL;
1039                goto out;
1040        }
1041
1042search_again:
1043        if (start > end)
1044                goto out;
1045        spin_unlock(&tree->lock);
1046        if (gfpflags_allow_blocking(mask))
1047                cond_resched();
1048        goto again;
1049
1050out:
1051        spin_unlock(&tree->lock);
1052        if (prealloc)
1053                free_extent_state(prealloc);
1054
1055        return err;
1056
1057}
1058
1059int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1060                   unsigned bits, u64 * failed_start,
1061                   struct extent_state **cached_state, gfp_t mask)
1062{
1063        return __set_extent_bit(tree, start, end, bits, 0, failed_start,
1064                                cached_state, mask, NULL);
1065}
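/*
 * Minimal usage sketch (the tree and range are hypothetical): mark a byte
 * range dirty in the io tree.  failed_start may be NULL here because this
 * wrapper never requests exclusive bits:
 *
 *	set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, NULL, GFP_NOFS);
 */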
1066
1067
1068/**
1069 * convert_extent_bit - convert all bits in a given range from one bit to
1070 *                      another
1071 * @tree:       the io tree to search
1072 * @start:      the start offset in bytes
1073 * @end:        the end offset in bytes (inclusive)
1074 * @bits:       the bits to set in this range
1075 * @clear_bits: the bits to clear in this range
1076 * @cached_state:       state that we're going to cache
1077 *
1078 * This will go through and set bits for the given range.  If any states exist
1079 * already in this range they are set with the given bit and cleared of the
1080 * clear_bits.  This is only meant to be used by things that are mergeable, ie
1081 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
1082 * boundary bits like LOCK.
1083 *
1084 * All allocations are done with GFP_NOFS.
1085 */
1086int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1087                       unsigned bits, unsigned clear_bits,
1088                       struct extent_state **cached_state)
1089{
1090        struct extent_state *state;
1091        struct extent_state *prealloc = NULL;
1092        struct rb_node *node;
1093        struct rb_node **p;
1094        struct rb_node *parent;
1095        int err = 0;
1096        u64 last_start;
1097        u64 last_end;
1098        bool first_iteration = true;
1099
1100        btrfs_debug_check_extent_io_range(tree, start, end);
1101
1102again:
1103        if (!prealloc) {
1104                /*
1105                 * Best effort, don't worry if extent state allocation fails
1106                 * here for the first iteration. We might have a cached state
1107                 * that matches exactly the target range, in which case no
1108                 * extent state allocations are needed. We'll only know this
1109                 * after locking the tree.
1110                 */
1111                prealloc = alloc_extent_state(GFP_NOFS);
1112                if (!prealloc && !first_iteration)
1113                        return -ENOMEM;
1114        }
1115
1116        spin_lock(&tree->lock);
1117        if (cached_state && *cached_state) {
1118                state = *cached_state;
1119                if (state->start <= start && state->end > start &&
1120                    extent_state_in_tree(state)) {
1121                        node = &state->rb_node;
1122                        goto hit_next;
1123                }
1124        }
1125
1126        /*
1127         * this search will find all the extents that end after
1128         * our range starts.
1129         */
1130        node = tree_search_for_insert(tree, start, &p, &parent);
1131        if (!node) {
1132                prealloc = alloc_extent_state_atomic(prealloc);
1133                if (!prealloc) {
1134                        err = -ENOMEM;
1135                        goto out;
1136                }
1137                err = insert_state(tree, prealloc, start, end,
1138                                   &p, &parent, &bits, NULL);
1139                if (err)
1140                        extent_io_tree_panic(tree, err);
1141                cache_state(prealloc, cached_state);
1142                prealloc = NULL;
1143                goto out;
1144        }
1145        state = rb_entry(node, struct extent_state, rb_node);
1146hit_next:
1147        last_start = state->start;
1148        last_end = state->end;
1149
1150        /*
1151         * | ---- desired range ---- |
1152         * | state |
1153         *
1154         * Just lock what we found and keep going
1155         */
1156        if (state->start == start && state->end <= end) {
1157                set_state_bits(tree, state, &bits, NULL);
1158                cache_state(state, cached_state);
1159                state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
1160                if (last_end == (u64)-1)
1161                        goto out;
1162                start = last_end + 1;
1163                if (start < end && state && state->start == start &&
1164                    !need_resched())
1165                        goto hit_next;
1166                goto search_again;
1167        }
1168
1169        /*
1170         *     | ---- desired range ---- |
1171         * | state |
1172         *   or
1173         * | ------------- state -------------- |
1174         *
1175         * We need to split the extent we found, and may flip bits on
1176         * second half.
1177         *
1178         * If the extent we found extends past our
1179         * range, we just split and search again.  It'll get split
1180         * again the next time though.
1181         *
1182         * If the extent we found is inside our range, we set the
1183         * desired bit on it.
1184         */
1185        if (state->start < start) {
1186                prealloc = alloc_extent_state_atomic(prealloc);
1187                if (!prealloc) {
1188                        err = -ENOMEM;
1189                        goto out;
1190                }
1191                err = split_state(tree, state, prealloc, start);
1192                if (err)
1193                        extent_io_tree_panic(tree, err);
1194                prealloc = NULL;
1195                if (err)
1196                        goto out;
1197                if (state->end <= end) {
1198                        set_state_bits(tree, state, &bits, NULL);
1199                        cache_state(state, cached_state);
1200                        state = clear_state_bit(tree, state, &clear_bits, 0,
1201                                                NULL);
1202                        if (last_end == (u64)-1)
1203                                goto out;
1204                        start = last_end + 1;
1205                        if (start < end && state && state->start == start &&
1206                            !need_resched())
1207                                goto hit_next;
1208                }
1209                goto search_again;
1210        }
1211        /*
1212         * | ---- desired range ---- |
1213         *     | state | or               | state |
1214         *
1215         * There's a hole, we need to insert something in it and
1216         * ignore the extent we found.
1217         */
1218        if (state->start > start) {
1219                u64 this_end;
1220                if (end < last_start)
1221                        this_end = end;
1222                else
1223                        this_end = last_start - 1;
1224
1225                prealloc = alloc_extent_state_atomic(prealloc);
1226                if (!prealloc) {
1227                        err = -ENOMEM;
1228                        goto out;
1229                }
1230
1231                /*
 1232                 * Avoid freeing 'prealloc' if it can be merged with
 1233                 * the next extent.
1234                 */
1235                err = insert_state(tree, prealloc, start, this_end,
1236                                   NULL, NULL, &bits, NULL);
1237                if (err)
1238                        extent_io_tree_panic(tree, err);
1239                cache_state(prealloc, cached_state);
1240                prealloc = NULL;
1241                start = this_end + 1;
1242                goto search_again;
1243        }
1244        /*
1245         * | ---- desired range ---- |
1246         *                        | state |
1247         * We need to split the extent, and set the bit
1248         * on the first half
1249         */
1250        if (state->start <= end && state->end > end) {
1251                prealloc = alloc_extent_state_atomic(prealloc);
1252                if (!prealloc) {
1253                        err = -ENOMEM;
1254                        goto out;
1255                }
1256
1257                err = split_state(tree, state, prealloc, end + 1);
1258                if (err)
1259                        extent_io_tree_panic(tree, err);
1260
1261                set_state_bits(tree, prealloc, &bits, NULL);
1262                cache_state(prealloc, cached_state);
1263                clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
1264                prealloc = NULL;
1265                goto out;
1266        }
1267
1268search_again:
1269        if (start > end)
1270                goto out;
1271        spin_unlock(&tree->lock);
1272        cond_resched();
1273        first_iteration = false;
1274        goto again;
1275
1276out:
1277        spin_unlock(&tree->lock);
1278        if (prealloc)
1279                free_extent_state(prealloc);
1280
1281        return err;
1282}
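/*
 * Minimal usage sketch (the tree and range are hypothetical), following the
 * DELALLOC-to-DIRTY case mentioned above: set EXTENT_DIRTY and clear
 * EXTENT_DELALLOC in one pass, handing back a cached state for the caller
 * to drop:
 *
 *	struct extent_state *cached = NULL;
 *
 *	convert_extent_bit(tree, start, end, EXTENT_DIRTY,
 *			   EXTENT_DELALLOC, &cached);
 *	free_extent_state(cached);
 */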
1283
1284/* wrappers around set/clear extent bit */
1285int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1286                           unsigned bits, struct extent_changeset *changeset)
1287{
1288        /*
1289         * We don't support EXTENT_LOCKED yet, as current changeset will
1290         * record any bits changed, so for EXTENT_LOCKED case, it will
1291         * either fail with -EEXIST or changeset will record the whole
1292         * range.
1293         */
1294        BUG_ON(bits & EXTENT_LOCKED);
1295
1296        return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
1297                                changeset);
1298}
1299
1300int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1301                     unsigned bits, int wake, int delete,
1302                     struct extent_state **cached)
1303{
1304        return __clear_extent_bit(tree, start, end, bits, wake, delete,
1305                                  cached, GFP_NOFS, NULL);
1306}
1307
1308int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1309                unsigned bits, struct extent_changeset *changeset)
1310{
1311        /*
1312         * Don't support EXTENT_LOCKED case, same reason as
1313         * set_record_extent_bits().
1314         */
1315        BUG_ON(bits & EXTENT_LOCKED);
1316
1317        return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
1318                                  changeset);
1319}
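/*
 * Changeset sketch (the tree and range are hypothetical; assumes the
 * extent_changeset_init()/extent_changeset_release() helpers declared in
 * extent_io.h): record exactly which bytes actually changed so the caller
 * can account them afterwards.
 *
 *	struct extent_changeset changeset;
 *
 *	extent_changeset_init(&changeset);
 *	set_record_extent_bits(tree, start, end, EXTENT_DELALLOC, &changeset);
 *	...changeset.bytes_changed counts only bytes that gained the bit...
 *	extent_changeset_release(&changeset);
 */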
1320
1321/*
 1322 * Either insert or lock the state struct between start and end.  Sleeps and
 1323 * retries until the whole range has EXTENT_LOCKED set for the caller.
1324 */
1325int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1326                     struct extent_state **cached_state)
1327{
1328        int err;
1329        u64 failed_start;
1330
1331        while (1) {
1332                err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
1333                                       EXTENT_LOCKED, &failed_start,
1334                                       cached_state, GFP_NOFS, NULL);
1335                if (err == -EEXIST) {
1336                        wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1337                        start = failed_start;
1338                } else
1339                        break;
1340                WARN_ON(start > end);
1341        }
1342        return err;
1343}
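/*
 * Lock/unlock sketch (the tree and range are hypothetical): lock the byte
 * range, do the work, then drop EXTENT_LOCKED again.  Callers normally go
 * through the unlock_extent*() helpers in extent_io.h, which boil down to
 * the clear_extent_bit() call shown here:
 *
 *	struct extent_state *cached = NULL;
 *
 *	lock_extent_bits(tree, start, end, &cached);
 *	...operate on the locked range...
 *	clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, &cached);
 */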
1344
1345int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1346{
1347        int err;
1348        u64 failed_start;
1349
1350        err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1351                               &failed_start, NULL, GFP_NOFS, NULL);
1352        if (err == -EEXIST) {
1353                if (failed_start > start)
1354                        clear_extent_bit(tree, start, failed_start - 1,
1355                                         EXTENT_LOCKED, 1, 0, NULL);
1356                return 0;
1357        }
1358        return 1;
1359}
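/*
 * Non-blocking sketch (the tree and range are hypothetical): bail out
 * instead of sleeping when part of the range is already locked.  On success
 * the caller owns EXTENT_LOCKED on the whole range and must clear it:
 *
 *	if (!try_lock_extent(tree, start, end))
 *		return -EAGAIN;
 *	...short, non-blocking work on the range...
 *	clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL);
 */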
1360
1361void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
1362{
1363        unsigned long index = start >> PAGE_SHIFT;
1364        unsigned long end_index = end >> PAGE_SHIFT;
1365        struct page *page;
1366
1367        while (index <= end_index) {
1368                page = find_get_page(inode->i_mapping, index);
1369                BUG_ON(!page); /* Pages should be in the extent_io_tree */
1370                clear_page_dirty_for_io(page);
1371                put_page(page);
1372                index++;
1373        }
1374}
1375
1376void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1377{
1378        unsigned long index = start >> PAGE_SHIFT;
1379        unsigned long end_index = end >> PAGE_SHIFT;
1380        struct page *page;
1381
1382        while (index <= end_index) {
1383                page = find_get_page(inode->i_mapping, index);
1384                BUG_ON(!page); /* Pages should be in the extent_io_tree */
1385                __set_page_dirty_nobuffers(page);
1386                account_page_redirty(page);
1387                put_page(page);
1388                index++;
1389        }
1390}
1391
1392/*
1393 * helper function to set both pages and extents in the tree writeback
1394 */
1395static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1396{
1397        tree->ops->set_range_writeback(tree->private_data, start, end);
1398}
1399
1400/* find the first state struct with 'bits' set after 'start', and
 1401 * return it.  tree->lock must be held.  NULL will be returned if
1402 * nothing was found after 'start'
1403 */
1404static struct extent_state *
1405find_first_extent_bit_state(struct extent_io_tree *tree,
1406                            u64 start, unsigned bits)
1407{
1408        struct rb_node *node;
1409        struct extent_state *state;
1410
1411        /*
1412         * this search will find all the extents that end after
1413         * our range starts.
1414         */
1415        node = tree_search(tree, start);
1416        if (!node)
1417                goto out;
1418
1419        while (1) {
1420                state = rb_entry(node, struct extent_state, rb_node);
1421                if (state->end >= start && (state->state & bits))
1422                        return state;
1423
1424                node = rb_next(node);
1425                if (!node)
1426                        break;
1427        }
1428out:
1429        return NULL;
1430}
1431
1432/*
1433 * find the first offset in the io tree with 'bits' set. zero is
1434 * returned if we find something, and *start_ret and *end_ret are
1435 * set to reflect the state struct that was found.
1436 *
 1437 * If nothing was found, 1 is returned.
1438 */
1439int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1440                          u64 *start_ret, u64 *end_ret, unsigned bits,
1441                          struct extent_state **cached_state)
1442{
1443        struct extent_state *state;
1444        struct rb_node *n;
1445        int ret = 1;
1446
1447        spin_lock(&tree->lock);
1448        if (cached_state && *cached_state) {
1449                state = *cached_state;
1450                if (state->end == start - 1 && extent_state_in_tree(state)) {
1451                        n = rb_next(&state->rb_node);
1452                        while (n) {
1453                                state = rb_entry(n, struct extent_state,
1454                                                 rb_node);
1455                                if (state->state & bits)
1456                                        goto got_it;
1457                                n = rb_next(n);
1458                        }
1459                        free_extent_state(*cached_state);
1460                        *cached_state = NULL;
1461                        goto out;
1462                }
1463                free_extent_state(*cached_state);
1464                *cached_state = NULL;
1465        }
1466
1467        state = find_first_extent_bit_state(tree, start, bits);
1468got_it:
1469        if (state) {
1470                cache_state_if_flags(state, cached_state, 0);
1471                *start_ret = state->start;
1472                *end_ret = state->end;
1473                ret = 0;
1474        }
1475out:
1476        spin_unlock(&tree->lock);
1477        return ret;
1478}
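/*
 * Iteration sketch (the tree contents are hypothetical): walk every range
 * with EXTENT_DIRTY set, restarting each search just past the previous hit:
 *
 *	u64 found_start, found_end;
 *	u64 cur = 0;
 *
 *	while (!find_first_extent_bit(tree, cur, &found_start, &found_end,
 *				      EXTENT_DIRTY, NULL)) {
 *		...handle [found_start, found_end]...
 *		cur = found_end + 1;
 *	}
 */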
1479
1480/*
1481 * find a contiguous range of bytes in the file marked as delalloc, not
1482 * more than 'max_bytes'.  start and end are used to return the range,
1483 *
1484 * 1 is returned if we find something, 0 if nothing was in the tree
1485 */
1486static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1487                                        u64 *start, u64 *end, u64 max_bytes,
1488                                        struct extent_state **cached_state)
1489{
1490        struct rb_node *node;
1491        struct extent_state *state;
1492        u64 cur_start = *start;
1493        u64 found = 0;
1494        u64 total_bytes = 0;
1495
1496        spin_lock(&tree->lock);
1497
1498        /*
1499         * this search will find all the extents that end after
1500         * our range starts.
1501         */
1502        node = tree_search(tree, cur_start);
1503        if (!node) {
1504                if (!found)
1505                        *end = (u64)-1;
1506                goto out;
1507        }
1508
1509        while (1) {
1510                state = rb_entry(node, struct extent_state, rb_node);
1511                if (found && (state->start != cur_start ||
1512                              (state->state & EXTENT_BOUNDARY))) {
1513                        goto out;
1514                }
1515                if (!(state->state & EXTENT_DELALLOC)) {
1516                        if (!found)
1517                                *end = state->end;
1518                        goto out;
1519                }
1520                if (!found) {
1521                        *start = state->start;
1522                        *cached_state = state;
1523                        refcount_inc(&state->refs);
1524                }
1525                found++;
1526                *end = state->end;
1527                cur_start = state->end + 1;
1528                node = rb_next(node);
1529                total_bytes += state->end - state->start + 1;
1530                if (total_bytes >= max_bytes)
1531                        break;
1532                if (!node)
1533                        break;
1534        }
1535out:
1536        spin_unlock(&tree->lock);
1537        return found;
1538}
1539
1540static int __process_pages_contig(struct address_space *mapping,
1541                                  struct page *locked_page,
1542                                  pgoff_t start_index, pgoff_t end_index,
1543                                  unsigned long page_ops, pgoff_t *index_ret);
1544
1545static noinline void __unlock_for_delalloc(struct inode *inode,
1546                                           struct page *locked_page,
1547                                           u64 start, u64 end)
1548{
1549        unsigned long index = start >> PAGE_SHIFT;
1550        unsigned long end_index = end >> PAGE_SHIFT;
1551
1552        ASSERT(locked_page);
1553        if (index == locked_page->index && end_index == index)
1554                return;
1555
1556        __process_pages_contig(inode->i_mapping, locked_page, index, end_index,
1557                               PAGE_UNLOCK, NULL);
1558}
1559
1560static noinline int lock_delalloc_pages(struct inode *inode,
1561                                        struct page *locked_page,
1562                                        u64 delalloc_start,
1563                                        u64 delalloc_end)
1564{
1565        unsigned long index = delalloc_start >> PAGE_SHIFT;
1566        unsigned long index_ret = index;
1567        unsigned long end_index = delalloc_end >> PAGE_SHIFT;
1568        int ret;
1569
1570        ASSERT(locked_page);
1571        if (index == locked_page->index && index == end_index)
1572                return 0;
1573
1574        ret = __process_pages_contig(inode->i_mapping, locked_page, index,
1575                                     end_index, PAGE_LOCK, &index_ret);
1576        if (ret == -EAGAIN)
1577                __unlock_for_delalloc(inode, locked_page, delalloc_start,
1578                                      (u64)index_ret << PAGE_SHIFT);
1579        return ret;
1580}
1581
1582/*
1583 * find and lock a contiguous range of bytes in the file marked as delalloc,
1584 * not more than 'max_bytes'.  'start' and 'end' are used to return the range.
1585 *
1586 * The number of delalloc extents found is returned, or 0 if nothing was in the tree.
1587 */
1588STATIC u64 find_lock_delalloc_range(struct inode *inode,
1589                                    struct extent_io_tree *tree,
1590                                    struct page *locked_page, u64 *start,
1591                                    u64 *end, u64 max_bytes)
1592{
1593        u64 delalloc_start;
1594        u64 delalloc_end;
1595        u64 found;
1596        struct extent_state *cached_state = NULL;
1597        int ret;
1598        int loops = 0;
1599
1600again:
1601        /* step one, find a bunch of delalloc bytes starting at start */
1602        delalloc_start = *start;
1603        delalloc_end = 0;
1604        found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1605                                    max_bytes, &cached_state);
1606        if (!found || delalloc_end <= *start) {
1607                *start = delalloc_start;
1608                *end = delalloc_end;
1609                free_extent_state(cached_state);
1610                return 0;
1611        }
1612
1613        /*
1614         * start comes from the offset of locked_page.  We have to lock
1615         * pages in order, so we can't process delalloc bytes before
1616         * locked_page
1617         */
1618        if (delalloc_start < *start)
1619                delalloc_start = *start;
1620
1621        /*
1622         * make sure to limit the number of pages we try to lock down
1623         */
1624        if (delalloc_end + 1 - delalloc_start > max_bytes)
1625                delalloc_end = delalloc_start + max_bytes - 1;
1626
1627        /* step two, lock all the pages after the page that has start */
1628        ret = lock_delalloc_pages(inode, locked_page,
1629                                  delalloc_start, delalloc_end);
1630        if (ret == -EAGAIN) {
1631                /* some of the pages are gone, lets avoid looping by
1632                 * shortening the size of the delalloc range we're searching
1633                 */
1634                free_extent_state(cached_state);
1635                cached_state = NULL;
1636                if (!loops) {
1637                        max_bytes = PAGE_SIZE;
1638                        loops = 1;
1639                        goto again;
1640                } else {
1641                        found = 0;
1642                        goto out_failed;
1643                }
1644        }
1645        BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
1646
1647        /* step three, lock the state bits for the whole range */
1648        lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
1649
1650        /* then test to make sure it is all still delalloc */
1651        ret = test_range_bit(tree, delalloc_start, delalloc_end,
1652                             EXTENT_DELALLOC, 1, cached_state);
1653        if (!ret) {
1654                unlock_extent_cached(tree, delalloc_start, delalloc_end,
1655                                     &cached_state);
1656                __unlock_for_delalloc(inode, locked_page,
1657                              delalloc_start, delalloc_end);
1658                cond_resched();
1659                goto again;
1660        }
1661        free_extent_state(cached_state);
1662        *start = delalloc_start;
1663        *end = delalloc_end;
1664out_failed:
1665        return found;
1666}
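/*
 * Illustrative sketch, not part of the original file: this mirrors how a
 * writepage style caller drives the helper above, asking for the next
 * locked delalloc range at or after the page and continuing from just past
 * whatever was returned.  The function name and the processing step are
 * assumptions for the example.
 */
static void example_walk_delalloc(struct inode *inode, struct page *page,
                                  u64 page_start, u64 page_end)
{
        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
        u64 start = page_start;
        u64 end = 0;
        u64 found;

        while (end < page_end) {
                found = find_lock_delalloc_range(inode, tree, page,
                                                 &start, &end,
                                                 BTRFS_MAX_EXTENT_SIZE);
                if (!found) {
                        /* no delalloc here, skip past the blocking range */
                        start = end + 1;
                        continue;
                }
                /* run delalloc for [start, end] here, then move on */
                start = end + 1;
        }
}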
1667
1668static int __process_pages_contig(struct address_space *mapping,
1669                                  struct page *locked_page,
1670                                  pgoff_t start_index, pgoff_t end_index,
1671                                  unsigned long page_ops, pgoff_t *index_ret)
1672{
1673        unsigned long nr_pages = end_index - start_index + 1;
1674        unsigned long pages_locked = 0;
1675        pgoff_t index = start_index;
1676        struct page *pages[16];
1677        unsigned ret;
1678        int err = 0;
1679        int i;
1680
1681        if (page_ops & PAGE_LOCK) {
1682                ASSERT(page_ops == PAGE_LOCK);
1683                ASSERT(index_ret && *index_ret == start_index);
1684        }
1685
1686        if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
1687                mapping_set_error(mapping, -EIO);
1688
1689        while (nr_pages > 0) {
1690                ret = find_get_pages_contig(mapping, index,
1691                                     min_t(unsigned long,
1692                                     nr_pages, ARRAY_SIZE(pages)), pages);
1693                if (ret == 0) {
1694                        /*
1695                         * Only if we're going to lock these pages,
1696                         * can we find nothing at @index.
1697                         */
1698                        ASSERT(page_ops & PAGE_LOCK);
1699                        err = -EAGAIN;
1700                        goto out;
1701                }
1702
1703                for (i = 0; i < ret; i++) {
1704                        if (page_ops & PAGE_SET_PRIVATE2)
1705                                SetPagePrivate2(pages[i]);
1706
1707                        if (pages[i] == locked_page) {
1708                                put_page(pages[i]);
1709                                pages_locked++;
1710                                continue;
1711                        }
1712                        if (page_ops & PAGE_CLEAR_DIRTY)
1713                                clear_page_dirty_for_io(pages[i]);
1714                        if (page_ops & PAGE_SET_WRITEBACK)
1715                                set_page_writeback(pages[i]);
1716                        if (page_ops & PAGE_SET_ERROR)
1717                                SetPageError(pages[i]);
1718                        if (page_ops & PAGE_END_WRITEBACK)
1719                                end_page_writeback(pages[i]);
1720                        if (page_ops & PAGE_UNLOCK)
1721                                unlock_page(pages[i]);
1722                        if (page_ops & PAGE_LOCK) {
1723                                lock_page(pages[i]);
1724                                if (!PageDirty(pages[i]) ||
1725                                    pages[i]->mapping != mapping) {
1726                                        unlock_page(pages[i]);
1727                                        put_page(pages[i]);
1728                                        err = -EAGAIN;
1729                                        goto out;
1730                                }
1731                        }
1732                        put_page(pages[i]);
1733                        pages_locked++;
1734                }
1735                nr_pages -= ret;
1736                index += ret;
1737                cond_resched();
1738        }
1739out:
1740        if (err && index_ret)
1741                *index_ret = start_index + pages_locked - 1;
1742        return err;
1743}
1744
1745void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
1746                                 u64 delalloc_end, struct page *locked_page,
1747                                 unsigned clear_bits,
1748                                 unsigned long page_ops)
1749{
1750        clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
1751                         NULL);
1752
1753        __process_pages_contig(inode->i_mapping, locked_page,
1754                               start >> PAGE_SHIFT, end >> PAGE_SHIFT,
1755                               page_ops, NULL);
1756}
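/*
 * Illustrative sketch, not part of the original file: an error path in a
 * delalloc writer would typically clear its accounting bits and release
 * every page in one call.  The exact clear_bits and page_ops combination
 * below is an assumption for the example.
 */
static void example_fail_delalloc_range(struct inode *inode, u64 start,
                                        u64 end, struct page *locked_page)
{
        extent_clear_unlock_delalloc(inode, start, end, end, locked_page,
                                     EXTENT_LOCKED | EXTENT_DELALLOC,
                                     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
                                     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
                                     PAGE_SET_ERROR);
}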
1757
1758/*
1759 * count the number of bytes in the tree that have the given bit(s)
1760 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1761 * cached.  The total number of bytes found is returned.
1762 */
1763u64 count_range_bits(struct extent_io_tree *tree,
1764                     u64 *start, u64 search_end, u64 max_bytes,
1765                     unsigned bits, int contig)
1766{
1767        struct rb_node *node;
1768        struct extent_state *state;
1769        u64 cur_start = *start;
1770        u64 total_bytes = 0;
1771        u64 last = 0;
1772        int found = 0;
1773
1774        if (WARN_ON(search_end <= cur_start))
1775                return 0;
1776
1777        spin_lock(&tree->lock);
1778        if (cur_start == 0 && bits == EXTENT_DIRTY) {
1779                total_bytes = tree->dirty_bytes;
1780                goto out;
1781        }
1782        /*
1783         * this search will find all the extents that end after
1784         * our range starts.
1785         */
1786        node = tree_search(tree, cur_start);
1787        if (!node)
1788                goto out;
1789
1790        while (1) {
1791                state = rb_entry(node, struct extent_state, rb_node);
1792                if (state->start > search_end)
1793                        break;
1794                if (contig && found && state->start > last + 1)
1795                        break;
1796                if (state->end >= cur_start && (state->state & bits) == bits) {
1797                        total_bytes += min(search_end, state->end) + 1 -
1798                                       max(cur_start, state->start);
1799                        if (total_bytes >= max_bytes)
1800                                break;
1801                        if (!found) {
1802                                *start = max(cur_start, state->start);
1803                                found = 1;
1804                        }
1805                        last = state->end;
1806                } else if (contig && found) {
1807                        break;
1808                }
1809                node = rb_next(node);
1810                if (!node)
1811                        break;
1812        }
1813out:
1814        spin_unlock(&tree->lock);
1815        return total_bytes;
1816}
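/*
 * Illustrative sketch, not part of the original file: count how many bytes
 * in the first megabyte of the tree carry EXTENT_DELALLOC, stopping early
 * once 64K worth has been found.  The constants and the function name are
 * arbitrary choices for the example.
 */
static u64 example_count_delalloc_bytes(struct extent_io_tree *tree)
{
        u64 start = 0;

        /* on return 'start' holds the offset of the first matching byte */
        return count_range_bits(tree, &start, SZ_1M - 1, SZ_64K,
                                EXTENT_DELALLOC, 0);
}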
1817
1818/*
1819 * set the failure record for a given byte offset in the tree.  If there isn't
1820 * an extent_state starting exactly at that offset, -ENOENT is returned.
1821 */
1822static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start,
1823                struct io_failure_record *failrec)
1824{
1825        struct rb_node *node;
1826        struct extent_state *state;
1827        int ret = 0;
1828
1829        spin_lock(&tree->lock);
1830        /*
1831         * this search will find all the extents that end after
1832         * our range starts.
1833         */
1834        node = tree_search(tree, start);
1835        if (!node) {
1836                ret = -ENOENT;
1837                goto out;
1838        }
1839        state = rb_entry(node, struct extent_state, rb_node);
1840        if (state->start != start) {
1841                ret = -ENOENT;
1842                goto out;
1843        }
1844        state->failrec = failrec;
1845out:
1846        spin_unlock(&tree->lock);
1847        return ret;
1848}
1849
1850static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start,
1851                struct io_failure_record **failrec)
1852{
1853        struct rb_node *node;
1854        struct extent_state *state;
1855        int ret = 0;
1856
1857        spin_lock(&tree->lock);
1858        /*
1859         * this search will find all the extents that end after
1860         * our range starts.
1861         */
1862        node = tree_search(tree, start);
1863        if (!node) {
1864                ret = -ENOENT;
1865                goto out;
1866        }
1867        state = rb_entry(node, struct extent_state, rb_node);
1868        if (state->start != start) {
1869                ret = -ENOENT;
1870                goto out;
1871        }
1872        *failrec = state->failrec;
1873out:
1874        spin_unlock(&tree->lock);
1875        return ret;
1876}
1877
1878/*
1879 * searches a range in the state tree for a given mask.
1880 * If 'filled' == 1, this returns 1 only if every byte in the range
1881 * has the bits set.  Otherwise, 1 is returned if any bit in the
1882 * range is found set.
1883 */
1884int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1885                   unsigned bits, int filled, struct extent_state *cached)
1886{
1887        struct extent_state *state = NULL;
1888        struct rb_node *node;
1889        int bitset = 0;
1890
1891        spin_lock(&tree->lock);
1892        if (cached && extent_state_in_tree(cached) && cached->start <= start &&
1893            cached->end > start)
1894                node = &cached->rb_node;
1895        else
1896                node = tree_search(tree, start);
1897        while (node && start <= end) {
1898                state = rb_entry(node, struct extent_state, rb_node);
1899
1900                if (filled && state->start > start) {
1901                        bitset = 0;
1902                        break;
1903                }
1904
1905                if (state->start > end)
1906                        break;
1907
1908                if (state->state & bits) {
1909                        bitset = 1;
1910                        if (!filled)
1911                                break;
1912                } else if (filled) {
1913                        bitset = 0;
1914                        break;
1915                }
1916
1917                if (state->end == (u64)-1)
1918                        break;
1919
1920                start = state->end + 1;
1921                if (start > end)
1922                        break;
1923                node = rb_next(node);
1924                if (!node) {
1925                        if (filled)
1926                                bitset = 0;
1927                        break;
1928                }
1929        }
1930        spin_unlock(&tree->lock);
1931        return bitset;
1932}
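/*
 * Illustrative sketch, not part of the original file: the same range
 * queried with filled == 1 ("is every byte delalloc?") versus filled == 0
 * ("is any byte delalloc?").  The helper name is an assumption for the
 * example.
 */
static void example_test_range_bit(struct extent_io_tree *tree, u64 start,
                                   u64 end)
{
        int all_set = test_range_bit(tree, start, end, EXTENT_DELALLOC, 1, NULL);
        int any_set = test_range_bit(tree, start, end, EXTENT_DELALLOC, 0, NULL);

        /* the filled == 1 query can only succeed if the filled == 0 one does */
        WARN_ON(all_set && !any_set);
}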
1933
1934/*
1935 * helper function to set a given page up to date if all the
1936 * extents in the tree for that page are up to date
1937 */
1938static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
1939{
1940        u64 start = page_offset(page);
1941        u64 end = start + PAGE_SIZE - 1;
1942        if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1943                SetPageUptodate(page);
1944}
1945
1946int free_io_failure(struct extent_io_tree *failure_tree,
1947                    struct extent_io_tree *io_tree,
1948                    struct io_failure_record *rec)
1949{
1950        int ret;
1951        int err = 0;
1952
1953        set_state_failrec(failure_tree, rec->start, NULL);
1954        ret = clear_extent_bits(failure_tree, rec->start,
1955                                rec->start + rec->len - 1,
1956                                EXTENT_LOCKED | EXTENT_DIRTY);
1957        if (ret)
1958                err = ret;
1959
1960        ret = clear_extent_bits(io_tree, rec->start,
1961                                rec->start + rec->len - 1,
1962                                EXTENT_DAMAGED);
1963        if (ret && !err)
1964                err = ret;
1965
1966        kfree(rec);
1967        return err;
1968}
1969
1970/*
1971 * this bypasses the standard btrfs submit functions deliberately, as
1972 * the standard behavior is to write all copies in a raid setup. here we only
1973 * want to write the one bad copy. so we do the mapping for ourselves and issue
1974 * submit_bio directly.
1975 * to avoid any synchronization issues, wait for the data after writing, which
1976 * actually prevents the read that triggered the error from finishing.
1977 * currently, there can be no more than two copies of every data bit. thus,
1978 * exactly one rewrite is required.
1979 */
1980int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
1981                      u64 length, u64 logical, struct page *page,
1982                      unsigned int pg_offset, int mirror_num)
1983{
1984        struct bio *bio;
1985        struct btrfs_device *dev;
1986        u64 map_length = 0;
1987        u64 sector;
1988        struct btrfs_bio *bbio = NULL;
1989        int ret;
1990
1991        ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
1992        BUG_ON(!mirror_num);
1993
1994        bio = btrfs_io_bio_alloc(1);
1995        bio->bi_iter.bi_size = 0;
1996        map_length = length;
1997
1998        /*
1999         * Avoid races with device replace and make sure our bbio has devices
2000         * associated to its stripes that don't go away while we are doing the
2001         * read repair operation.
2002         */
2003        btrfs_bio_counter_inc_blocked(fs_info);
2004        if (btrfs_is_parity_mirror(fs_info, logical, length)) {
2005                /*
2006                 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
2007                 * to update all raid stripes, but here we just want to correct
2008                 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
2009                 * stripe's dev and sector.
2010                 */
2011                ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
2012                                      &map_length, &bbio, 0);
2013                if (ret) {
2014                        btrfs_bio_counter_dec(fs_info);
2015                        bio_put(bio);
2016                        return -EIO;
2017                }
2018                ASSERT(bbio->mirror_num == 1);
2019        } else {
2020                ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
2021                                      &map_length, &bbio, mirror_num);
2022                if (ret) {
2023                        btrfs_bio_counter_dec(fs_info);
2024                        bio_put(bio);
2025                        return -EIO;
2026                }
2027                BUG_ON(mirror_num != bbio->mirror_num);
2028        }
2029
2030        sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
2031        bio->bi_iter.bi_sector = sector;
2032        dev = bbio->stripes[bbio->mirror_num - 1].dev;
2033        btrfs_put_bbio(bbio);
2034        if (!dev || !dev->bdev ||
2035            !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
2036                btrfs_bio_counter_dec(fs_info);
2037                bio_put(bio);
2038                return -EIO;
2039        }
2040        bio_set_dev(bio, dev->bdev);
2041        bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
2042        bio_add_page(bio, page, length, pg_offset);
2043
2044        if (btrfsic_submit_bio_wait(bio)) {
2045                /* try to remap that extent elsewhere? */
2046                btrfs_bio_counter_dec(fs_info);
2047                bio_put(bio);
2048                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
2049                return -EIO;
2050        }
2051
2052        btrfs_info_rl_in_rcu(fs_info,
2053                "read error corrected: ino %llu off %llu (dev %s sector %llu)",
2054                                  ino, start,
2055                                  rcu_str_deref(dev->name), sector);
2056        btrfs_bio_counter_dec(fs_info);
2057        bio_put(bio);
2058        return 0;
2059}
2060
2061int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
2062                         struct extent_buffer *eb, int mirror_num)
2063{
2064        u64 start = eb->start;
2065        unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
2066        int ret = 0;
2067
2068        if (sb_rdonly(fs_info->sb))
2069                return -EROFS;
2070
2071        for (i = 0; i < num_pages; i++) {
2072                struct page *p = eb->pages[i];
2073
2074                ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
2075                                        start - page_offset(p), mirror_num);
2076                if (ret)
2077                        break;
2078                start += PAGE_SIZE;
2079        }
2080
2081        return ret;
2082}
2083
2084/*
2085 * each time an IO finishes, we do a fast check in the IO failure tree
2086 * to see if we need to process or clean up an io_failure_record
2087 */
2088int clean_io_failure(struct btrfs_fs_info *fs_info,
2089                     struct extent_io_tree *failure_tree,
2090                     struct extent_io_tree *io_tree, u64 start,
2091                     struct page *page, u64 ino, unsigned int pg_offset)
2092{
2093        u64 private;
2094        struct io_failure_record *failrec;
2095        struct extent_state *state;
2096        int num_copies;
2097        int ret;
2098
2099        private = 0;
2100        ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
2101                               EXTENT_DIRTY, 0);
2102        if (!ret)
2103                return 0;
2104
2105        ret = get_state_failrec(failure_tree, start, &failrec);
2106        if (ret)
2107                return 0;
2108
2109        BUG_ON(!failrec->this_mirror);
2110
2111        if (failrec->in_validation) {
2112                /* there was no real error, just free the record */
2113                btrfs_debug(fs_info,
2114                        "clean_io_failure: freeing dummy error at %llu",
2115                        failrec->start);
2116                goto out;
2117        }
2118        if (sb_rdonly(fs_info->sb))
2119                goto out;
2120
2121        spin_lock(&io_tree->lock);
2122        state = find_first_extent_bit_state(io_tree,
2123                                            failrec->start,
2124                                            EXTENT_LOCKED);
2125        spin_unlock(&io_tree->lock);
2126
2127        if (state && state->start <= failrec->start &&
2128            state->end >= failrec->start + failrec->len - 1) {
2129                num_copies = btrfs_num_copies(fs_info, failrec->logical,
2130                                              failrec->len);
2131                if (num_copies > 1)  {
2132                        repair_io_failure(fs_info, ino, start, failrec->len,
2133                                          failrec->logical, page, pg_offset,
2134                                          failrec->failed_mirror);
2135                }
2136        }
2137
2138out:
2139        free_io_failure(failure_tree, io_tree, failrec);
2140
2141        return 0;
2142}
2143
2144/*
2145 * Can be called when
2146 * - holding the extent lock
2147 * - under an ordered extent
2148 * - the inode is being freed
2149 */
2150void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
2151{
2152        struct extent_io_tree *failure_tree = &inode->io_failure_tree;
2153        struct io_failure_record *failrec;
2154        struct extent_state *state, *next;
2155
2156        if (RB_EMPTY_ROOT(&failure_tree->state))
2157                return;
2158
2159        spin_lock(&failure_tree->lock);
2160        state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2161        while (state) {
2162                if (state->start > end)
2163                        break;
2164
2165                ASSERT(state->end <= end);
2166
2167                next = next_state(state);
2168
2169                failrec = state->failrec;
2170                free_extent_state(state);
2171                kfree(failrec);
2172
2173                state = next;
2174        }
2175        spin_unlock(&failure_tree->lock);
2176}
2177
2178int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
2179                struct io_failure_record **failrec_ret)
2180{
2181        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2182        struct io_failure_record *failrec;
2183        struct extent_map *em;
2184        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2185        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2186        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2187        int ret;
2188        u64 logical;
2189
2190        ret = get_state_failrec(failure_tree, start, &failrec);
2191        if (ret) {
2192                failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2193                if (!failrec)
2194                        return -ENOMEM;
2195
2196                failrec->start = start;
2197                failrec->len = end - start + 1;
2198                failrec->this_mirror = 0;
2199                failrec->bio_flags = 0;
2200                failrec->in_validation = 0;
2201
2202                read_lock(&em_tree->lock);
2203                em = lookup_extent_mapping(em_tree, start, failrec->len);
2204                if (!em) {
2205                        read_unlock(&em_tree->lock);
2206                        kfree(failrec);
2207                        return -EIO;
2208                }
2209
2210                if (em->start > start || em->start + em->len <= start) {
2211                        free_extent_map(em);
2212                        em = NULL;
2213                }
2214                read_unlock(&em_tree->lock);
2215                if (!em) {
2216                        kfree(failrec);
2217                        return -EIO;
2218                }
2219
2220                logical = start - em->start;
2221                logical = em->block_start + logical;
2222                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2223                        logical = em->block_start;
2224                        failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2225                        extent_set_compress_type(&failrec->bio_flags,
2226                                                 em->compress_type);
2227                }
2228
2229                btrfs_debug(fs_info,
2230                        "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
2231                        logical, start, failrec->len);
2232
2233                failrec->logical = logical;
2234                free_extent_map(em);
2235
2236                /* set the bits in the private failure tree */
2237                ret = set_extent_bits(failure_tree, start, end,
2238                                        EXTENT_LOCKED | EXTENT_DIRTY);
2239                if (ret >= 0)
2240                        ret = set_state_failrec(failure_tree, start, failrec);
2241                /* set the bits in the inode's tree */
2242                if (ret >= 0)
2243                        ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
2244                if (ret < 0) {
2245                        kfree(failrec);
2246                        return ret;
2247                }
2248        } else {
2249                btrfs_debug(fs_info,
2250                        "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
2251                        failrec->logical, failrec->start, failrec->len,
2252                        failrec->in_validation);
2253                /*
2254                 * when data can be on disk more than twice, add to failrec here
2255                 * (e.g. with a list for failed_mirror) to make
2256                 * clean_io_failure() clean all those errors at once.
2257                 */
2258        }
2259
2260        *failrec_ret = failrec;
2261
2262        return 0;
2263}
2264
2265bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
2266                           struct io_failure_record *failrec, int failed_mirror)
2267{
2268        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2269        int num_copies;
2270
2271        num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
2272        if (num_copies == 1) {
2273                /*
2274                 * we only have a single copy of the data, so don't bother with
2275                 * all the retry and error correction code that follows. no
2276                 * matter what the error is, it is very likely to persist.
2277                 */
2278                btrfs_debug(fs_info,
2279                        "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
2280                        num_copies, failrec->this_mirror, failed_mirror);
2281                return false;
2282        }
2283
2284        /*
2285         * there are two premises:
2286         *      a) deliver good data to the caller
2287         *      b) correct the bad sectors on disk
2288         */
2289        if (failed_bio_pages > 1) {
2290                /*
2291                 * to fulfill b), we need to know the exact failing sectors, as
2292                 * we don't want to rewrite any more than the failed ones. thus,
2293                 * we need separate read requests for the failed bio
2294                 *
2295                 * if the following BUG_ON triggers, our validation request got
2296                 * merged. we need separate requests for our algorithm to work.
2297                 */
2298                BUG_ON(failrec->in_validation);
2299                failrec->in_validation = 1;
2300                failrec->this_mirror = failed_mirror;
2301        } else {
2302                /*
2303                 * we're ready to fulfill a) and b) alongside. get a good copy
2304                 * of the failed sector and if we succeed, we have setup
2305                 * everything for repair_io_failure to do the rest for us.
2306                 */
2307                if (failrec->in_validation) {
2308                        BUG_ON(failrec->this_mirror != failed_mirror);
2309                        failrec->in_validation = 0;
2310                        failrec->this_mirror = 0;
2311                }
2312                failrec->failed_mirror = failed_mirror;
2313                failrec->this_mirror++;
2314                if (failrec->this_mirror == failed_mirror)
2315                        failrec->this_mirror++;
2316        }
2317
2318        if (failrec->this_mirror > num_copies) {
2319                btrfs_debug(fs_info,
2320                        "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
2321                        num_copies, failrec->this_mirror, failed_mirror);
2322                return false;
2323        }
2324
2325        return true;
2326}
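/*
 * Illustrative trace, not part of the original file: for a single-page read
 * with num_copies == 3, successive calls to btrfs_check_repairable() advance
 * failrec->this_mirror like this (the starting failed_mirror is arbitrary):
 *
 *   failed_mirror = 1  ->  this_mirror = 2   (mirror 1 is skipped, it just failed)
 *   failed_mirror = 2  ->  this_mirror = 3   (last remaining copy)
 *   failed_mirror = 3  ->  this_mirror = 4   ->  exceeds num_copies, return false
 */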
2327
2328
2329struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
2330                                    struct io_failure_record *failrec,
2331                                    struct page *page, int pg_offset, int icsum,
2332                                    bio_end_io_t *endio_func, void *data)
2333{
2334        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2335        struct bio *bio;
2336        struct btrfs_io_bio *btrfs_failed_bio;
2337        struct btrfs_io_bio *btrfs_bio;
2338
2339        bio = btrfs_io_bio_alloc(1);
2340        bio->bi_end_io = endio_func;
2341        bio->bi_iter.bi_sector = failrec->logical >> 9;
2342        bio_set_dev(bio, fs_info->fs_devices->latest_bdev);
2343        bio->bi_iter.bi_size = 0;
2344        bio->bi_private = data;
2345
2346        btrfs_failed_bio = btrfs_io_bio(failed_bio);
2347        if (btrfs_failed_bio->csum) {
2348                u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
2349
2350                btrfs_bio = btrfs_io_bio(bio);
2351                btrfs_bio->csum = btrfs_bio->csum_inline;
2352                icsum *= csum_size;
2353                memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
2354                       csum_size);
2355        }
2356
2357        bio_add_page(bio, page, failrec->len, pg_offset);
2358
2359        return bio;
2360}
2361
2362/*
2363 * this is a generic handler for readpage errors (default
2364 * readpage_io_failed_hook). if other copies exist, read those and write back
2365 * good data to the failed position. it does not try to remap the failed
2366 * extent elsewhere, hoping the device will be smart enough to do this as
2367 * needed
2368 */
2369
2370static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2371                              struct page *page, u64 start, u64 end,
2372                              int failed_mirror)
2373{
2374        struct io_failure_record *failrec;
2375        struct inode *inode = page->mapping->host;
2376        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2377        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2378        struct bio *bio;
2379        int read_mode = 0;
2380        blk_status_t status;
2381        int ret;
2382        unsigned failed_bio_pages = bio_pages_all(failed_bio);
2383
2384        BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2385
2386        ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
2387        if (ret)
2388                return ret;
2389
2390        if (!btrfs_check_repairable(inode, failed_bio_pages, failrec,
2391                                    failed_mirror)) {
2392                free_io_failure(failure_tree, tree, failrec);
2393                return -EIO;
2394        }
2395
2396        if (failed_bio_pages > 1)
2397                read_mode |= REQ_FAILFAST_DEV;
2398
2399        phy_offset >>= inode->i_sb->s_blocksize_bits;
2400        bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
2401                                      start - page_offset(page),
2402                                      (int)phy_offset, failed_bio->bi_end_io,
2403                                      NULL);
2404        bio_set_op_attrs(bio, REQ_OP_READ, read_mode);
2405
2406        btrfs_debug(btrfs_sb(inode->i_sb),
2407                "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d",
2408                read_mode, failrec->this_mirror, failrec->in_validation);
2409
2410        status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
2411                                         failrec->bio_flags, 0);
2412        if (status) {
2413                free_io_failure(failure_tree, tree, failrec);
2414                bio_put(bio);
2415                ret = blk_status_to_errno(status);
2416        }
2417
2418        return ret;
2419}
2420
2421/* lots and lots of room for performance fixes in the end_bio funcs */
2422
2423void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2424{
2425        int uptodate = (err == 0);
2426        struct extent_io_tree *tree;
2427        int ret = 0;
2428
2429        tree = &BTRFS_I(page->mapping->host)->io_tree;
2430
2431        if (tree->ops && tree->ops->writepage_end_io_hook)
2432                tree->ops->writepage_end_io_hook(page, start, end, NULL,
2433                                uptodate);
2434
2435        if (!uptodate) {
2436                ClearPageUptodate(page);
2437                SetPageError(page);
2438                ret = err < 0 ? err : -EIO;
2439                mapping_set_error(page->mapping, ret);
2440        }
2441}
2442
2443/*
2444 * after a writepage IO is done, we need to:
2445 * clear the uptodate bits on error
2446 * clear the writeback bits in the extent tree for this IO
2447 * end_page_writeback if the page has no more pending IO
2448 *
2449 * Scheduling is not allowed, so the extent state tree is expected
2450 * to have one and only one object corresponding to this IO.
2451 */
2452static void end_bio_extent_writepage(struct bio *bio)
2453{
2454        int error = blk_status_to_errno(bio->bi_status);
2455        struct bio_vec *bvec;
2456        u64 start;
2457        u64 end;
2458        int i;
2459
2460        ASSERT(!bio_flagged(bio, BIO_CLONED));
2461        bio_for_each_segment_all(bvec, bio, i) {
2462                struct page *page = bvec->bv_page;
2463                struct inode *inode = page->mapping->host;
2464                struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2465
2466                /* We always issue full-page writes, but if some block
2467                 * in a page fails to write, blk_update_request() will
2468                 * advance bv_offset and adjust bv_len to compensate.
2469                 * Print a warning for nonzero offsets, and an error
2470                 * if they don't add up to a full page.  */
2471                if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
2472                        if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
2473                                btrfs_err(fs_info,
2474                                   "partial page write in btrfs with offset %u and length %u",
2475                                        bvec->bv_offset, bvec->bv_len);
2476                        else
2477                                btrfs_info(fs_info,
2478                                   "incomplete page write in btrfs with offset %u and length %u",
2479                                        bvec->bv_offset, bvec->bv_len);
2480                }
2481
2482                start = page_offset(page);
2483                end = start + bvec->bv_offset + bvec->bv_len - 1;
2484
2485                end_extent_writepage(page, error, start, end);
2486                end_page_writeback(page);
2487        }
2488
2489        bio_put(bio);
2490}
2491
2492static void
2493endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
2494                              int uptodate)
2495{
2496        struct extent_state *cached = NULL;
2497        u64 end = start + len - 1;
2498
2499        if (uptodate && tree->track_uptodate)
2500                set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
2501        unlock_extent_cached_atomic(tree, start, end, &cached);
2502}
2503
2504/*
2505 * after a readpage IO is done, we need to:
2506 * clear the uptodate bits on error
2507 * set the uptodate bits if things worked
2508 * set the page up to date if all extents in the tree are uptodate
2509 * clear the lock bit in the extent tree
2510 * unlock the page if there are no other extents locked for it
2511 *
2512 * Scheduling is not allowed, so the extent state tree is expected
2513 * to have one and only one object corresponding to this IO.
2514 */
2515static void end_bio_extent_readpage(struct bio *bio)
2516{
2517        struct bio_vec *bvec;
2518        int uptodate = !bio->bi_status;
2519        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
2520        struct extent_io_tree *tree, *failure_tree;
2521        u64 offset = 0;
2522        u64 start;
2523        u64 end;
2524        u64 len;
2525        u64 extent_start = 0;
2526        u64 extent_len = 0;
2527        int mirror;
2528        int ret;
2529        int i;
2530
2531        ASSERT(!bio_flagged(bio, BIO_CLONED));
2532        bio_for_each_segment_all(bvec, bio, i) {
2533                struct page *page = bvec->bv_page;
2534                struct inode *inode = page->mapping->host;
2535                struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2536
2537                btrfs_debug(fs_info,
2538                        "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
2539                        (u64)bio->bi_iter.bi_sector, bio->bi_status,
2540                        io_bio->mirror_num);
2541                tree = &BTRFS_I(inode)->io_tree;
2542                failure_tree = &BTRFS_I(inode)->io_failure_tree;
2543
2544                /* We always issue full-page reads, but if some block
2545                 * in a page fails to read, blk_update_request() will
2546                 * advance bv_offset and adjust bv_len to compensate.
2547                 * Print a warning for nonzero offsets, and an error
2548                 * if they don't add up to a full page.  */
2549                if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
2550                        if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
2551                                btrfs_err(fs_info,
2552                                        "partial page read in btrfs with offset %u and length %u",
2553                                        bvec->bv_offset, bvec->bv_len);
2554                        else
2555                                btrfs_info(fs_info,
2556                                        "incomplete page read in btrfs with offset %u and length %u",
2557                                        bvec->bv_offset, bvec->bv_len);
2558                }
2559
2560                start = page_offset(page);
2561                end = start + bvec->bv_offset + bvec->bv_len - 1;
2562                len = bvec->bv_len;
2563
2564                mirror = io_bio->mirror_num;
2565                if (likely(uptodate && tree->ops)) {
2566                        ret = tree->ops->readpage_end_io_hook(io_bio, offset,
2567                                                              page, start, end,
2568                                                              mirror);
2569                        if (ret)
2570                                uptodate = 0;
2571                        else
2572                                clean_io_failure(BTRFS_I(inode)->root->fs_info,
2573                                                 failure_tree, tree, start,
2574                                                 page,
2575                                                 btrfs_ino(BTRFS_I(inode)), 0);
2576                }
2577
2578                if (likely(uptodate))
2579                        goto readpage_ok;
2580
2581                if (tree->ops) {
2582                        ret = tree->ops->readpage_io_failed_hook(page, mirror);
2583                        if (ret == -EAGAIN) {
2584                                /*
2585                                 * Data inode's readpage_io_failed_hook() always
2586                                 * returns -EAGAIN.
2587                                 *
2588                                 * The generic bio_readpage_error handles errors
2589                                 * the following way: If possible, new read
2590                                 * requests are created and submitted and will
2591                                 * end up in end_bio_extent_readpage as well (if
2592                                 * we're lucky, not in the !uptodate case). In
2593                                 * that case it returns 0 and we just go on with
2594                                 * the next page in our bio. If it can't handle
2595                                 * the error it will return -EIO and we remain
2596                                 * responsible for that page.
2597                                 */
2598                                ret = bio_readpage_error(bio, offset, page,
2599                                                         start, end, mirror);
2600                                if (ret == 0) {
2601                                        uptodate = !bio->bi_status;
2602                                        offset += len;
2603                                        continue;
2604                                }
2605                        }
2606
2607                        /*
2608                         * metadata's readpage_io_failed_hook() always returns
2609                         * -EIO and fixes nothing.  -EIO is also returned if
2610                         * data inode error could not be fixed.
2611                         */
2612                        ASSERT(ret == -EIO);
2613                }
2614readpage_ok:
2615                if (likely(uptodate)) {
2616                        loff_t i_size = i_size_read(inode);
2617                        pgoff_t end_index = i_size >> PAGE_SHIFT;
2618                        unsigned off;
2619
2620                        /* Zero out the end if this page straddles i_size */
2621                        off = i_size & (PAGE_SIZE-1);
2622                        if (page->index == end_index && off)
2623                                zero_user_segment(page, off, PAGE_SIZE);
2624                        SetPageUptodate(page);
2625                } else {
2626                        ClearPageUptodate(page);
2627                        SetPageError(page);
2628                }
2629                unlock_page(page);
2630                offset += len;
2631
2632                if (unlikely(!uptodate)) {
2633                        if (extent_len) {
2634                                endio_readpage_release_extent(tree,
2635                                                              extent_start,
2636                                                              extent_len, 1);
2637                                extent_start = 0;
2638                                extent_len = 0;
2639                        }
2640                        endio_readpage_release_extent(tree, start,
2641                                                      end - start + 1, 0);
2642                } else if (!extent_len) {
2643                        extent_start = start;
2644                        extent_len = end + 1 - start;
2645                } else if (extent_start + extent_len == start) {
2646                        extent_len += end + 1 - start;
2647                } else {
2648                        endio_readpage_release_extent(tree, extent_start,
2649                                                      extent_len, uptodate);
2650                        extent_start = start;
2651                        extent_len = end + 1 - start;
2652                }
2653        }
2654
2655        if (extent_len)
2656                endio_readpage_release_extent(tree, extent_start, extent_len,
2657                                              uptodate);
2658        if (io_bio->end_io)
2659                io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status));
2660        bio_put(bio);
2661}
2662
2663/*
2664 * Initialize the members up to but not including 'bio'. Use after allocating a
2665 * new bio with bio_alloc_bioset, which does not initialize the bytes outside of
2666 * 'bio' because __GFP_ZERO is not supported.
2667 */
2668static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
2669{
2670        memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
2671}
2672
2673/*
2674 * The following helpers allocate a bio. As it's backed by a bioset, it'll
2675 * never fail.  We return a struct bio, but you can call btrfs_io_bio() on it
2676 * for the appropriate container_of magic.
2677 */
2678struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte)
2679{
2680        struct bio *bio;
2681
2682        bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset);
2683        bio_set_dev(bio, bdev);
2684        bio->bi_iter.bi_sector = first_byte >> 9;
2685        btrfs_io_bio_init(btrfs_io_bio(bio));
2686        return bio;
2687}
2688
2689struct bio *btrfs_bio_clone(struct bio *bio)
2690{
2691        struct btrfs_io_bio *btrfs_bio;
2692        struct bio *new;
2693
2694        /* Bio allocation backed by a bioset does not fail */
2695        new = bio_clone_fast(bio, GFP_NOFS, btrfs_bioset);
2696        btrfs_bio = btrfs_io_bio(new);
2697        btrfs_io_bio_init(btrfs_bio);
2698        btrfs_bio->iter = bio->bi_iter;
2699        return new;
2700}
2701
2702struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
2703{
2704        struct bio *bio;
2705
2706        /* Bio allocation backed by a bioset does not fail */
2707        bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, btrfs_bioset);
2708        btrfs_io_bio_init(btrfs_io_bio(bio));
2709        return bio;
2710}
2711
2712struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
2713{
2714        struct bio *bio;
2715        struct btrfs_io_bio *btrfs_bio;
2716
2717        /* this will never fail when it's backed by a bioset */
2718        bio = bio_clone_fast(orig, GFP_NOFS, btrfs_bioset);
2719        ASSERT(bio);
2720
2721        btrfs_bio = btrfs_io_bio(bio);
2722        btrfs_io_bio_init(btrfs_bio);
2723
2724        bio_trim(bio, offset >> 9, size >> 9);
2725        btrfs_bio->iter = bio->bi_iter;
2726        return bio;
2727}
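/*
 * Illustrative sketch, not part of the original file: clone just the first
 * page worth of an existing bio, e.g. to resubmit one block-sized piece of
 * a larger IO.  The helper name and the 0/PAGE_SIZE window are assumptions
 * for the example.
 */
static struct bio *example_clone_first_page(struct bio *orig)
{
        /* offset and size are in bytes and are trimmed to sectors internally */
        return btrfs_bio_clone_partial(orig, 0, PAGE_SIZE);
}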
2728
2729static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
2730                                       unsigned long bio_flags)
2731{
2732        blk_status_t ret = 0;
2733        struct bio_vec *bvec = bio_last_bvec_all(bio);
2734        struct page *page = bvec->bv_page;
2735        struct extent_io_tree *tree = bio->bi_private;
2736        u64 start;
2737
2738        start = page_offset(page) + bvec->bv_offset;
2739
2740        bio->bi_private = NULL;
2741
2742        if (tree->ops)
2743                ret = tree->ops->submit_bio_hook(tree->private_data, bio,
2744                                           mirror_num, bio_flags, start);
2745        else
2746                btrfsic_submit_bio(bio);
2747
2748        return blk_status_to_errno(ret);
2749}
2750
2751/*
2752 * @opf:        bio REQ_OP_* and REQ_* flags as one value
2753 * @tree:       tree so we can call our merge_bio hook
2754 * @wbc:        optional writeback control for io accounting
2755 * @page:       page to add to the bio
2756 * @pg_offset:  offset within @page of the data that is being added to
2757 *              the bio
2758 * @size:       portion of page that we want to write
2759 * @offset:     logical byte offset on disk, used to compute the bio sector
2760 * @bdev:       attach newly created bios to this bdev
2761 * @bio_ret:    must be valid pointer, newly allocated bio will be stored there
2762 * @end_io_func:     end_io callback for new bio
2763 * @mirror_num:      desired mirror to read/write
2764 * @prev_bio_flags:  flags of previous bio to see if we can merge the current one
2765 * @bio_flags:  flags of the current bio to see if we can merge them
2766 */
2767static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
2768                              struct writeback_control *wbc,
2769                              struct page *page, u64 offset,
2770                              size_t size, unsigned long pg_offset,
2771                              struct block_device *bdev,
2772                              struct bio **bio_ret,
2773                              bio_end_io_t end_io_func,
2774                              int mirror_num,
2775                              unsigned long prev_bio_flags,
2776                              unsigned long bio_flags,
2777                              bool force_bio_submit)
2778{
2779        int ret = 0;
2780        struct bio *bio;
2781        size_t page_size = min_t(size_t, size, PAGE_SIZE);
2782        sector_t sector = offset >> 9;
2783
2784        ASSERT(bio_ret);
2785
2786        if (*bio_ret) {
2787                bool contig;
2788                bool can_merge = true;
2789
2790                bio = *bio_ret;
2791                if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
2792                        contig = bio->bi_iter.bi_sector == sector;
2793                else
2794                        contig = bio_end_sector(bio) == sector;
2795
2796                if (tree->ops && tree->ops->merge_bio_hook(page, offset,
2797                                        page_size, bio, bio_flags))
2798                        can_merge = false;
2799
2800                if (prev_bio_flags != bio_flags || !contig || !can_merge ||
2801                    force_bio_submit ||
2802                    bio_add_page(bio, page, page_size, pg_offset) < page_size) {
2803                        ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
2804                        if (ret < 0) {
2805                                *bio_ret = NULL;
2806                                return ret;
2807                        }
2808                        bio = NULL;
2809                } else {
2810                        if (wbc)
2811                                wbc_account_io(wbc, page, page_size);
2812                        return 0;
2813                }
2814        }
2815
2816        bio = btrfs_bio_alloc(bdev, offset);
2817        bio_add_page(bio, page, page_size, pg_offset);
2818        bio->bi_end_io = end_io_func;
2819        bio->bi_private = tree;
2820        bio->bi_write_hint = page->mapping->host->i_write_hint;
2821        bio->bi_opf = opf;
2822        if (wbc) {
2823                wbc_init_bio(wbc, bio);
2824                wbc_account_io(wbc, page, page_size);
2825        }
2826
2827        *bio_ret = bio;
2828
2829        return ret;
2830}
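/*
 * Illustrative sketch, not part of the original file: queue a full-page
 * read at disk byte offset 'disk_bytenr', letting submit_extent_page()
 * either merge into *bio or start a new one; the caller is expected to
 * flush the final bio with submit_one_bio().  All parameter choices here
 * are assumptions for the example.
 */
static int example_queue_page_read(struct extent_io_tree *tree,
                                   struct page *page, u64 disk_bytenr,
                                   struct block_device *bdev,
                                   struct bio **bio)
{
        return submit_extent_page(REQ_OP_READ, tree, NULL, page, disk_bytenr,
                                  PAGE_SIZE, 0, bdev, bio,
                                  end_bio_extent_readpage, 0, 0, 0, false);
}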
2831
2832static void attach_extent_buffer_page(struct extent_buffer *eb,
2833                                      struct page *page)
2834{
2835        if (!PagePrivate(page)) {
2836                SetPagePrivate(page);
2837                get_page(page);
2838                set_page_private(page, (unsigned long)eb);
2839        } else {
2840                WARN_ON(page->private != (unsigned long)eb);
2841        }
2842}
2843
2844void set_page_extent_mapped(struct page *page)
2845{
2846        if (!PagePrivate(page)) {
2847                SetPagePrivate(page);
2848                get_page(page);
2849                set_page_private(page, EXTENT_PAGE_PRIVATE);
2850        }
2851}
2852
2853static struct extent_map *
2854__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
2855                 u64 start, u64 len, get_extent_t *get_extent,
2856                 struct extent_map **em_cached)
2857{
2858        struct extent_map *em;
2859
2860        if (em_cached && *em_cached) {
2861                em = *em_cached;
2862                if (extent_map_in_tree(em) && start >= em->start &&
2863                    start < extent_map_end(em)) {
2864                        refcount_inc(&em->refs);
2865                        return em;
2866                }
2867
2868                free_extent_map(em);
2869                *em_cached = NULL;
2870        }
2871
2872        em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0);
2873        if (em_cached && !IS_ERR_OR_NULL(em)) {
2874                BUG_ON(*em_cached);
2875                refcount_inc(&em->refs);
2876                *em_cached = em;
2877        }
2878        return em;
2879}
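
/*
 * Example (editorial sketch, not part of the upstream file): how a batched
 * reader threads *em_cached through consecutive calls so that one large
 * extent map is looked up only once for a run of pages.  This mirrors what
 * __do_contiguous_readpages() and extent_readpages() do further down;
 * bio, bio_flags and prev_em_start are the usual accumulator variables.
 *
 *	struct extent_map *em_cached = NULL;
 *	int i;
 *
 *	for (i = 0; i < nr_pages; i++)
 *		__do_readpage(tree, pages[i], btrfs_get_extent, &em_cached,
 *			      &bio, 0, &bio_flags, 0, &prev_em_start);
 *	if (em_cached)
 *		free_extent_map(em_cached);
 */
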
2880/*
2881 * basic readpage implementation.  Locked extent state structs are inserted
2882 * into the tree and are removed when the IO is done (by the end_io
2883 * handlers)
2884 * XXX JDM: This needs looking at to ensure proper page locking
2885 * Return 0 on success, otherwise return an error
2886 */
2887static int __do_readpage(struct extent_io_tree *tree,
2888                         struct page *page,
2889                         get_extent_t *get_extent,
2890                         struct extent_map **em_cached,
2891                         struct bio **bio, int mirror_num,
2892                         unsigned long *bio_flags, unsigned int read_flags,
2893                         u64 *prev_em_start)
2894{
2895        struct inode *inode = page->mapping->host;
2896        u64 start = page_offset(page);
2897        const u64 end = start + PAGE_SIZE - 1;
2898        u64 cur = start;
2899        u64 extent_offset;
2900        u64 last_byte = i_size_read(inode);
2901        u64 block_start;
2902        u64 cur_end;
2903        struct extent_map *em;
2904        struct block_device *bdev;
2905        int ret = 0;
2906        int nr = 0;
2907        size_t pg_offset = 0;
2908        size_t iosize;
2909        size_t disk_io_size;
2910        size_t blocksize = inode->i_sb->s_blocksize;
2911        unsigned long this_bio_flag = 0;
2912
2913        set_page_extent_mapped(page);
2914
2915        if (!PageUptodate(page)) {
2916                if (cleancache_get_page(page) == 0) {
2917                        BUG_ON(blocksize != PAGE_SIZE);
2918                        unlock_extent(tree, start, end);
2919                        goto out;
2920                }
2921        }
2922
2923        if (page->index == last_byte >> PAGE_SHIFT) {
2924                char *userpage;
2925                size_t zero_offset = last_byte & (PAGE_SIZE - 1);
2926
2927                if (zero_offset) {
2928                        iosize = PAGE_SIZE - zero_offset;
2929                        userpage = kmap_atomic(page);
2930                        memset(userpage + zero_offset, 0, iosize);
2931                        flush_dcache_page(page);
2932                        kunmap_atomic(userpage);
2933                }
2934        }
2935        while (cur <= end) {
2936                bool force_bio_submit = false;
2937                u64 offset;
2938
2939                if (cur >= last_byte) {
2940                        char *userpage;
2941                        struct extent_state *cached = NULL;
2942
2943                        iosize = PAGE_SIZE - pg_offset;
2944                        userpage = kmap_atomic(page);
2945                        memset(userpage + pg_offset, 0, iosize);
2946                        flush_dcache_page(page);
2947                        kunmap_atomic(userpage);
2948                        set_extent_uptodate(tree, cur, cur + iosize - 1,
2949                                            &cached, GFP_NOFS);
2950                        unlock_extent_cached(tree, cur,
2951                                             cur + iosize - 1, &cached);
2952                        break;
2953                }
2954                em = __get_extent_map(inode, page, pg_offset, cur,
2955                                      end - cur + 1, get_extent, em_cached);
2956                if (IS_ERR_OR_NULL(em)) {
2957                        SetPageError(page);
2958                        unlock_extent(tree, cur, end);
2959                        break;
2960                }
2961                extent_offset = cur - em->start;
2962                BUG_ON(extent_map_end(em) <= cur);
2963                BUG_ON(end < cur);
2964
2965                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2966                        this_bio_flag |= EXTENT_BIO_COMPRESSED;
2967                        extent_set_compress_type(&this_bio_flag,
2968                                                 em->compress_type);
2969                }
2970
2971                iosize = min(extent_map_end(em) - cur, end - cur + 1);
2972                cur_end = min(extent_map_end(em) - 1, end);
2973                iosize = ALIGN(iosize, blocksize);
2974                if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2975                        disk_io_size = em->block_len;
2976                        offset = em->block_start;
2977                } else {
2978                        offset = em->block_start + extent_offset;
2979                        disk_io_size = iosize;
2980                }
2981                bdev = em->bdev;
2982                block_start = em->block_start;
2983                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2984                        block_start = EXTENT_MAP_HOLE;
2985
2986                /*
2987                 * If we have a file range that points to a compressed extent
2988                 * and it's followed by a consecutive file range that points to
2989                 * the same compressed extent (possibly with a different
2990                 * offset and/or length, so it either points to the whole extent
2991                 * or only part of it), we must make sure we do not submit a
2992                 * single bio to populate the pages for the 2 ranges because
2993                 * this makes the compressed extent read zero out the pages
2994                 * belonging to the 2nd range. Imagine the following scenario:
2995                 *
2996                 *  File layout
2997                 *  [0 - 8K]                     [8K - 24K]
2998                 *    |                               |
2999                 *    |                               |
3000                 * points to extent X,         points to extent X,
3001                 * offset 4K, length of 8K     offset 0, length 16K
3002                 *
3003                 * [extent X, compressed length = 4K uncompressed length = 16K]
3004                 *
3005                 * If the bio to read the compressed extent covers both ranges,
3006                 * it will decompress extent X into the pages belonging to the
3007                 * first range and then it will stop, zeroing out the remaining
3008                 * pages that belong to the other range that points to extent X.
3009                 * So here we make sure we submit 2 bios, one for the first
3010                 * range and another one for the second range. Both will target
3011                 * the same physical extent from disk, but we can't currently
3012                 * make the compressed bio endio callback populate the pages
3013                 * for both ranges because each compressed bio is tightly
3014                 * coupled with a single extent map, and each range can have
3015                 * an extent map with a different offset value relative to the
3016                 * uncompressed data of our extent and different lengths. This
3017                 * is a corner case so we prioritize correctness over
3018                 * non-optimal behavior (submitting 2 bios for the same extent).
3019                 */
3020                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
3021                    prev_em_start && *prev_em_start != (u64)-1 &&
3022                    *prev_em_start != em->orig_start)
3023                        force_bio_submit = true;
3024
3025                if (prev_em_start)
3026                        *prev_em_start = em->orig_start;
3027
3028                free_extent_map(em);
3029                em = NULL;
3030
3031                /* we've found a hole, just zero and go on */
3032                if (block_start == EXTENT_MAP_HOLE) {
3033                        char *userpage;
3034                        struct extent_state *cached = NULL;
3035
3036                        userpage = kmap_atomic(page);
3037                        memset(userpage + pg_offset, 0, iosize);
3038                        flush_dcache_page(page);
3039                        kunmap_atomic(userpage);
3040
3041                        set_extent_uptodate(tree, cur, cur + iosize - 1,
3042                                            &cached, GFP_NOFS);
3043                        unlock_extent_cached(tree, cur,
3044                                             cur + iosize - 1, &cached);
3045                        cur = cur + iosize;
3046                        pg_offset += iosize;
3047                        continue;
3048                }
3049                /* the get_extent function already copied into the page */
3050                if (test_range_bit(tree, cur, cur_end,
3051                                   EXTENT_UPTODATE, 1, NULL)) {
3052                        check_page_uptodate(tree, page);
3053                        unlock_extent(tree, cur, cur + iosize - 1);
3054                        cur = cur + iosize;
3055                        pg_offset += iosize;
3056                        continue;
3057                }
3058                /* we have an inline extent but it didn't get marked up
3059                 * to date.  Error out
3060                 */
3061                if (block_start == EXTENT_MAP_INLINE) {
3062                        SetPageError(page);
3063                        unlock_extent(tree, cur, cur + iosize - 1);
3064                        cur = cur + iosize;
3065                        pg_offset += iosize;
3066                        continue;
3067                }
3068
3069                ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL,
3070                                         page, offset, disk_io_size,
3071                                         pg_offset, bdev, bio,
3072                                         end_bio_extent_readpage, mirror_num,
3073                                         *bio_flags,
3074                                         this_bio_flag,
3075                                         force_bio_submit);
3076                if (!ret) {
3077                        nr++;
3078                        *bio_flags = this_bio_flag;
3079                } else {
3080                        SetPageError(page);
3081                        unlock_extent(tree, cur, cur + iosize - 1);
3082                        goto out;
3083                }
3084                cur = cur + iosize;
3085                pg_offset += iosize;
3086        }
3087out:
3088        if (!nr) {
3089                if (!PageError(page))
3090                        SetPageUptodate(page);
3091                unlock_page(page);
3092        }
3093        return ret;
3094}
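
/*
 * Editorial summary of the loop above, per block-sized chunk of the page:
 *
 *	chunk starts at/after i_size  -> zero the rest of the page, mark the
 *	                                 range uptodate and stop
 *	EXTENT_MAP_HOLE               -> zero the chunk, mark the range uptodate
 *	range already EXTENT_UPTODATE -> nothing to read, just unlock the range
 *	EXTENT_MAP_INLINE             -> should have been copied by get_extent
 *	                                 already, so flag a page error
 *	anything else                 -> submit_extent_page() for the on-disk
 *	                                 range (merging into the running bio)
 */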
3095
3096static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
3097                                             struct page *pages[], int nr_pages,
3098                                             u64 start, u64 end,
3099                                             struct extent_map **em_cached,
3100                                             struct bio **bio,
3101                                             unsigned long *bio_flags,
3102                                             u64 *prev_em_start)
3103{
3104        struct inode *inode;
3105        struct btrfs_ordered_extent *ordered;
3106        int index;
3107
3108        inode = pages[0]->mapping->host;
3109        while (1) {
3110                lock_extent(tree, start, end);
3111                ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
3112                                                     end - start + 1);
3113                if (!ordered)
3114                        break;
3115                unlock_extent(tree, start, end);
3116                btrfs_start_ordered_extent(inode, ordered, 1);
3117                btrfs_put_ordered_extent(ordered);
3118        }
3119
3120        for (index = 0; index < nr_pages; index++) {
3121                __do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
3122                                bio, 0, bio_flags, 0, prev_em_start);
3123                put_page(pages[index]);
3124        }
3125}
3126
3127static void __extent_readpages(struct extent_io_tree *tree,
3128                               struct page *pages[],
3129                               int nr_pages,
3130                               struct extent_map **em_cached,
3131                               struct bio **bio, unsigned long *bio_flags,
3132                               u64 *prev_em_start)
3133{
3134        u64 start = 0;
3135        u64 end = 0;
3136        u64 page_start;
3137        int index;
3138        int first_index = 0;
3139
3140        for (index = 0; index < nr_pages; index++) {
3141                page_start = page_offset(pages[index]);
3142                if (!end) {
3143                        start = page_start;
3144                        end = start + PAGE_SIZE - 1;
3145                        first_index = index;
3146                } else if (end + 1 == page_start) {
3147                        end += PAGE_SIZE;
3148                } else {
3149                        __do_contiguous_readpages(tree, &pages[first_index],
3150                                                  index - first_index, start,
3151                                                  end, em_cached,
3152                                                  bio, bio_flags,
3153                                                  prev_em_start);
3154                        start = page_start;
3155                        end = start + PAGE_SIZE - 1;
3156                        first_index = index;
3157                }
3158        }
3159
3160        if (end)
3161                __do_contiguous_readpages(tree, &pages[first_index],
3162                                          index - first_index, start,
3163                                          end, em_cached, bio,
3164                                          bio_flags, prev_em_start);
3165}
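
/*
 * Worked example (editorial, assuming 4K pages): if the pages passed in sit
 * at file offsets 0K, 4K, 8K, 64K and 68K, the loop above ends up issuing
 * two batches:
 *
 *	__do_contiguous_readpages(tree, &pages[0], 3, 0,   12K - 1, ...);
 *	__do_contiguous_readpages(tree, &pages[3], 2, 64K, 72K - 1, ...);
 *
 * i.e. one call per contiguous run of file offsets, so each run has its
 * extent range locked and its ordered extents waited on once, instead of
 * once per page.
 */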
3166
3167static int __extent_read_full_page(struct extent_io_tree *tree,
3168                                   struct page *page,
3169                                   get_extent_t *get_extent,
3170                                   struct bio **bio, int mirror_num,
3171                                   unsigned long *bio_flags,
3172                                   unsigned int read_flags)
3173{
3174        struct inode *inode = page->mapping->host;
3175        struct btrfs_ordered_extent *ordered;
3176        u64 start = page_offset(page);
3177        u64 end = start + PAGE_SIZE - 1;
3178        int ret;
3179
3180        while (1) {
3181                lock_extent(tree, start, end);
3182                ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
3183                                                PAGE_SIZE);
3184                if (!ordered)
3185                        break;
3186                unlock_extent(tree, start, end);
3187                btrfs_start_ordered_extent(inode, ordered, 1);
3188                btrfs_put_ordered_extent(ordered);
3189        }
3190
3191        ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
3192                            bio_flags, read_flags, NULL);
3193        return ret;
3194}
3195
3196int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
3197                            get_extent_t *get_extent, int mirror_num)
3198{
3199        struct bio *bio = NULL;
3200        unsigned long bio_flags = 0;
3201        int ret;
3202
3203        ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
3204                                      &bio_flags, 0);
3205        if (bio)
3206                ret = submit_one_bio(bio, mirror_num, bio_flags);
3207        return ret;
3208}
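
/*
 * Example (editorial sketch): extent_read_full_page() is the single-page
 * read entry point; at this point in history the btrfs ->readpage
 * address_space operation in fs/btrfs/inode.c is expected to be little
 * more than a wrapper along these lines (see inode.c for the authoritative
 * version):
 *
 *	static int btrfs_readpage(struct file *file, struct page *page)
 *	{
 *		struct extent_io_tree *tree;
 *
 *		tree = &BTRFS_I(page->mapping->host)->io_tree;
 *		return extent_read_full_page(tree, page, btrfs_get_extent, 0);
 *	}
 */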
3209
3210static void update_nr_written(struct writeback_control *wbc,
3211                              unsigned long nr_written)
3212{
3213        wbc->nr_to_write -= nr_written;
3214}
3215
3216/*
3217 * helper for __extent_writepage, doing all of the delayed allocation setup.
3218 *
3219 * This returns 1 if our fill_delalloc function did all the work required
3220 * to write the page (copy into inline extent).  In this case the IO has
3221 * been started and the page is already unlocked.
3222 *
3223 * This returns 0 if all went well (page still locked)
3224 * This returns < 0 if there were errors (page still locked)
3225 */
3226static noinline_for_stack int writepage_delalloc(struct inode *inode,
3227                              struct page *page, struct writeback_control *wbc,
3228                              struct extent_page_data *epd,
3229                              u64 delalloc_start,
3230                              unsigned long *nr_written)
3231{
3232        struct extent_io_tree *tree = epd->tree;
3233        u64 page_end = delalloc_start + PAGE_SIZE - 1;
3234        u64 nr_delalloc;
3235        u64 delalloc_to_write = 0;
3236        u64 delalloc_end = 0;
3237        int ret;
3238        int page_started = 0;
3239
3240        if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc)
3241                return 0;
3242
3243        while (delalloc_end < page_end) {
3244                nr_delalloc = find_lock_delalloc_range(inode, tree,
3245                                               page,
3246                                               &delalloc_start,
3247                                               &delalloc_end,
3248                                               BTRFS_MAX_EXTENT_SIZE);
3249                if (nr_delalloc == 0) {
3250                        delalloc_start = delalloc_end + 1;
3251                        continue;
3252                }
3253                ret = tree->ops->fill_delalloc(inode, page,
3254                                               delalloc_start,
3255                                               delalloc_end,
3256                                               &page_started,
3257                                               nr_written, wbc);
3258                /* File system has been set read-only */
3259                if (ret) {
3260                        SetPageError(page);
3261                         /* fill_delalloc should return < 0 on error, but
3262                          * just in case it didn't, turn anything else into
3263                          * -EIO here: > 0 means the IO was started, so we
3264                          * only want to return > 0 when all went well.
3265                          */
3266                        ret = ret < 0 ? ret : -EIO;
3267                        goto done;
3268                }
3269                /*
3270                 * delalloc_end is already one less than the total length, so
3271                 * we don't subtract one from PAGE_SIZE
3272                 */
3273                delalloc_to_write += (delalloc_end - delalloc_start +
3274                                      PAGE_SIZE) >> PAGE_SHIFT;
3275                delalloc_start = delalloc_end + 1;
3276        }
3277        if (wbc->nr_to_write < delalloc_to_write) {
3278                int thresh = 8192;
3279
3280                if (delalloc_to_write < thresh * 2)
3281                        thresh = delalloc_to_write;
3282                wbc->nr_to_write = min_t(u64, delalloc_to_write,
3283                                         thresh);
3284        }
3285
3286        /* did the fill delalloc function already unlock and start
3287         * the IO?
3288         */
3289        if (page_started) {
3290                /*
3291                 * we've unlocked the page, so we can't update
3292                 * the mapping's writeback index, just update
3293                 * nr_to_write.
3294                 */
3295                wbc->nr_to_write -= *nr_written;
3296                return 1;
3297        }
3298
3299        ret = 0;
3300
3301done:
3302        return ret;
3303}
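
/*
 * Example (editorial sketch): how the 0 / 1 / < 0 contract documented above
 * is consumed by __extent_writepage() further down.  A return of 1 means
 * the delalloc code took over, started the IO and already unlocked the
 * page, so the caller must not touch the page again:
 *
 *	ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written);
 *	if (ret == 1)
 *		goto done_unlocked;
 *	if (ret)
 *		goto done;
 */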
3304
3305/*
3306 * helper for __extent_writepage.  This calls the writepage start hooks,
3307 * and does the loop to map the page into extents and bios.
3308 *
3309 * We return 1 if the IO is started and the page is unlocked,
3310 * 0 if all went well (page still locked)
3311 * < 0 if there were errors (page still locked)
3312 */
3313static noinline_for_stack int __extent_writepage_io(struct inode *inode,
3314                                 struct page *page,
3315                                 struct writeback_control *wbc,
3316                                 struct extent_page_data *epd,
3317                                 loff_t i_size,
3318                                 unsigned long nr_written,
3319                                 unsigned int write_flags, int *nr_ret)
3320{
3321        struct extent_io_tree *tree = epd->tree;
3322        u64 start = page_offset(page);
3323        u64 page_end = start + PAGE_SIZE - 1;
3324        u64 end;
3325        u64 cur = start;
3326        u64 extent_offset;
3327        u64 block_start;
3328        u64 iosize;
3329        struct extent_map *em;
3330        struct block_device *bdev;
3331        size_t pg_offset = 0;
3332        size_t blocksize;
3333        int ret = 0;
3334        int nr = 0;
3335        bool compressed;
3336
3337        if (tree->ops && tree->ops->writepage_start_hook) {
3338                ret = tree->ops->writepage_start_hook(page, start,
3339                                                      page_end);
3340                if (ret) {
3341                        /* Fixup worker will requeue */
3342                        if (ret == -EBUSY)
3343                                wbc->pages_skipped++;
3344                        else
3345                                redirty_page_for_writepage(wbc, page);
3346
3347                        update_nr_written(wbc, nr_written);
3348                        unlock_page(page);
3349                        return 1;
3350                }
3351        }
3352
3353        /*
3354         * we don't want to touch the inode after unlocking the page,
3355         * so we update the writeback accounting (nr_to_write) now
3356         */
3357        update_nr_written(wbc, nr_written + 1);
3358
3359        end = page_end;
3360        if (i_size <= start) {
3361                if (tree->ops && tree->ops->writepage_end_io_hook)
3362                        tree->ops->writepage_end_io_hook(page, start,
3363                                                         page_end, NULL, 1);
3364                goto done;
3365        }
3366
3367        blocksize = inode->i_sb->s_blocksize;
3368
3369        while (cur <= end) {
3370                u64 em_end;
3371                u64 offset;
3372
3373                if (cur >= i_size) {
3374                        if (tree->ops && tree->ops->writepage_end_io_hook)
3375                                tree->ops->writepage_end_io_hook(page, cur,
3376                                                         page_end, NULL, 1);
3377                        break;
3378                }
3379                em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur,
3380                                     end - cur + 1, 1);
3381                if (IS_ERR_OR_NULL(em)) {
3382                        SetPageError(page);
3383                        ret = PTR_ERR_OR_ZERO(em);
3384                        break;
3385                }
3386
3387                extent_offset = cur - em->start;
3388                em_end = extent_map_end(em);
3389                BUG_ON(em_end <= cur);
3390                BUG_ON(end < cur);
3391                iosize = min(em_end - cur, end - cur + 1);
3392                iosize = ALIGN(iosize, blocksize);
3393                offset = em->block_start + extent_offset;
3394                bdev = em->bdev;
3395                block_start = em->block_start;
3396                compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
3397                free_extent_map(em);
3398                em = NULL;
3399
3400                /*
3401                 * compressed and inline extents are written through other
3402                 * paths in the FS
3403                 */
3404                if (compressed || block_start == EXTENT_MAP_HOLE ||
3405                    block_start == EXTENT_MAP_INLINE) {
3406                        /*
3407                         * end_io notification does not happen here for
3408                         * compressed extents
3409                         */
3410                        if (!compressed && tree->ops &&
3411                            tree->ops->writepage_end_io_hook)
3412                                tree->ops->writepage_end_io_hook(page, cur,
3413                                                         cur + iosize - 1,
3414                                                         NULL, 1);
3415                        else if (compressed) {
3416                                /* we don't want to end_page_writeback on
3417                                 * a compressed extent.  this happens
3418                                 * elsewhere
3419                                 */
3420                                nr++;
3421                        }
3422
3423                        cur += iosize;
3424                        pg_offset += iosize;
3425                        continue;
3426                }
3427
3428                set_range_writeback(tree, cur, cur + iosize - 1);
3429                if (!PageWriteback(page)) {
3430                        btrfs_err(BTRFS_I(inode)->root->fs_info,
3431                                   "page %lu not writeback, cur %llu end %llu",
3432                               page->index, cur, end);
3433                }
3434
3435                ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
3436                                         page, offset, iosize, pg_offset,
3437                                         bdev, &epd->bio,
3438                                         end_bio_extent_writepage,
3439                                         0, 0, 0, false);
3440                if (ret) {
3441                        SetPageError(page);
3442                        if (PageWriteback(page))
3443                                end_page_writeback(page);
3444                }
3445
3446                cur = cur + iosize;
3447                pg_offset += iosize;
3448                nr++;
3449        }
3450done:
3451        *nr_ret = nr;
3452        return ret;
3453}
3454
3455/*
3456 * the writepage semantics are similar to regular writepage.  extent
3457 * records are inserted to lock ranges in the tree, and as dirty areas
3458 * are found, they are marked writeback.  Then the lock bits are removed
3459 * and the end_io handler clears the writeback ranges
3460 */
3461static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3462                              struct extent_page_data *epd)
3463{
3464        struct inode *inode = page->mapping->host;
3465        u64 start = page_offset(page);
3466        u64 page_end = start + PAGE_SIZE - 1;
3467        int ret;
3468        int nr = 0;
3469        size_t pg_offset = 0;
3470        loff_t i_size = i_size_read(inode);
3471        unsigned long end_index = i_size >> PAGE_SHIFT;
3472        unsigned int write_flags = 0;
3473        unsigned long nr_written = 0;
3474
3475        write_flags = wbc_to_write_flags(wbc);
3476
3477        trace___extent_writepage(page, inode, wbc);
3478
3479        WARN_ON(!PageLocked(page));
3480
3481        ClearPageError(page);
3482
3483        pg_offset = i_size & (PAGE_SIZE - 1);
3484        if (page->index > end_index ||
3485           (page->index == end_index && !pg_offset)) {
3486                page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
3487                unlock_page(page);
3488                return 0;
3489        }
3490
3491        if (page->index == end_index) {
3492                char *userpage;
3493
3494                userpage = kmap_atomic(page);
3495                memset(userpage + pg_offset, 0,
3496                       PAGE_SIZE - pg_offset);
3497                kunmap_atomic(userpage);
3498                flush_dcache_page(page);
3499        }
3500
3501        pg_offset = 0;
3502
3503        set_page_extent_mapped(page);
3504
3505        ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written);
3506        if (ret == 1)
3507                goto done_unlocked;
3508        if (ret)
3509                goto done;
3510
3511        ret = __extent_writepage_io(inode, page, wbc, epd,
3512                                    i_size, nr_written, write_flags, &nr);
3513        if (ret == 1)
3514                goto done_unlocked;
3515
3516done:
3517        if (nr == 0) {
3518                /* make sure the mapping tag for page dirty gets cleared */
3519                set_page_writeback(page);
3520                end_page_writeback(page);
3521        }
3522        if (PageError(page)) {
3523                ret = ret < 0 ? ret : -EIO;
3524                end_extent_writepage(page, ret, start, page_end);
3525        }
3526        unlock_page(page);
3527        return ret;
3528
3529done_unlocked:
3530        return 0;
3531}
3532
3533void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
3534{
3535        wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
3536                       TASK_UNINTERRUPTIBLE);
3537}
3538
3539static noinline_for_stack int
3540lock_extent_buffer_for_io(struct extent_buffer *eb,
3541                          struct btrfs_fs_info *fs_info,
3542                          struct extent_page_data *epd)
3543{
3544        unsigned long i, num_pages;
3545        int flush = 0;
3546        int ret = 0;
3547
3548        if (!btrfs_try_tree_write_lock(eb)) {
3549                flush = 1;
3550                flush_write_bio(epd);
3551                btrfs_tree_lock(eb);
3552        }
3553
3554        if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
3555                btrfs_tree_unlock(eb);
3556                if (!epd->sync_io)
3557                        return 0;
3558                if (!flush) {
3559                        flush_write_bio(epd);
3560                        flush = 1;
3561                }
3562                while (1) {
3563                        wait_on_extent_buffer_writeback(eb);
3564                        btrfs_tree_lock(eb);
3565                        if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
3566                                break;
3567                        btrfs_tree_unlock(eb);
3568                }
3569        }
3570
3571        /*
3572         * We need to do this to prevent races with anyone who checks if the eb is
3573         * under IO since we can end up having no IO bits set for a short period
3574         * of time.
3575         */
3576        spin_lock(&eb->refs_lock);
3577        if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3578                set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3579                spin_unlock(&eb->refs_lock);
3580                btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3581                percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
3582                                         -eb->len,
3583                                         fs_info->dirty_metadata_batch);
3584                ret = 1;
3585        } else {
3586                spin_unlock(&eb->refs_lock);
3587        }
3588
3589        btrfs_tree_unlock(eb);
3590
3591        if (!ret)
3592                return ret;
3593
3594        num_pages = num_extent_pages(eb->start, eb->len);
3595        for (i = 0; i < num_pages; i++) {
3596                struct page *p = eb->pages[i];
3597
3598                if (!trylock_page(p)) {
3599                        if (!flush) {
3600                                flush_write_bio(epd);
3601                                flush = 1;
3602                        }
3603                        lock_page(p);
3604                }
3605        }
3606
3607        return ret;
3608}
3609
3610static void end_extent_buffer_writeback(struct extent_buffer *eb)
3611{
3612        clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3613        smp_mb__after_atomic();
3614        wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3615}
3616
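/*
 * Editorial note: end_extent_buffer_writeback() above is the waker half of
 * the pattern whose sleeper is wait_on_extent_buffer_writeback().  The
 * smp_mb__after_atomic() matters because wake_up_bit() peeks at the wait
 * queue without taking its lock: the cleared bit must be visible before
 * that check, or a task that just queued itself and re-tested the bit
 * could still see it set, go to sleep and miss the wakeup.
 *
 *	sleeper					waker
 *	-------					-----
 *	wait_on_bit_io(&eb->bflags,		clear_bit(EXTENT_BUFFER_WRITEBACK)
 *		EXTENT_BUFFER_WRITEBACK,	smp_mb__after_atomic()
 *		TASK_UNINTERRUPTIBLE)		wake_up_bit(&eb->bflags,
 *						   EXTENT_BUFFER_WRITEBACK)
 */
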
3617static void set_btree_ioerr(struct page *page)
3618{
3619        struct extent_buffer *eb = (struct extent_buffer *)page->private;
3620
3621        SetPageError(page);
3622        if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
3623                return;
3624
3625        /*
3626         * If writeback for a btree extent that doesn't belong to a log tree
3627         * failed, set a flag in the fs_info so the transaction commit notices it.
3628         * We do this because while the transaction is running and before it's
3629         * committing (when we call filemap_fdata[write|wait]_range against
3630         * the btree inode), we might have
3631         * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
3632         * returns an error or an error happens during writeback, when we're
3633         * committing the transaction we wouldn't know about it, since the pages
3634         * may no longer be dirty nor marked for writeback (if a
3635         * subsequent modification to the extent buffer didn't happen before the
3636         * transaction commit), which makes filemap_fdata[write|wait]_range not
3637         * able to find the pages tagged with SetPageError at transaction
3638         * commit time. So if this happens we must abort the transaction,
3639         * otherwise we commit a super block with btree roots that point to
3640         * btree nodes/leafs whose content on disk is invalid - either garbage
3641         * or the content of some node/leaf from a past generation that got
3642         * cowed or deleted and is no longer valid.
3643         *
3644         * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
3645         * not be enough - we need to distinguish between log tree extents vs
3646         * non-log tree extents, and the next filemap_fdatawait_range() call
3647         * will catch and clear such errors in the mapping - and that call might
3648         * be from a log sync and not from a transaction commit. Also, checking
3649         * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
3650         * not done and would not be reliable - the eb might have been released
3651         * from memory and reading it back again means that flag would not be
3652         * set (since it's a runtime flag, not persisted on disk).
3653         *
3654         * Using the flags below in the btree inode also covers the case where
3655         * writepages() returns success after having started writeback for all
3656         * dirty pages, but that writeback finishes with errors before
3657         * filemap_fdatawait_range() is called. Because we were not using
3658         * AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return success in
3659         * that case too, as it could not know that writeback errors happened
3660         * (the pages were no longer tagged for writeback by the time it
3661         * looked at them).
3662         */
3663        switch (eb->log_index) {
3664        case -1:
3665                set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
3666                break;
3667        case 0:
3668                set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
3669                break;
3670        case 1:
3671                set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
3672                break;
3673        default:
3674                BUG(); /* unexpected, logic error */
3675        }
3676}
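
/*
 * Example (editorial sketch): the flags set above are meant to be consumed
 * at transaction commit / log sync time, where seeing one of them must turn
 * into an aborted commit or log sync instead of a superblock (or log) that
 * points at bad blocks.  The real consumers live outside this file (in the
 * transaction and tree-log code); an illustrative check would be:
 *
 *	if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags))
 *		ret = -EIO;
 *
 * with BTRFS_FS_LOG1_ERR / BTRFS_FS_LOG2_ERR checked the same way when the
 * respective log tree is being synced.
 */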
3677
3678static void end_bio_extent_buffer_writepage(struct bio *bio)
3679{
3680        struct bio_vec *bvec;
3681        struct extent_buffer *eb;
3682        int i, done;
3683
3684        ASSERT(!bio_flagged(bio, BIO_CLONED));
3685        bio_for_each_segment_all(bvec, bio, i) {
3686                struct page *page = bvec->bv_page;
3687
3688                eb = (struct extent_buffer *)page->private;
3689                BUG_ON(!eb);
3690                done = atomic_dec_and_test(&eb->io_pages);
3691
3692                if (bio->bi_status ||
3693                    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
3694                        ClearPageUptodate(page);
3695                        set_btree_ioerr(page);
3696                }
3697
3698                end_page_writeback(page);
3699
3700                if (!done)
3701                        continue;
3702
3703                end_extent_buffer_writeback(eb);
3704        }
3705
3706        bio_put(bio);
3707}
3708
3709static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
3710                        struct btrfs_fs_info *fs_info,
3711                        struct writeback_control *wbc,
3712                        struct extent_page_data *epd)
3713{
3714        struct block_device *bdev = fs_info->fs_devices->latest_bdev;
3715        struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
3716        u64 offset = eb->start;
3717        u32 nritems;
3718        unsigned long i, num_pages;
3719        unsigned long start, end;
3720        unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
3721        int ret = 0;
3722
3723        clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
3724        num_pages = num_extent_pages(eb->start, eb->len);
3725        atomic_set(&eb->io_pages, num_pages);
3726
3727        /* set btree blocks beyond nritems with 0 to avoid stale content. */
3728        nritems = btrfs_header_nritems(eb);
3729        if (btrfs_header_level(eb) > 0) {
3730                end = btrfs_node_key_ptr_offset(nritems);
3731
3732                memzero_extent_buffer(eb, end, eb->len - end);
3733        } else {
3734                /*
3735                 * leaf:
3736                 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
3737                 */
3738                start = btrfs_item_nr_offset(nritems);
3739                end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, eb);
3740                memzero_extent_buffer(eb, start, end - start);
3741        }
3742
3743        for (i = 0; i < num_pages; i++) {
3744                struct page *p = eb->pages[i];
3745
3746                clear_page_dirty_for_io(p);
3747                set_page_writeback(p);
3748                ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
3749                                         p, offset, PAGE_SIZE, 0, bdev,
3750                                         &epd->bio,
3751                                         end_bio_extent_buffer_writepage,
3752                                         0, 0, 0, false);
3753                if (ret) {
3754                        set_btree_ioerr(p);
3755                        if (PageWriteback(p))
3756                                end_page_writeback(p);
3757                        if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
3758                                end_extent_buffer_writeback(eb);
3759                        ret = -EIO;
3760                        break;
3761                }
3762                offset += PAGE_SIZE;
3763                update_nr_written(wbc, 1);
3764                unlock_page(p);
3765        }
3766
3767        if (unlikely(ret)) {
3768                for (; i < num_pages; i++) {
3769                        struct page *p = eb->pages[i];
3770                        clear_page_dirty_for_io(p);
3771                        unlock_page(p);
3772                }
3773        }
3774
3775        return ret;
3776}
3777
3778int btree_write_cache_pages(struct address_space *mapping,
3779                                   struct writeback_control *wbc)
3780{
3781        struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
3782        struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
3783        struct extent_buffer *eb, *prev_eb = NULL;
3784        struct extent_page_data epd = {
3785                .bio = NULL,
3786                .tree = tree,
3787                .extent_locked = 0,
3788                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3789        };
3790        int ret = 0;
3791        int done = 0;
3792        int nr_to_write_done = 0;
3793        struct pagevec pvec;
3794        int nr_pages;
3795        pgoff_t index;
3796        pgoff_t end;            /* Inclusive */
3797        int scanned = 0;
3798        int tag;
3799
3800        pagevec_init(&pvec);
3801        if (wbc->range_cyclic) {
3802                index = mapping->writeback_index; /* Start from prev offset */
3803                end = -1;
3804        } else {
3805                index = wbc->range_start >> PAGE_SHIFT;
3806                end = wbc->range_end >> PAGE_SHIFT;
3807                scanned = 1;
3808        }
3809        if (wbc->sync_mode == WB_SYNC_ALL)
3810                tag = PAGECACHE_TAG_TOWRITE;
3811        else
3812                tag = PAGECACHE_TAG_DIRTY;
3813retry:
3814        if (wbc->sync_mode == WB_SYNC_ALL)
3815                tag_pages_for_writeback(mapping, index, end);
3816        while (!done && !nr_to_write_done && (index <= end) &&
3817               (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
3818                        tag))) {
3819                unsigned i;
3820
3821                scanned = 1;
3822                for (i = 0; i < nr_pages; i++) {
3823                        struct page *page = pvec.pages[i];
3824
3825                        if (!PagePrivate(page))
3826                                continue;
3827
3828                        spin_lock(&mapping->private_lock);
3829                        if (!PagePrivate(page)) {
3830                                spin_unlock(&mapping->private_lock);
3831                                continue;
3832                        }
3833
3834                        eb = (struct extent_buffer *)page->private;
3835
3836                        /*
3837                         * Shouldn't happen and normally this would be a BUG_ON
3838                         * but no sense in crashing the user's box for something
3839                         * we can survive anyway.
3840                         */
3841                        if (WARN_ON(!eb)) {
3842                                spin_unlock(&mapping->private_lock);
3843                                continue;
3844                        }
3845
3846                        if (eb == prev_eb) {
3847                                spin_unlock(&mapping->private_lock);
3848                                continue;
3849                        }
3850
3851                        ret = atomic_inc_not_zero(&eb->refs);
3852                        spin_unlock(&mapping->private_lock);
3853                        if (!ret)
3854                                continue;
3855
3856                        prev_eb = eb;
3857                        ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
3858                        if (!ret) {
3859                                free_extent_buffer(eb);
3860                                continue;
3861                        }
3862
3863                        ret = write_one_eb(eb, fs_info, wbc, &epd);
3864                        if (ret) {
3865                                done = 1;
3866                                free_extent_buffer(eb);
3867                                break;
3868                        }
3869                        free_extent_buffer(eb);
3870
3871                        /*
3872                         * the filesystem may choose to bump up nr_to_write.
3873                         * We have to make sure to honor the new nr_to_write
3874                         * at any time
3875                         */
3876                        nr_to_write_done = wbc->nr_to_write <= 0;
3877                }
3878                pagevec_release(&pvec);
3879                cond_resched();
3880        }
3881        if (!scanned && !done) {
3882                /*
3883                 * We hit the last page and there is more work to be done: wrap
3884                 * back to the start of the file
3885                 */
3886                scanned = 1;
3887                index = 0;
3888                goto retry;
3889        }
3890        flush_write_bio(&epd);
3891        return ret;
3892}
3893
3894/**
3895 * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
3896 * @mapping: address space structure to write
3897 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
3898 * @epd: extent_page_data with the context for the write (the bio being built, the io tree, sync mode), passed down to __extent_writepage
3899 *
3900 * If a page is already under I/O, write_cache_pages() skips it, even
3901 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
3902 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
3903 * and msync() need to guarantee that all the data which was dirty at the time
3904 * the call was made get new I/O started against them.  If wbc->sync_mode is
3905 * WB_SYNC_ALL then we were called for data integrity and we must wait for
3906 * existing IO to complete.
3907 */
3908static int extent_write_cache_pages(struct address_space *mapping,
3909                             struct writeback_control *wbc,
3910                             struct extent_page_data *epd)
3911{
3912        struct inode *inode = mapping->host;
3913        int ret = 0;
3914        int done = 0;
3915        int nr_to_write_done = 0;
3916        struct pagevec pvec;
3917        int nr_pages;
3918        pgoff_t index;
3919        pgoff_t end;            /* Inclusive */
3920        pgoff_t done_index;
3921        int range_whole = 0;
3922        int scanned = 0;
3923        int tag;
3924
3925        /*
3926         * We have to hold onto the inode so that ordered extents can do their
3927         * work when the IO finishes.  The alternative to this is failing to add
3928         * an ordered extent if the igrab() fails there and that is a huge pain
3929         * to deal with, so instead just hold onto the inode throughout the
3930         * writepages operation.  If it fails here we are freeing up the inode
3931         * anyway and we'd rather not waste our time writing out stuff that is
3932         * going to be truncated anyway.
3933         */
3934        if (!igrab(inode))
3935                return 0;
3936
3937        pagevec_init(&pvec);
3938        if (wbc->range_cyclic) {
3939                index = mapping->writeback_index; /* Start from prev offset */
3940                end = -1;
3941        } else {
3942                index = wbc->range_start >> PAGE_SHIFT;
3943                end = wbc->range_end >> PAGE_SHIFT;
3944                if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
3945                        range_whole = 1;
3946                scanned = 1;
3947        }
3948        if (wbc->sync_mode == WB_SYNC_ALL)
3949                tag = PAGECACHE_TAG_TOWRITE;
3950        else
3951                tag = PAGECACHE_TAG_DIRTY;
3952retry:
3953        if (wbc->sync_mode == WB_SYNC_ALL)
3954                tag_pages_for_writeback(mapping, index, end);
3955        done_index = index;
3956        while (!done && !nr_to_write_done && (index <= end) &&
3957                        (nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
3958                                                &index, end, tag))) {
3959                unsigned i;
3960
3961                scanned = 1;
3962                for (i = 0; i < nr_pages; i++) {
3963                        struct page *page = pvec.pages[i];
3964
3965                        done_index = page->index;
3966                        /*
3967                         * At this point we hold neither the i_pages lock nor
3968                         * the page lock: the page may be truncated or
3969                         * invalidated (changing page->mapping to NULL),
3970                         * or even swizzled back from swapper_space to
3971                         * tmpfs file mapping
3972                         */
3973                        if (!trylock_page(page)) {
3974                                flush_write_bio(epd);
3975                                lock_page(page);
3976                        }
3977
3978                        if (unlikely(page->mapping != mapping)) {
3979                                unlock_page(page);
3980                                continue;
3981                        }
3982
3983                        if (wbc->sync_mode != WB_SYNC_NONE) {
3984                                if (PageWriteback(page))
3985                                        flush_write_bio(epd);
3986                                wait_on_page_writeback(page);
3987                        }
3988
3989                        if (PageWriteback(page) ||
3990                            !clear_page_dirty_for_io(page)) {
3991                                unlock_page(page);
3992                                continue;
3993                        }
3994
3995                        ret = __extent_writepage(page, wbc, epd);
3996
3997                        if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
3998                                unlock_page(page);
3999                                ret = 0;
4000                        }
4001                        if (ret < 0) {
4002                                /*
4003                                 * done_index is set past this page,
4004                                 * so media errors will not choke
4005                                 * background writeout for the entire
4006                                 * file. This has consequences for
4007                                 * range_cyclic semantics (ie. it may
4008                                 * not be suitable for data integrity
4009                                 * writeout).
4010                                 */
4011                                done_index = page->index + 1;
4012                                done = 1;
4013                                break;
4014                        }
4015
4016                        /*
4017                         * the filesystem may choose to bump up nr_to_write.
4018                         * We have to make sure to honor the new nr_to_write
4019                         * at any time
4020                         */
4021                        nr_to_write_done = wbc->nr_to_write <= 0;
4022                }
4023                pagevec_release(&pvec);
4024                cond_resched();
4025        }
4026        if (!scanned && !done) {
4027                /*
4028                 * We hit the last page and there is more work to be done: wrap
4029                 * back to the start of the file
4030                 */
4031                scanned = 1;
4032                index = 0;
4033                goto retry;
4034        }
4035
4036        if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
4037                mapping->writeback_index = done_index;
4038
4039        btrfs_add_delayed_iput(inode);
4040        return ret;
4041}
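
/*
 * Example (editorial sketch): the two flavours of writeback_control this
 * function is typically driven with.  The field names are the standard
 * struct writeback_control ones; the values are illustrative only.
 *
 *	Background, memory-cleaning writeback (may skip busy pages):
 *
 *		struct writeback_control wbc = {
 *			.sync_mode	= WB_SYNC_NONE,
 *			.nr_to_write	= 1024,
 *		};
 *
 *	Data integrity writeback, e.g. behind fsync() (must not skip):
 *
 *		struct writeback_control wbc = {
 *			.sync_mode	= WB_SYNC_ALL,
 *			.range_start	= start,
 *			.range_end	= end,
 *			.nr_to_write	= LONG_MAX,
 *		};
 */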
4042
4043static void flush_write_bio(struct extent_page_data *epd)
4044{
4045        if (epd->bio) {
4046                int ret;
4047
4048                ret = submit_one_bio(epd->bio, 0, 0);
4049                BUG_ON(ret < 0); /* -ENOMEM */
4050                epd->bio = NULL;
4051        }
4052}
4053
4054int extent_write_full_page(struct page *page, struct writeback_control *wbc)
4055{
4056        int ret;
4057        struct extent_page_data epd = {
4058                .bio = NULL,
4059                .tree = &BTRFS_I(page->mapping->host)->io_tree,
4060                .extent_locked = 0,
4061                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4062        };
4063
4064        ret = __extent_writepage(page, wbc, &epd);
4065
4066        flush_write_bio(&epd);
4067        return ret;
4068}
4069
4070int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
4071                              int mode)
4072{
4073        int ret = 0;
4074        struct address_space *mapping = inode->i_mapping;
4075        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
4076        struct page *page;
4077        unsigned long nr_pages = (end - start + PAGE_SIZE) >>
4078                PAGE_SHIFT;
4079
4080        struct extent_page_data epd = {
4081                .bio = NULL,
4082                .tree = tree,
4083                .extent_locked = 1,
4084                .sync_io = mode == WB_SYNC_ALL,
4085        };
4086        struct writeback_control wbc_writepages = {
4087                .sync_mode      = mode,
4088                .nr_to_write    = nr_pages * 2,
4089                .range_start    = start,
4090                .range_end      = end + 1,
4091        };
4092
4093        while (start <= end) {
4094                page = find_get_page(mapping, start >> PAGE_SHIFT);
4095                if (clear_page_dirty_for_io(page))
4096                        ret = __extent_writepage(page, &wbc_writepages, &epd);
4097                else {
4098                        if (tree->ops && tree->ops->writepage_end_io_hook)
4099                                tree->ops->writepage_end_io_hook(page, start,
4100                                                 start + PAGE_SIZE - 1,
4101                                                 NULL, 1);
4102                        unlock_page(page);
4103                }
4104                put_page(page);
4105                start += PAGE_SIZE;
4106        }
4107
4108        flush_write_bio(&epd);
4109        return ret;
4110}
4111
4112int extent_writepages(struct extent_io_tree *tree,
4113                      struct address_space *mapping,
4114                      struct writeback_control *wbc)
4115{
4116        int ret = 0;
4117        struct extent_page_data epd = {
4118                .bio = NULL,
4119                .tree = tree,
4120                .extent_locked = 0,
4121                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4122        };
4123
4124        ret = extent_write_cache_pages(mapping, wbc, &epd);
4125        flush_write_bio(&epd);
4126        return ret;
4127}
4128
4129int extent_readpages(struct extent_io_tree *tree,
4130                     struct address_space *mapping,
4131                     struct list_head *pages, unsigned nr_pages)
4132{
4133        struct bio *bio = NULL;
4134        unsigned page_idx;
4135        unsigned long bio_flags = 0;
4136        struct page *pagepool[16];
4137        struct page *page;
4138        struct extent_map *em_cached = NULL;
4139        int nr = 0;
4140        u64 prev_em_start = (u64)-1;
4141
4142        for (page_idx = 0; page_idx < nr_pages; page_idx++) {
4143                page = list_entry(pages->prev, struct page, lru);
4144
4145                prefetchw(&page->flags);
4146                list_del(&page->lru);
4147                if (add_to_page_cache_lru(page, mapping,
4148                                        page->index,
4149                                        readahead_gfp_mask(mapping))) {
4150                        put_page(page);
4151                        continue;
4152                }
4153
4154                pagepool[nr++] = page;
4155                if (nr < ARRAY_SIZE(pagepool))
4156                        continue;
4157                __extent_readpages(tree, pagepool, nr, &em_cached, &bio,
4158                                &bio_flags, &prev_em_start);
4159                nr = 0;
4160        }
4161        if (nr)
4162                __extent_readpages(tree, pagepool, nr, &em_cached, &bio,
4163                                &bio_flags, &prev_em_start);
4164
4165        if (em_cached)
4166                free_extent_map(em_cached);
4167
4168        BUG_ON(!list_empty(pages));
4169        if (bio)
4170                return submit_one_bio(bio, 0, bio_flags);
4171        return 0;
4172}
4173
4174/*
4175 * basic invalidatepage code, this waits on any locked or writeback
4176 * ranges corresponding to the page, and then deletes any extent state
4177 * records from the tree
4178 */
4179int extent_invalidatepage(struct extent_io_tree *tree,
4180                          struct page *page, unsigned long offset)
4181{
4182        struct extent_state *cached_state = NULL;
4183        u64 start = page_offset(page);
4184        u64 end = start + PAGE_SIZE - 1;
4185        size_t blocksize = page->mapping->host->i_sb->s_blocksize;
4186
4187        start += ALIGN(offset, blocksize);
4188        if (start > end)
4189                return 0;
4190
4191        lock_extent_bits(tree, start, end, &cached_state);
4192        wait_on_page_writeback(page);
4193        clear_extent_bit(tree, start, end,
4194                         EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4195                         EXTENT_DO_ACCOUNTING,
4196                         1, 1, &cached_state);
4197        return 0;
4198}
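
    /*
     * Worked example of the alignment above, assuming a 4K block size:
     * invalidating from offset 1000 gives ALIGN(1000, 4096) == 4096, so
     * 'start' advances past the partially-retained first block and only the
     * state covering the fully invalidated blocks is cleared.  With
     * offset == 0 the whole page range [start, end] is cleared.
     */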
4199
4200/*
4201 * a helper for releasepage; this tests for areas of the page that
4202 * are locked or under IO and drops the related state bits if it is safe
4203 * to drop the page.
4204 */
4205static int try_release_extent_state(struct extent_map_tree *map,
4206                                    struct extent_io_tree *tree,
4207                                    struct page *page, gfp_t mask)
4208{
4209        u64 start = page_offset(page);
4210        u64 end = start + PAGE_SIZE - 1;
4211        int ret = 1;
4212
4213        if (test_range_bit(tree, start, end,
4214                           EXTENT_IOBITS, 0, NULL))
4215                ret = 0;
4216        else {
4217                /*
4218                 * at this point we can safely clear everything except the
4219                 * locked bit and the nodatasum bit
4220                 */
4221                ret = __clear_extent_bit(tree, start, end,
4222                                 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
4223                                 0, 0, NULL, mask, NULL);
4224
4225                /* if clear_extent_bit failed for enomem reasons,
4226                 * we can't allow the release to continue.
4227                 */
4228                if (ret < 0)
4229                        ret = 0;
4230                else
4231                        ret = 1;
4232        }
4233        return ret;
4234}
4235
4236/*
4237 * a helper for releasepage.  As long as there are no locked extents
4238 * in the range corresponding to the page, both state records and extent
4239 * map records are removed
4240 */
4241int try_release_extent_mapping(struct extent_map_tree *map,
4242                               struct extent_io_tree *tree, struct page *page,
4243                               gfp_t mask)
4244{
4245        struct extent_map *em;
4246        u64 start = page_offset(page);
4247        u64 end = start + PAGE_SIZE - 1;
4248
4249        if (gfpflags_allow_blocking(mask) &&
4250            page->mapping->host->i_size > SZ_16M) {
4251                u64 len;
4252                while (start <= end) {
4253                        len = end - start + 1;
4254                        write_lock(&map->lock);
4255                        em = lookup_extent_mapping(map, start, len);
4256                        if (!em) {
4257                                write_unlock(&map->lock);
4258                                break;
4259                        }
4260                        if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
4261                            em->start != start) {
4262                                write_unlock(&map->lock);
4263                                free_extent_map(em);
4264                                break;
4265                        }
4266                        if (!test_range_bit(tree, em->start,
4267                                            extent_map_end(em) - 1,
4268                                            EXTENT_LOCKED | EXTENT_WRITEBACK,
4269                                            0, NULL)) {
4270                                remove_extent_mapping(map, em);
4271                                /* once for the rb tree */
4272                                free_extent_map(em);
4273                        }
4274                        start = extent_map_end(em);
4275                        write_unlock(&map->lock);
4276
4277                        /* once for us */
4278                        free_extent_map(em);
4279                }
4280        }
4281        return try_release_extent_state(map, tree, page, mask);
4282}
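
    /*
     * A minimal sketch of how a ->releasepage hook might drive the helper
     * above.  The wrapper below is hypothetical and only illustrates the
     * calling convention; the real data and btree hooks add more checks:
     *
     *        static int example_releasepage(struct page *page, gfp_t gfp_flags)
     *        {
     *                struct inode *inode = page->mapping->host;
     *
     *                return try_release_extent_mapping(&BTRFS_I(inode)->extent_tree,
     *                                                  &BTRFS_I(inode)->io_tree,
     *                                                  page, gfp_flags);
     *        }
     *
     * A non-zero return means the extent state and extent map records for the
     * page were dropped and it is safe for the VM to release the page.
     */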
4283
4284/*
4285 * helper function for fiemap, which doesn't want to see any holes.
4286 * This maps until we find something past 'last'
4287 */
4288static struct extent_map *get_extent_skip_holes(struct inode *inode,
4289                                                u64 offset, u64 last)
4290{
4291        u64 sectorsize = btrfs_inode_sectorsize(inode);
4292        struct extent_map *em;
4293        u64 len;
4294
4295        if (offset >= last)
4296                return NULL;
4297
4298        while (1) {
4299                len = last - offset;
4300                if (len == 0)
4301                        break;
4302                len = ALIGN(len, sectorsize);
4303                em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset,
4304                                len, 0);
4305                if (IS_ERR_OR_NULL(em))
4306                        return em;
4307
4308                /* if this isn't a hole return it */
4309                if (em->block_start != EXTENT_MAP_HOLE)
4310                        return em;
4311
4312                /* this is a hole, advance to the next extent */
4313                offset = extent_map_end(em);
4314                free_extent_map(em);
4315                if (offset >= last)
4316                        break;
4317        }
4318        return NULL;
4319}
4320
4321/*
4322 * Cache for the previous fiemap extent.
4323 *
4324 * Used to merge adjacent fiemap extents before they are submitted.
4325 */
4326struct fiemap_cache {
4327        u64 offset;
4328        u64 phys;
4329        u64 len;
4330        u32 flags;
4331        bool cached;
4332};
4333
4334/*
4335 * Helper to submit fiemap extent.
4336 *
4337 * Will try to merge the current fiemap extent, specified by @offset, @phys,
4338 * @len and @flags, with the cached one.
4339 * Only when the merge fails is the cached extent submitted as a
4340 * fiemap extent.
4341 *
4342 * Return value is the same as fiemap_fill_next_extent().
4343 */
4344static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
4345                                struct fiemap_cache *cache,
4346                                u64 offset, u64 phys, u64 len, u32 flags)
4347{
4348        int ret = 0;
4349
4350        if (!cache->cached)
4351                goto assign;
4352
4353        /*
4354         * Sanity check, extent_fiemap() should have ensured that the new
4355         * fiemap extent won't overlap with the cached one.
4356         * Not recoverable.
4357         *
4358         * NOTE: Physical address can overlap, due to compression
4359         */
4360        if (cache->offset + cache->len > offset) {
4361                WARN_ON(1);
4362                return -EINVAL;
4363        }
4364
4365        /*
4366         * Only merge fiemap extents if
4367         * 1) Their logical addresses are contiguous
4368         *
4369         * 2) Their physical addresses are contiguous
4370         *    So truly compressed (physical size smaller than logical size)
4371         *    extents won't get merged with each other
4372         *
4373         * 3) They share the same flags except FIEMAP_EXTENT_LAST
4374         *    So a regular extent won't get merged with a prealloc extent
4375         */
4376        if (cache->offset + cache->len  == offset &&
4377            cache->phys + cache->len == phys  &&
4378            (cache->flags & ~FIEMAP_EXTENT_LAST) ==
4379                        (flags & ~FIEMAP_EXTENT_LAST)) {
4380                cache->len += len;
4381                cache->flags |= flags;
4382                goto try_submit_last;
4383        }
4384
4385        /* Not mergeable, need to submit cached one */
4386        ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
4387                                      cache->len, cache->flags);
4388        cache->cached = false;
4389        if (ret)
4390                return ret;
4391assign:
4392        cache->cached = true;
4393        cache->offset = offset;
4394        cache->phys = phys;
4395        cache->len = len;
4396        cache->flags = flags;
4397try_submit_last:
4398        if (cache->flags & FIEMAP_EXTENT_LAST) {
4399                ret = fiemap_fill_next_extent(fieinfo, cache->offset,
4400                                cache->phys, cache->len, cache->flags);
4401                cache->cached = false;
4402        }
4403        return ret;
4404}
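
    /*
     * Illustration of the merge logic above with made-up 4K extents:
     *
     *        struct fiemap_cache cache = { 0 };
     *
     *        emit_fiemap_extent(fieinfo, &cache, 0, 40960, 4096, 0);
     *        emit_fiemap_extent(fieinfo, &cache, 4096, 45056, 4096, 0);
     *
     * The second call sees cache->offset + cache->len == 4096 == @offset and
     * cache->phys + cache->len == 45056 == @phys with identical flags, so the
     * two extents are merged into one cached entry of length 8192 and nothing
     * is submitted yet.  A later extent that is not contiguous (or carries
     * different flags) first submits the cached entry through
     * fiemap_fill_next_extent() and then becomes the new cached entry.
     */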
4405
4406/*
4407 * Emit last fiemap cache
4408 *
4409 * The last fiemap extent may still be left cached in the following case:
4410 * 0                  4k                    8k
4411 * |<- Fiemap range ->|
4412 * |<------------  First extent ----------->|
4413 *
4414 * In this case, the first extent range will be cached but not emitted.
4415 * So we must emit it before ending extent_fiemap().
4416 */
4417static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info,
4418                                  struct fiemap_extent_info *fieinfo,
4419                                  struct fiemap_cache *cache)
4420{
4421        int ret;
4422
4423        if (!cache->cached)
4424                return 0;
4425
4426        ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
4427                                      cache->len, cache->flags);
4428        cache->cached = false;
4429        if (ret > 0)
4430                ret = 0;
4431        return ret;
4432}
4433
4434int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4435                __u64 start, __u64 len)
4436{
4437        int ret = 0;
4438        u64 off = start;
4439        u64 max = start + len;
4440        u32 flags = 0;
4441        u32 found_type;
4442        u64 last;
4443        u64 last_for_get_extent = 0;
4444        u64 disko = 0;
4445        u64 isize = i_size_read(inode);
4446        struct btrfs_key found_key;
4447        struct extent_map *em = NULL;
4448        struct extent_state *cached_state = NULL;
4449        struct btrfs_path *path;
4450        struct btrfs_root *root = BTRFS_I(inode)->root;
4451        struct fiemap_cache cache = { 0 };
4452        int end = 0;
4453        u64 em_start = 0;
4454        u64 em_len = 0;
4455        u64 em_end = 0;
4456
4457        if (len == 0)
4458                return -EINVAL;
4459
4460        path = btrfs_alloc_path();
4461        if (!path)
4462                return -ENOMEM;
4463        path->leave_spinning = 1;
4464
4465        start = round_down(start, btrfs_inode_sectorsize(inode));
4466        len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
4467
4468        /*
4469         * lookup the last file extent.  We're not using i_size here
4470         * because there might be preallocation past i_size
4471         */
4472        ret = btrfs_lookup_file_extent(NULL, root, path,
4473                        btrfs_ino(BTRFS_I(inode)), -1, 0);
4474        if (ret < 0) {
4475                btrfs_free_path(path);
4476                return ret;
4477        } else {
4478                WARN_ON(!ret);
4479                if (ret == 1)
4480                        ret = 0;
4481        }
4482
4483        path->slots[0]--;
4484        btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
4485        found_type = found_key.type;
4486
4487        /* No extents, but there might be delalloc bits */
4488        if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) ||
4489            found_type != BTRFS_EXTENT_DATA_KEY) {
4490                /* have to trust i_size as the end */
4491                last = (u64)-1;
4492                last_for_get_extent = isize;
4493        } else {
4494                /*
4495                 * remember the start of the last extent.  There are a
4496                 * bunch of different factors that go into the length of the
4497                 * extent, so it's much less complex to remember where it started
4498                 */
4499                last = found_key.offset;
4500                last_for_get_extent = last + 1;
4501        }
4502        btrfs_release_path(path);
4503
4504        /*
4505         * we might have some extents allocated but more delalloc past those
4506         * extents.  So we trust isize unless the start of the last extent is
4507         * beyond isize
4508         */
4509        if (last < isize) {
4510                last = (u64)-1;
4511                last_for_get_extent = isize;
4512        }
4513
4514        lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
4515                         &cached_state);
4516
4517        em = get_extent_skip_holes(inode, start, last_for_get_extent);
4518        if (!em)
4519                goto out;
4520        if (IS_ERR(em)) {
4521                ret = PTR_ERR(em);
4522                goto out;
4523        }
4524
4525        while (!end) {
4526                u64 offset_in_extent = 0;
4527
4528                /* break if the extent we found is outside the range */
4529                if (em->start >= max || extent_map_end(em) < off)
4530                        break;
4531
4532                /*
4533                 * get_extent may return an extent that starts before our
4534                 * requested range.  We have to make sure the ranges
4535                 * we return to fiemap always move forward and don't
4536                 * overlap, so adjust the offsets here
4537                 */
4538                em_start = max(em->start, off);
4539
4540                /*
4541                 * record the offset from the start of the extent
4542                 * for adjusting the disk offset below.  Only do this if the
4543                 * extent isn't compressed since our in-memory offset may be past
4544                 * what we have actually allocated on disk.
4545                 */
4546                if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4547                        offset_in_extent = em_start - em->start;
4548                em_end = extent_map_end(em);
4549                em_len = em_end - em_start;
4550                disko = 0;
4551                flags = 0;
4552
4553                /*
4554                 * bump off for our next call to get_extent
4555                 */
4556                off = extent_map_end(em);
4557                if (off >= max)
4558                        end = 1;
4559
4560                if (em->block_start == EXTENT_MAP_LAST_BYTE) {
4561                        end = 1;
4562                        flags |= FIEMAP_EXTENT_LAST;
4563                } else if (em->block_start == EXTENT_MAP_INLINE) {
4564                        flags |= (FIEMAP_EXTENT_DATA_INLINE |
4565                                  FIEMAP_EXTENT_NOT_ALIGNED);
4566                } else if (em->block_start == EXTENT_MAP_DELALLOC) {
4567                        flags |= (FIEMAP_EXTENT_DELALLOC |
4568                                  FIEMAP_EXTENT_UNKNOWN);
4569                } else if (fieinfo->fi_extents_max) {
4570                        u64 bytenr = em->block_start -
4571                                (em->start - em->orig_start);
4572
4573                        disko = em->block_start + offset_in_extent;
4574
4575                        /*
4576                         * As btrfs supports shared space, this information
4577                         * can be exported to userspace tools via
4578                         * flag FIEMAP_EXTENT_SHARED.  If fi_extents_max == 0
4579                         * then we're just getting a count and we can skip the
4580                         * lookup stuff.
4581                         */
4582                        ret = btrfs_check_shared(root,
4583                                                 btrfs_ino(BTRFS_I(inode)),
4584                                                 bytenr);
4585                        if (ret < 0)
4586                                goto out_free;
4587                        if (ret)
4588                                flags |= FIEMAP_EXTENT_SHARED;
4589                        ret = 0;
4590                }
4591                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4592                        flags |= FIEMAP_EXTENT_ENCODED;
4593                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4594                        flags |= FIEMAP_EXTENT_UNWRITTEN;
4595
4596                free_extent_map(em);
4597                em = NULL;
4598                if ((em_start >= last) || em_len == (u64)-1 ||
4599                   (last == (u64)-1 && isize <= em_end)) {
4600                        flags |= FIEMAP_EXTENT_LAST;
4601                        end = 1;
4602                }
4603
4604                /* now scan forward to see if this is really the last extent. */
4605                em = get_extent_skip_holes(inode, off, last_for_get_extent);
4606                if (IS_ERR(em)) {
4607                        ret = PTR_ERR(em);
4608                        goto out;
4609                }
4610                if (!em) {
4611                        flags |= FIEMAP_EXTENT_LAST;
4612                        end = 1;
4613                }
4614                ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
4615                                           em_len, flags);
4616                if (ret) {
4617                        if (ret == 1)
4618                                ret = 0;
4619                        goto out_free;
4620                }
4621        }
4622out_free:
4623        if (!ret)
4624                ret = emit_last_fiemap_cache(root->fs_info, fieinfo, &cache);
4625        free_extent_map(em);
4626out:
4627        btrfs_free_path(path);
4628        unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
4629                             &cached_state);
4630        return ret;
4631}
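
    /*
     * extent_fiemap() is reached through the generic FS_IOC_FIEMAP ioctl.  A
     * minimal user space sketch of a query (error handling omitted, 'fd' is
     * assumed to be an open file on btrfs):
     *
     *        struct fiemap *fm;
     *
     *        fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
     *        fm->fm_start = 0;
     *        fm->fm_length = FIEMAP_MAX_OFFSET;
     *        fm->fm_extent_count = 32;
     *        ioctl(fd, FS_IOC_FIEMAP, fm);
     *
     * On return, fm->fm_mapped_extents entries of fm->fm_extents[] describe
     * the file's extents, with flags such as FIEMAP_EXTENT_SHARED or
     * FIEMAP_EXTENT_ENCODED filled in by the loop above.
     */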
4632
4633static void __free_extent_buffer(struct extent_buffer *eb)
4634{
4635        btrfs_leak_debug_del(&eb->leak_list);
4636        kmem_cache_free(extent_buffer_cache, eb);
4637}
4638
4639int extent_buffer_under_io(struct extent_buffer *eb)
4640{
4641        return (atomic_read(&eb->io_pages) ||
4642                test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
4643                test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4644}
4645
4646/*
4647 * Helper for releasing the extent buffer's pages.
4648 */
4649static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
4650{
4651        unsigned long index;
4652        struct page *page;
4653        int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4654
4655        BUG_ON(extent_buffer_under_io(eb));
4656
4657        index = num_extent_pages(eb->start, eb->len);
4658        if (index == 0)
4659                return;
4660
4661        do {
4662                index--;
4663                page = eb->pages[index];
4664                if (!page)
4665                        continue;
4666                if (mapped)
4667                        spin_lock(&page->mapping->private_lock);
4668                /*
4669                 * We do this since we'll remove the pages after we've
4670                 * removed the eb from the radix tree, so we could race
4671                 * and have this page now attached to the new eb.  So
4672                 * only clear page_private if it's still connected to
4673                 * this eb.
4674                 */
4675                if (PagePrivate(page) &&
4676                    page->private == (unsigned long)eb) {
4677                        BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4678                        BUG_ON(PageDirty(page));
4679                        BUG_ON(PageWriteback(page));
4680                        /*
4681                         * We need to make sure we haven't been attached
4682                         * to a new eb.
4683                         */
4684                        ClearPagePrivate(page);
4685                        set_page_private(page, 0);
4686                        /* One for the page private */
4687                        put_page(page);
4688                }
4689
4690                if (mapped)
4691                        spin_unlock(&page->mapping->private_lock);
4692
4693                /* One for when we allocated the page */
4694                put_page(page);
4695        } while (index != 0);
4696}
4697
4698/*
4699 * Helper for releasing the extent buffer.
4700 */
4701static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4702{
4703        btrfs_release_extent_buffer_page(eb);
4704        __free_extent_buffer(eb);
4705}
4706
4707static struct extent_buffer *
4708__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
4709                      unsigned long len)
4710{
4711        struct extent_buffer *eb = NULL;
4712
4713        eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
4714        eb->start = start;
4715        eb->len = len;
4716        eb->fs_info = fs_info;
4717        eb->bflags = 0;
4718        rwlock_init(&eb->lock);
4719        atomic_set(&eb->write_locks, 0);
4720        atomic_set(&eb->read_locks, 0);
4721        atomic_set(&eb->blocking_readers, 0);
4722        atomic_set(&eb->blocking_writers, 0);
4723        atomic_set(&eb->spinning_readers, 0);
4724        atomic_set(&eb->spinning_writers, 0);
4725        eb->lock_nested = 0;
4726        init_waitqueue_head(&eb->write_lock_wq);
4727        init_waitqueue_head(&eb->read_lock_wq);
4728
4729        btrfs_leak_debug_add(&eb->leak_list, &buffers);
4730
4731        spin_lock_init(&eb->refs_lock);
4732        atomic_set(&eb->refs, 1);
4733        atomic_set(&eb->io_pages, 0);
4734
4735        /*
4736         * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
4737         */
4738        BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
4739                > MAX_INLINE_EXTENT_BUFFER_SIZE);
4740        BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
4741
4742        return eb;
4743}
4744
4745struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
4746{
4747        unsigned long i;
4748        struct page *p;
4749        struct extent_buffer *new;
4750        unsigned long num_pages = num_extent_pages(src->start, src->len);
4751
4752        new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
4753        if (new == NULL)
4754                return NULL;
4755
4756        for (i = 0; i < num_pages; i++) {
4757                p = alloc_page(GFP_NOFS);
4758                if (!p) {
4759                        btrfs_release_extent_buffer(new);
4760                        return NULL;
4761                }
4762                attach_extent_buffer_page(new, p);
4763                WARN_ON(PageDirty(p));
4764                SetPageUptodate(p);
4765                new->pages[i] = p;
4766                copy_page(page_address(p), page_address(src->pages[i]));
4767        }
4768
4769        set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
4770        set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
4771
4772        return new;
4773}
4774
4775struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
4776                                                  u64 start, unsigned long len)
4777{
4778        struct extent_buffer *eb;
4779        unsigned long num_pages;
4780        unsigned long i;
4781
4782        num_pages = num_extent_pages(start, len);
4783
4784        eb = __alloc_extent_buffer(fs_info, start, len);
4785        if (!eb)
4786                return NULL;
4787
4788        for (i = 0; i < num_pages; i++) {
4789                eb->pages[i] = alloc_page(GFP_NOFS);
4790                if (!eb->pages[i])
4791                        goto err;
4792        }
4793        set_extent_buffer_uptodate(eb);
4794        btrfs_set_header_nritems(eb, 0);
4795        set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4796
4797        return eb;
4798err:
4799        for (; i > 0; i--)
4800                __free_page(eb->pages[i - 1]);
4801        __free_extent_buffer(eb);
4802        return NULL;
4803}
4804
4805struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
4806                                                u64 start)
4807{
4808        return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
4809}
4810
4811static void check_buffer_tree_ref(struct extent_buffer *eb)
4812{
4813        int refs;
4814        /* the ref bit is tricky.  We have to make sure it is set
4815         * if we have the buffer dirty.   Otherwise the
4816         * code to free a buffer can end up dropping a dirty
4817         * page
4818         *
4819         * Once the ref bit is set, it won't go away while the
4820         * buffer is dirty or in writeback, and it also won't
4821         * go away while we have the reference count on the
4822         * eb bumped.
4823         *
4824         * We can't just set the ref bit without bumping the
4825         * ref on the eb because free_extent_buffer might
4826         * see the ref bit and try to clear it.  If this happens
4827         * free_extent_buffer might end up dropping our original
4828         * ref by mistake and freeing the page before we are able
4829         * to add one more ref.
4830         *
4831         * So bump the ref count first, then set the bit.  If someone
4832         * beat us to it, drop the ref we added.
4833         */
4834        refs = atomic_read(&eb->refs);
4835        if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4836                return;
4837
4838        spin_lock(&eb->refs_lock);
4839        if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4840                atomic_inc(&eb->refs);
4841        spin_unlock(&eb->refs_lock);
4842}
4843
4844static void mark_extent_buffer_accessed(struct extent_buffer *eb,
4845                struct page *accessed)
4846{
4847        unsigned long num_pages, i;
4848
4849        check_buffer_tree_ref(eb);
4850
4851        num_pages = num_extent_pages(eb->start, eb->len);
4852        for (i = 0; i < num_pages; i++) {
4853                struct page *p = eb->pages[i];
4854
4855                if (p != accessed)
4856                        mark_page_accessed(p);
4857        }
4858}
4859
4860struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
4861                                         u64 start)
4862{
4863        struct extent_buffer *eb;
4864
4865        rcu_read_lock();
4866        eb = radix_tree_lookup(&fs_info->buffer_radix,
4867                               start >> PAGE_SHIFT);
4868        if (eb && atomic_inc_not_zero(&eb->refs)) {
4869                rcu_read_unlock();
4870                /*
4871                 * Lock our eb's refs_lock to avoid races with
4872                 * free_extent_buffer. When we get our eb it might be flagged
4873                 * with EXTENT_BUFFER_STALE and another task running
4874                 * free_extent_buffer might have seen that flag set,
4875                 * eb->refs == 2, that the buffer isn't under IO (dirty and
4876                 * writeback flags not set) and it's still in the tree (flag
4877                 * EXTENT_BUFFER_TREE_REF set), therefore being in the process
4878                 * of decrementing the extent buffer's reference count twice.
4879                 * So here we could race and increment the eb's reference count,
4880                 * clear its stale flag, mark it as dirty and drop our reference
4881                 * before the other task finishes executing free_extent_buffer,
4882                 * which would later result in an attempt to free an extent
4883                 * buffer that is dirty.
4884                 */
4885                if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
4886                        spin_lock(&eb->refs_lock);
4887                        spin_unlock(&eb->refs_lock);
4888                }
4889                mark_extent_buffer_accessed(eb, NULL);
4890                return eb;
4891        }
4892        rcu_read_unlock();
4893
4894        return NULL;
4895}
4896
4897#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4898struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
4899                                        u64 start)
4900{
4901        struct extent_buffer *eb, *exists = NULL;
4902        int ret;
4903
4904        eb = find_extent_buffer(fs_info, start);
4905        if (eb)
4906                return eb;
4907        eb = alloc_dummy_extent_buffer(fs_info, start);
4908        if (!eb)
4909                return NULL;
4910        eb->fs_info = fs_info;
4911again:
4912        ret = radix_tree_preload(GFP_NOFS);
4913        if (ret)
4914                goto free_eb;
4915        spin_lock(&fs_info->buffer_lock);
4916        ret = radix_tree_insert(&fs_info->buffer_radix,
4917                                start >> PAGE_SHIFT, eb);
4918        spin_unlock(&fs_info->buffer_lock);
4919        radix_tree_preload_end();
4920        if (ret == -EEXIST) {
4921                exists = find_extent_buffer(fs_info, start);
4922                if (exists)
4923                        goto free_eb;
4924                else
4925                        goto again;
4926        }
4927        check_buffer_tree_ref(eb);
4928        set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
4929
4930        /*
4931         * We will free dummy extent buffers if they come into
4932         * free_extent_buffer with a ref count of 2, but if we are using this we
4933         * want the buffers to stay in memory until we're done with them, so
4934         * bump the ref count again.
4935         */
4936        atomic_inc(&eb->refs);
4937        return eb;
4938free_eb:
4939        btrfs_release_extent_buffer(eb);
4940        return exists;
4941}
4942#endif
4943
4944struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
4945                                          u64 start)
4946{
4947        unsigned long len = fs_info->nodesize;
4948        unsigned long num_pages = num_extent_pages(start, len);
4949        unsigned long i;
4950        unsigned long index = start >> PAGE_SHIFT;
4951        struct extent_buffer *eb;
4952        struct extent_buffer *exists = NULL;
4953        struct page *p;
4954        struct address_space *mapping = fs_info->btree_inode->i_mapping;
4955        int uptodate = 1;
4956        int ret;
4957
4958        if (!IS_ALIGNED(start, fs_info->sectorsize)) {
4959                btrfs_err(fs_info, "bad tree block start %llu", start);
4960                return ERR_PTR(-EINVAL);
4961        }
4962
4963        eb = find_extent_buffer(fs_info, start);
4964        if (eb)
4965                return eb;
4966
4967        eb = __alloc_extent_buffer(fs_info, start, len);
4968        if (!eb)
4969                return ERR_PTR(-ENOMEM);
4970
4971        for (i = 0; i < num_pages; i++, index++) {
4972                p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
4973                if (!p) {
4974                        exists = ERR_PTR(-ENOMEM);
4975                        goto free_eb;
4976                }
4977
4978                spin_lock(&mapping->private_lock);
4979                if (PagePrivate(p)) {
4980                        /*
4981                         * We could have already allocated an eb for this page
4982                         * and attached one, so let's see if we can get a ref on
4983                         * the existing eb, and if we can we know it's good and
4984                         * we can just return that one, else we know we can just
4985                         * overwrite page->private.
4986                         */
4987                        exists = (struct extent_buffer *)p->private;
4988                        if (atomic_inc_not_zero(&exists->refs)) {
4989                                spin_unlock(&mapping->private_lock);
4990                                unlock_page(p);
4991                                put_page(p);
4992                                mark_extent_buffer_accessed(exists, p);
4993                                goto free_eb;
4994                        }
4995                        exists = NULL;
4996
4997                        /*
4998                         * Do this so attach doesn't complain, and to drop
4999                         * the ref the previous owner of the page had.
5000                         */
5001                        ClearPagePrivate(p);
5002                        WARN_ON(PageDirty(p));
5003                        put_page(p);
5004                }
5005                attach_extent_buffer_page(eb, p);
5006                spin_unlock(&mapping->private_lock);
5007                WARN_ON(PageDirty(p));
5008                eb->pages[i] = p;
5009                if (!PageUptodate(p))
5010                        uptodate = 0;
5011
5012                /*
5013                 * see below about how we avoid a nasty race with releasepage
5014                 * and why we unlock later
5015                 */
5016        }
5017        if (uptodate)
5018                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5019again:
5020        ret = radix_tree_preload(GFP_NOFS);
5021        if (ret) {
5022                exists = ERR_PTR(ret);
5023                goto free_eb;
5024        }
5025
5026        spin_lock(&fs_info->buffer_lock);
5027        ret = radix_tree_insert(&fs_info->buffer_radix,
5028                                start >> PAGE_SHIFT, eb);
5029        spin_unlock(&fs_info->buffer_lock);
5030        radix_tree_preload_end();
5031        if (ret == -EEXIST) {
5032                exists = find_extent_buffer(fs_info, start);
5033                if (exists)
5034                        goto free_eb;
5035                else
5036                        goto again;
5037        }
5038        /* add one reference for the tree */
5039        check_buffer_tree_ref(eb);
5040        set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
5041
5042        /*
5043         * there is a race where releasepage may have
5044         * tried to find this extent buffer in the radix
5045         * but failed.  It will tell the VM it is safe to
5046         * reclaim the page, and it will clear the page private bit.
5047         * We must make sure to set the page private bit properly
5048         * after the extent buffer is in the radix tree so
5049         * it doesn't get lost
5050         */
5051        SetPageChecked(eb->pages[0]);
5052        for (i = 1; i < num_pages; i++) {
5053                p = eb->pages[i];
5054                ClearPageChecked(p);
5055                unlock_page(p);
5056        }
5057        unlock_page(eb->pages[0]);
5058        return eb;
5059
5060free_eb:
5061        WARN_ON(!atomic_dec_and_test(&eb->refs));
5062        for (i = 0; i < num_pages; i++) {
5063                if (eb->pages[i])
5064                        unlock_page(eb->pages[i]);
5065        }
5066
5067        btrfs_release_extent_buffer(eb);
5068        return exists;
5069}
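
    /*
     * A rough sketch of how the metadata read path strings the buffer helpers
     * together (loosely modelled on the btree reading code; 'bytenr' and
     * 'mirror_num' are assumed to be supplied by the caller):
     *
     *        struct extent_io_tree *io_tree =
     *                &BTRFS_I(fs_info->btree_inode)->io_tree;
     *        struct extent_buffer *eb;
     *        int ret;
     *
     *        eb = alloc_extent_buffer(fs_info, bytenr);
     *        if (IS_ERR(eb))
     *                return PTR_ERR(eb);
     *        ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE, mirror_num);
     *        if (ret) {
     *                free_extent_buffer(eb);
     *                return ret;
     *        }
     *        ... use read_extent_buffer()/write_extent_buffer() on eb ...
     *        free_extent_buffer(eb);
     */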
5070
5071static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
5072{
5073        struct extent_buffer *eb =
5074                        container_of(head, struct extent_buffer, rcu_head);
5075
5076        __free_extent_buffer(eb);
5077}
5078
5079/* Expects to have eb->eb_lock already held */
5080static int release_extent_buffer(struct extent_buffer *eb)
5081{
5082        WARN_ON(atomic_read(&eb->refs) == 0);
5083        if (atomic_dec_and_test(&eb->refs)) {
5084                if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
5085                        struct btrfs_fs_info *fs_info = eb->fs_info;
5086
5087                        spin_unlock(&eb->refs_lock);
5088
5089                        spin_lock(&fs_info->buffer_lock);
5090                        radix_tree_delete(&fs_info->buffer_radix,
5091                                          eb->start >> PAGE_SHIFT);
5092                        spin_unlock(&fs_info->buffer_lock);
5093                } else {
5094                        spin_unlock(&eb->refs_lock);
5095                }
5096
5097                /* Should be safe to release our pages at this point */
5098                btrfs_release_extent_buffer_page(eb);
5099#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5100                if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) {
5101                        __free_extent_buffer(eb);
5102                        return 1;
5103                }
5104#endif
5105                call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
5106                return 1;
5107        }
5108        spin_unlock(&eb->refs_lock);
5109
5110        return 0;
5111}
5112
5113void free_extent_buffer(struct extent_buffer *eb)
5114{
5115        int refs;
5116        int old;
5117        if (!eb)
5118                return;
5119
5120        while (1) {
5121                refs = atomic_read(&eb->refs);
5122                if (refs <= 3)
5123                        break;
5124                old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
5125                if (old == refs)
5126                        return;
5127        }
5128
5129        spin_lock(&eb->refs_lock);
5130        if (atomic_read(&eb->refs) == 2 &&
5131            test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
5132                atomic_dec(&eb->refs);
5133
5134        if (atomic_read(&eb->refs) == 2 &&
5135            test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
5136            !extent_buffer_under_io(eb) &&
5137            test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5138                atomic_dec(&eb->refs);
5139
5140        /*
5141         * I know this is terrible, but it's temporary until we stop tracking
5142         * the uptodate bits and such for the extent buffers.
5143         */
5144        release_extent_buffer(eb);
5145}
5146
5147void free_extent_buffer_stale(struct extent_buffer *eb)
5148{
5149        if (!eb)
5150                return;
5151
5152        spin_lock(&eb->refs_lock);
5153        set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
5154
5155        if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
5156            test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5157                atomic_dec(&eb->refs);
5158        release_extent_buffer(eb);
5159}
5160
5161void clear_extent_buffer_dirty(struct extent_buffer *eb)
5162{
5163        unsigned long i;
5164        unsigned long num_pages;
5165        struct page *page;
5166
5167        num_pages = num_extent_pages(eb->start, eb->len);
5168
5169        for (i = 0; i < num_pages; i++) {
5170                page = eb->pages[i];
5171                if (!PageDirty(page))
5172                        continue;
5173
5174                lock_page(page);
5175                WARN_ON(!PagePrivate(page));
5176
5177                clear_page_dirty_for_io(page);
5178                xa_lock_irq(&page->mapping->i_pages);
5179                if (!PageDirty(page)) {
5180                        radix_tree_tag_clear(&page->mapping->i_pages,
5181                                                page_index(page),
5182                                                PAGECACHE_TAG_DIRTY);
5183                }
5184                xa_unlock_irq(&page->mapping->i_pages);
5185                ClearPageError(page);
5186                unlock_page(page);
5187        }
5188        WARN_ON(atomic_read(&eb->refs) == 0);
5189}
5190
5191int set_extent_buffer_dirty(struct extent_buffer *eb)
5192{
5193        unsigned long i;
5194        unsigned long num_pages;
5195        int was_dirty = 0;
5196
5197        check_buffer_tree_ref(eb);
5198
5199        was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
5200
5201        num_pages = num_extent_pages(eb->start, eb->len);
5202        WARN_ON(atomic_read(&eb->refs) == 0);
5203        WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
5204
5205        for (i = 0; i < num_pages; i++)
5206                set_page_dirty(eb->pages[i]);
5207        return was_dirty;
5208}
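
    /*
     * Typical usage after modifying a tree block, sketched here (callers
     * normally go through higher level helpers rather than calling this
     * directly):
     *
     *        write_extent_buffer(eb, &disk_key, offset, sizeof(disk_key));
     *        was_dirty = set_extent_buffer_dirty(eb);
     *
     * The return value reports whether the buffer was already dirty, which
     * lets callers avoid counting the same buffer twice in their dirty
     * metadata accounting.
     */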
5209
5210void clear_extent_buffer_uptodate(struct extent_buffer *eb)
5211{
5212        unsigned long i;
5213        struct page *page;
5214        unsigned long num_pages;
5215
5216        clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5217        num_pages = num_extent_pages(eb->start, eb->len);
5218        for (i = 0; i < num_pages; i++) {
5219                page = eb->pages[i];
5220                if (page)
5221                        ClearPageUptodate(page);
5222        }
5223}
5224
5225void set_extent_buffer_uptodate(struct extent_buffer *eb)
5226{
5227        unsigned long i;
5228        struct page *page;
5229        unsigned long num_pages;
5230
5231        set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5232        num_pages = num_extent_pages(eb->start, eb->len);
5233        for (i = 0; i < num_pages; i++) {
5234                page = eb->pages[i];
5235                SetPageUptodate(page);
5236        }
5237}
5238
5239int read_extent_buffer_pages(struct extent_io_tree *tree,
5240                             struct extent_buffer *eb, int wait, int mirror_num)
5241{
5242        unsigned long i;
5243        struct page *page;
5244        int err;
5245        int ret = 0;
5246        int locked_pages = 0;
5247        int all_uptodate = 1;
5248        unsigned long num_pages;
5249        unsigned long num_reads = 0;
5250        struct bio *bio = NULL;
5251        unsigned long bio_flags = 0;
5252
5253        if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
5254                return 0;
5255
5256        num_pages = num_extent_pages(eb->start, eb->len);
5257        for (i = 0; i < num_pages; i++) {
5258                page = eb->pages[i];
5259                if (wait == WAIT_NONE) {
5260                        if (!trylock_page(page))
5261                                goto unlock_exit;
5262                } else {
5263                        lock_page(page);
5264                }
5265                locked_pages++;
5266        }
5267        /*
5268         * We need to lock all pages first to make sure that
5269         * the uptodate bit of our pages won't be affected by
5270         * clear_extent_buffer_uptodate().
5271         */
5272        for (i = 0; i < num_pages; i++) {
5273                page = eb->pages[i];
5274                if (!PageUptodate(page)) {
5275                        num_reads++;
5276                        all_uptodate = 0;
5277                }
5278        }
5279
5280        if (all_uptodate) {
5281                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5282                goto unlock_exit;
5283        }
5284
5285        clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
5286        eb->read_mirror = 0;
5287        atomic_set(&eb->io_pages, num_reads);
5288        for (i = 0; i < num_pages; i++) {
5289                page = eb->pages[i];
5290
5291                if (!PageUptodate(page)) {
5292                        if (ret) {
5293                                atomic_dec(&eb->io_pages);
5294                                unlock_page(page);
5295                                continue;
5296                        }
5297
5298                        ClearPageError(page);
5299                        err = __extent_read_full_page(tree, page,
5300                                                      btree_get_extent, &bio,
5301                                                      mirror_num, &bio_flags,
5302                                                      REQ_META);
5303                        if (err) {
5304                                ret = err;
5305                                /*
5306                                 * We passed &bio to __extent_read_full_page()
5307                                 * above, so if it returned an error the current
5308                                 * page failed to be added to the bio and has
5309                                 * already been unlocked.
5310                                 *
5311                                 * We must decrement io_pages ourselves.
5312                                 */
5313                                atomic_dec(&eb->io_pages);
5314                        }
5315                } else {
5316                        unlock_page(page);
5317                }
5318        }
5319
5320        if (bio) {
5321                err = submit_one_bio(bio, mirror_num, bio_flags);
5322                if (err)
5323                        return err;
5324        }
5325
5326        if (ret || wait != WAIT_COMPLETE)
5327                return ret;
5328
5329        for (i = 0; i < num_pages; i++) {
5330                page = eb->pages[i];
5331                wait_on_page_locked(page);
5332                if (!PageUptodate(page))
5333                        ret = -EIO;
5334        }
5335
5336        return ret;
5337
5338unlock_exit:
5339        while (locked_pages > 0) {
5340                locked_pages--;
5341                page = eb->pages[locked_pages];
5342                unlock_page(page);
5343        }
5344        return ret;
5345}
5346
5347void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
5348                        unsigned long start, unsigned long len)
5349{
5350        size_t cur;
5351        size_t offset;
5352        struct page *page;
5353        char *kaddr;
5354        char *dst = (char *)dstv;
5355        size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
5356        unsigned long i = (start_offset + start) >> PAGE_SHIFT;
5357
5358        if (start + len > eb->len) {
5359                WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
5360                     eb->start, eb->len, start, len);
5361                memset(dst, 0, len);
5362                return;
5363        }
5364
5365        offset = (start_offset + start) & (PAGE_SIZE - 1);
5366
5367        while (len > 0) {
5368                page = eb->pages[i];
5369
5370                cur = min(len, (PAGE_SIZE - offset));
5371                kaddr = page_address(page);
5372                memcpy(dst, kaddr + offset, cur);
5373
5374                dst += cur;
5375                len -= cur;
5376                offset = 0;
5377                i++;
5378        }
5379}
5380
5381int read_extent_buffer_to_user(const struct extent_buffer *eb,
5382                               void __user *dstv,
5383                               unsigned long start, unsigned long len)
5384{
5385        size_t cur;
5386        size_t offset;
5387        struct page *page;
5388        char *kaddr;
5389        char __user *dst = (char __user *)dstv;
5390        size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
5391        unsigned long i = (start_offset + start) >> PAGE_SHIFT;
5392        int ret = 0;
5393
5394        WARN_ON(start > eb->len);
5395        WARN_ON(start + len > eb->start + eb->len);
5396
5397        offset = (start_offset + start) & (PAGE_SIZE - 1);
5398
5399        while (len > 0) {
5400                page = eb->pages[i];
5401
5402                cur = min(len, (PAGE_SIZE - offset));
5403                kaddr = page_address(page);
5404                if (copy_to_user(dst, kaddr + offset, cur)) {
5405                        ret = -EFAULT;
5406                        break;
5407                }
5408
5409                dst += cur;
5410                len -= cur;
5411                offset = 0;
5412                i++;
5413        }
5414
5415        return ret;
5416}
5417
5418/*
5419 * return 0 if the item is found within a page.
5420 * return 1 if the item spans two pages.
5421 * return -EINVAL otherwise.
5422 */
5423int map_private_extent_buffer(const struct extent_buffer *eb,
5424                              unsigned long start, unsigned long min_len,
5425                              char **map, unsigned long *map_start,
5426                              unsigned long *map_len)
5427{
5428        size_t offset = start & (PAGE_SIZE - 1);
5429        char *kaddr;
5430        struct page *p;
5431        size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
5432        unsigned long i = (start_offset + start) >> PAGE_SHIFT;
5433        unsigned long end_i = (start_offset + start + min_len - 1) >>
5434                PAGE_SHIFT;
5435
5436        if (start + min_len > eb->len) {
5437                WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
5438                       eb->start, eb->len, start, min_len);
5439                return -EINVAL;
5440        }
5441
5442        if (i != end_i)
5443                return 1;
5444
5445        if (i == 0) {
5446                offset = start_offset;
5447                *map_start = 0;
5448        } else {
5449                offset = 0;
5450                *map_start = ((u64)i << PAGE_SHIFT) - start_offset;
5451        }
5452
5453        p = eb->pages[i];
5454        kaddr = page_address(p);
5455        *map = kaddr + offset;
5456        *map_len = PAGE_SIZE - offset;
5457        return 0;
5458}
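
    /*
     * A common pattern for using the helper above, sketched here rather than
     * copied from a specific caller: map the item directly when it fits in
     * one page, otherwise fall back to a bounce copy:
     *
     *        char *kaddr;
     *        unsigned long map_start, map_len;
     *        __le64 leval;
     *        int err;
     *
     *        err = map_private_extent_buffer(eb, offset, sizeof(leval), &kaddr,
     *                                        &map_start, &map_len);
     *        if (err == 0)
     *                leval = get_unaligned((__le64 *)(kaddr + offset - map_start));
     *        else if (err == 1)
     *                read_extent_buffer(eb, &leval, offset, sizeof(leval));
     *        else
     *                return err;
     *        value = le64_to_cpu(leval);
     *
     * The byte at extent buffer offset X lives at kaddr + (X - map_start)
     * within the mapped page.
     */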
5459
5460int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
5461                         unsigned long start, unsigned long len)
5462{
5463        size_t cur;
5464        size_t offset;
5465        struct page *page;
5466        char *kaddr;
5467        char *ptr = (char *)ptrv;
5468        size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
5469        unsigned long i = (start_offset + start) >> PAGE_SHIFT;
5470        int ret = 0;
5471
5472        WARN_ON(start > eb->len);
5473        WARN_ON(start + len > eb->start + eb->len);
5474
5475        offset = (start_offset + start) & (PAGE_SIZE - 1);
5476
5477        while (len > 0) {
5478                page = eb->pages[i];
5479
5480                cur = min(len, (PAGE_SIZE - offset));
5481
5482                kaddr = page_address(page);
5483                ret = memcmp(ptr, kaddr + offset, cur);
5484                if (ret)
5485                        break;
5486
5487                ptr += cur;
5488                len -= cur;
5489                offset = 0;
5490                i++;
5491        }
5492        return ret;
5493}
5494
5495void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
5496                const void *srcv)
5497{
5498        char *kaddr;
5499
5500        WARN_ON(!PageUptodate(eb->pages[0]));
5501        kaddr = page_address(eb->pages[0]);
5502        memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
5503                        BTRFS_FSID_SIZE);
5504}
5505
5506void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv)
5507{
5508        char *kaddr;
5509
5510        WARN_ON(!PageUptodate(eb->pages[0]));
5511        kaddr = page_address(eb->pages[0]);
5512        memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
5513                        BTRFS_FSID_SIZE);
5514}
5515
5516void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
5517                         unsigned long start, unsigned long len)
5518{
5519        size_t cur;
5520        size_t offset;
5521        struct page *page;
5522        char *kaddr;
5523        char *src = (char *)srcv;
5524        size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
5525        unsigned long i = (start_offset + start) >> PAGE_SHIFT;
5526
5527        WARN_ON(start > eb->len);
5528        WARN_ON(start + len > eb->start + eb->len);
5529
5530        offset = (start_offset + start) & (PAGE_SIZE - 1);
5531
5532        while (len > 0) {
5533                page = eb->pages[i];
5534                WARN_ON(!PageUptodate(page));
5535
5536                cur = min(len, PAGE_SIZE - offset);
5537                kaddr = page_address(page);
5538                memcpy(kaddr + offset, src, cur);
5539
5540                src += cur;
5541                len -= cur;
5542                offset = 0;
5543                i++;
5544        }
5545}
5546
5547void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
5548                unsigned long len)
5549{
5550        size_t cur;
5551        size_t offset;
5552        struct page *page;
5553        char *kaddr;
5554        size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
5555        unsigned long i = (start_offset + start) >> PAGE_SHIFT;
5556
5557        WARN_ON(start > eb->len);
5558        WARN_ON(start + len > eb->start + eb->len);
5559
5560        offset = (start_offset + start) & (PAGE_SIZE - 1);
5561
5562        while (len > 0) {
5563                page = eb->pages[i];
5564                WARN_ON(!PageUptodate(page));
5565
5566                cur = min(len, PAGE_SIZE - offset);
5567                kaddr = page_address(page);
5568                memset(kaddr + offset, 0, cur);
5569
5570                len -= cur;
5571                offset = 0;
5572                i++;
5573        }
5574}
5575
5576void copy_extent_buffer_full(struct extent_buffer *dst,
5577                             struct extent_buffer *src)
5578{
5579        int i;
5580        unsigned num_pages;
5581
5582        ASSERT(dst->len == src->len);
5583
5584        num_pages = num_extent_pages(dst->start, dst->len);
5585        for (i = 0; i < num_pages; i++)
5586                copy_page(page_address(dst->pages[i]),
5587                                page_address(src->pages[i]));
5588}
5589
5590void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
5591                        unsigned long dst_offset, unsigned long src_offset,
5592                        unsigned long len)
5593{
5594        u64 dst_len = dst->len;
5595        size_t cur;
5596        size_t offset;
5597        struct page *page;
5598        char *kaddr;
5599        size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
5600        unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT;
5601
5602        WARN_ON(src->len != dst_len);
5603
5604        offset = (start_offset + dst_offset) &
5605                (PAGE_SIZE - 1);
5606
5607        while (len > 0) {
5608                page = dst->pages[i];
5609                WARN_ON(!PageUptodate(page));
5610
5611                cur = min(len, (unsigned long)(PAGE_SIZE - offset));
5612
5613                kaddr = page_address(page);
5614                read_extent_buffer(src, kaddr + offset, src_offset, cur);
5615
5616                src_offset += cur;
5617                len -= cur;
5618                offset = 0;
5619                i++;
5620        }
5621}
5622
5623void le_bitmap_set(u8 *map, unsigned int start, int len)
5624{
5625        u8 *p = map + BIT_BYTE(start);
5626        const unsigned int size = start + len;
5627        int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE);
5628        u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start);
5629
5630        while (len - bits_to_set >= 0) {
5631                *p |= mask_to_set;
5632                len -= bits_to_set;
5633                bits_to_set = BITS_PER_BYTE;
5634                mask_to_set = ~0;
5635                p++;
5636        }
5637        if (len) {
5638                mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
5639                *p |= mask_to_set;
5640        }
5641}
5642
5643void le_bitmap_clear(u8 *map, unsigned int start, int len)
5644{
5645        u8 *p = map + BIT_BYTE(start);
5646        const unsigned int size = start + len;
5647        int bits_to_clear = BITS_PER_BYTE - (start % BITS_PER_BYTE);
5648        u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(start);
5649
5650        while (len - bits_to_clear >= 0) {
5651                *p &= ~mask_to_clear;
5652                len -= bits_to_clear;
5653                bits_to_clear = BITS_PER_BYTE;
5654                mask_to_clear = ~0;
5655                p++;
5656        }
5657        if (len) {
5658                mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
5659                *p &= ~mask_to_clear;
5660        }
5661}
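
    /*
     * The two helpers above operate on byte-based bitmaps where bit 0 of each
     * byte is the least significant bit.  A worked example:
     *
     *        u8 map[2] = { 0, 0 };
     *
     *        le_bitmap_set(map, 6, 4);
     *
     * sets bits 6..9, i.e. the two high bits of map[0] and the two low bits
     * of map[1], leaving map[0] == 0xc0 and map[1] == 0x03.
     */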
5662
5663/*
5664 * eb_bitmap_offset() - calculate the page and offset of the byte containing the
5665 * given bit number
5666 * @eb: the extent buffer
5667 * @start: offset of the bitmap item in the extent buffer
5668 * @nr: bit number
5669 * @page_index: return index of the page in the extent buffer that contains the
5670 * given bit number
5671 * @page_offset: return offset into the page given by page_index
5672 *
5673 * This helper hides the ugliness of finding the byte in an extent buffer which
5674 * contains a given bit.
5675 */
5676static inline void eb_bitmap_offset(struct extent_buffer *eb,
5677                                    unsigned long start, unsigned long nr,
5678                                    unsigned long *page_index,
5679                                    size_t *page_offset)
5680{
5681        size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
5682        size_t byte_offset = BIT_BYTE(nr);
5683        size_t offset;
5684
5685        /*
5686         * The byte we want is the offset of the extent buffer + the offset of
5687         * the bitmap item in the extent buffer + the offset of the byte in the
5688         * bitmap item.
5689         */
5690        offset = start_offset + start + byte_offset;
5691
5692        *page_index = offset >> PAGE_SHIFT;
5693        *page_offset = offset & (PAGE_SIZE - 1);
5694}
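
/*
 * Worked example with hypothetical numbers (4K pages, eb->start page
 * aligned so start_offset is 0): for a bitmap item at byte offset
 * start = 5000 inside the eb and bit number nr = 100,
 *
 *      byte_offset  = 100 / 8       = 12
 *      offset       = 0 + 5000 + 12 = 5012
 *      *page_index  = 5012 >> 12    = 1
 *      *page_offset = 5012 & 4095   = 916
 *
 * i.e. the byte lives 916 bytes into the second page of the buffer.
 */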
5695
5696/**
5697 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
5698 * @eb: the extent buffer
5699 * @start: offset of the bitmap item in the extent buffer
5700 * @nr: bit number to test
5701 */
5702int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
5703                           unsigned long nr)
5704{
5705        u8 *kaddr;
5706        struct page *page;
5707        unsigned long i;
5708        size_t offset;
5709
5710        eb_bitmap_offset(eb, start, nr, &i, &offset);
5711        page = eb->pages[i];
5712        WARN_ON(!PageUptodate(page));
5713        kaddr = page_address(page);
5714        return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
5715}
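
/*
 * Illustrative use (not from this file): the free space tree code locates a
 * bitmap item in a leaf and then queries single bits roughly like
 *
 *      unsigned long ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
 *
 *      if (extent_buffer_test_bit(leaf, ptr, i))
 *              ...
 *
 * where ptr is the byte offset of the item data inside the leaf and i is the
 * bit number within the bitmap item.
 */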
5716
5717/**
5718 * extent_buffer_bitmap_set - set an area of a bitmap
5719 * @eb: the extent buffer
5720 * @start: offset of the bitmap item in the extent buffer
5721 * @pos: bit number of the first bit
5722 * @len: number of bits to set
5723 */
5724void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
5725                              unsigned long pos, unsigned long len)
5726{
5727        u8 *kaddr;
5728        struct page *page;
5729        unsigned long i;
5730        size_t offset;
5731        const unsigned int size = pos + len;
5732        int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
5733        u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
5734
5735        eb_bitmap_offset(eb, start, pos, &i, &offset);
5736        page = eb->pages[i];
5737        WARN_ON(!PageUptodate(page));
5738        kaddr = page_address(page);
5739
5740        while (len >= bits_to_set) {
5741                kaddr[offset] |= mask_to_set;
5742                len -= bits_to_set;
5743                bits_to_set = BITS_PER_BYTE;
5744                mask_to_set = ~0;
5745                if (++offset >= PAGE_SIZE && len > 0) {
5746                        offset = 0;
5747                        page = eb->pages[++i];
5748                        WARN_ON(!PageUptodate(page));
5749                        kaddr = page_address(page);
5750                }
5751        }
5752        if (len) {
5753                mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
5754                kaddr[offset] |= mask_to_set;
5755        }
5756}
5757
5759/**
5760 * extent_buffer_bitmap_clear - clear an area of a bitmap
5761 * @eb: the extent buffer
5762 * @start: offset of the bitmap item in the extent buffer
5763 * @pos: bit number of the first bit
5764 * @len: number of bits to clear
5765 */
5766void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
5767                                unsigned long pos, unsigned long len)
5768{
5769        u8 *kaddr;
5770        struct page *page;
5771        unsigned long i;
5772        size_t offset;
5773        const unsigned int size = pos + len;
5774        int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
5775        u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
5776
5777        eb_bitmap_offset(eb, start, pos, &i, &offset);
5778        page = eb->pages[i];
5779        WARN_ON(!PageUptodate(page));
5780        kaddr = page_address(page);
5781
5782        while (len >= bits_to_clear) {
5783                kaddr[offset] &= ~mask_to_clear;
5784                len -= bits_to_clear;
5785                bits_to_clear = BITS_PER_BYTE;
5786                mask_to_clear = ~0;
5787                if (++offset >= PAGE_SIZE && len > 0) {
5788                        offset = 0;
5789                        page = eb->pages[++i];
5790                        WARN_ON(!PageUptodate(page));
5791                        kaddr = page_address(page);
5792                }
5793        }
5794        if (len) {
5795                mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
5796                kaddr[offset] &= ~mask_to_clear;
5797        }
5798}
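
/*
 * Illustrative only: clearing and then re-setting a run of bits in a bitmap
 * item that starts at byte offset ptr inside the leaf:
 *
 *      extent_buffer_bitmap_clear(leaf, ptr, first_bit, nr_bits);
 *      extent_buffer_bitmap_set(leaf, ptr, first_bit, nr_bits);
 *
 * Unlike the le_bitmap_*() helpers above, these walk eb->pages[] and re-map
 * kaddr whenever the byte range crosses a page boundary, so the bitmap item
 * may straddle pages of the extent buffer.
 */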
5799
5800static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
5801{
5802        unsigned long distance = (src > dst) ? src - dst : dst - src;
5803        return distance < len;
5804}
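
/*
 * Example: with src = 100, dst = 150 and len = 100 the distance is 50 < 100,
 * so the ranges [100, 200) and [150, 250) overlap and copy_pages() below has
 * to use memmove(); with len = 40 the distance is not smaller than the
 * length and a plain memcpy() is safe.
 */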
5805
5806static void copy_pages(struct page *dst_page, struct page *src_page,
5807                       unsigned long dst_off, unsigned long src_off,
5808                       unsigned long len)
5809{
5810        char *dst_kaddr = page_address(dst_page);
5811        char *src_kaddr;
5812        int must_memmove = 0;
5813
5814        if (dst_page != src_page) {
5815                src_kaddr = page_address(src_page);
5816        } else {
5817                src_kaddr = dst_kaddr;
5818                if (areas_overlap(src_off, dst_off, len))
5819                        must_memmove = 1;
5820        }
5821
5822        if (must_memmove)
5823                memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
5824        else
5825                memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
5826}
5827
5828void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5829                           unsigned long src_offset, unsigned long len)
5830{
5831        struct btrfs_fs_info *fs_info = dst->fs_info;
5832        size_t cur;
5833        size_t dst_off_in_page;
5834        size_t src_off_in_page;
5835        size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
5836        unsigned long dst_i;
5837        unsigned long src_i;
5838
5839        if (src_offset + len > dst->len) {
5840                btrfs_err(fs_info,
5841                        "memcpy bogus src_offset %lu move len %lu dst len %lu",
5842                         src_offset, len, dst->len);
5843                BUG_ON(1);
5844        }
5845        if (dst_offset + len > dst->len) {
5846                btrfs_err(fs_info,
5847                        "memcpy bogus dst_offset %lu move len %lu dst len %lu",
5848                         dst_offset, len, dst->len);
5849                BUG_ON(1);
5850        }
5851
5852        while (len > 0) {
5853                dst_off_in_page = (start_offset + dst_offset) &
5854                        (PAGE_SIZE - 1);
5855                src_off_in_page = (start_offset + src_offset) &
5856                        (PAGE_SIZE - 1);
5857
5858                dst_i = (start_offset + dst_offset) >> PAGE_SHIFT;
5859                src_i = (start_offset + src_offset) >> PAGE_SHIFT;
5860
5861                cur = min(len, (unsigned long)(PAGE_SIZE -
5862                                               src_off_in_page));
5863                cur = min_t(unsigned long, cur,
5864                        (unsigned long)(PAGE_SIZE - dst_off_in_page));
5865
5866                copy_pages(dst->pages[dst_i], dst->pages[src_i],
5867                           dst_off_in_page, src_off_in_page, cur);
5868
5869                src_offset += cur;
5870                dst_offset += cur;
5871                len -= cur;
5872        }
5873}
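
/*
 * Illustrative only (hypothetical offsets): shifting 64 bytes of item data
 * 16 bytes towards the start of a leaf:
 *
 *      memcpy_extent_buffer(leaf, data_start - 16, data_start, 64);
 *
 * Both offsets are relative to the start of the extent buffer; the loop
 * above splits the copy at page boundaries by itself.  Because it copies
 * forwards, overlapping ranges are only safe when the destination is below
 * the source; the other direction is what memmove_extent_buffer() is for.
 */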
5874
5875void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5876                           unsigned long src_offset, unsigned long len)
5877{
5878        struct btrfs_fs_info *fs_info = dst->fs_info;
5879        size_t cur;
5880        size_t dst_off_in_page;
5881        size_t src_off_in_page;
5882        unsigned long dst_end = dst_offset + len - 1;
5883        unsigned long src_end = src_offset + len - 1;
5884        size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
5885        unsigned long dst_i;
5886        unsigned long src_i;
5887
5888        if (src_offset + len > dst->len) {
5889                btrfs_err(fs_info,
5890                          "memmove bogus src_offset %lu move len %lu dst len %lu",
5891                          src_offset, len, dst->len);
5892                BUG_ON(1);
5893        }
5894        if (dst_offset + len > dst->len) {
5895                btrfs_err(fs_info,
5896                          "memmove bogus dst_offset %lu move len %lu dst len %lu",
5897                          dst_offset, len, dst->len);
5898                BUG_ON(1);
5899        }
5900        if (dst_offset < src_offset) {
5901                memcpy_extent_buffer(dst, dst_offset, src_offset, len);
5902                return;
5903        }
5904        while (len > 0) {
5905                dst_i = (start_offset + dst_end) >> PAGE_SHIFT;
5906                src_i = (start_offset + src_end) >> PAGE_SHIFT;
5907
5908                dst_off_in_page = (start_offset + dst_end) &
5909                        (PAGE_SIZE - 1);
5910                src_off_in_page = (start_offset + src_end) &
5911                        (PAGE_SIZE - 1);
5912
5913                cur = min_t(unsigned long, len, src_off_in_page + 1);
5914                cur = min(cur, dst_off_in_page + 1);
5915                copy_pages(dst->pages[dst_i], dst->pages[src_i],
5916                           dst_off_in_page - cur + 1,
5917                           src_off_in_page - cur + 1, cur);
5918
5919                dst_end -= cur;
5920                src_end -= cur;
5921                len -= cur;
5922        }
5923}
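
/*
 * Illustrative only (hypothetical names): making room for new data by
 * pushing nr_bytes of existing data further into the leaf:
 *
 *      memmove_extent_buffer(leaf, data_start + size, data_start, nr_bytes);
 *
 * With the destination above the source, the loop above copies the range
 * backwards in page-bounded chunks starting from the last byte, so
 * overlapping bytes are read before they are overwritten.
 */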
5924
5925int try_release_extent_buffer(struct page *page)
5926{
5927        struct extent_buffer *eb;
5928
5929        /*
5930         * We need to make sure nobody is attaching this page to an eb right
5931         * now.
5932         */
5933        spin_lock(&page->mapping->private_lock);
5934        if (!PagePrivate(page)) {
5935                spin_unlock(&page->mapping->private_lock);
5936                return 1;
5937        }
5938
5939        eb = (struct extent_buffer *)page->private;
5940        BUG_ON(!eb);
5941
5942        /*
5943         * This is a little awful but should be ok, we need to make sure that
5944         * the eb doesn't disappear out from under us while we're looking at
5945         * this page.
5946         */
5947        spin_lock(&eb->refs_lock);
5948        if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
5949                spin_unlock(&eb->refs_lock);
5950                spin_unlock(&page->mapping->private_lock);
5951                return 0;
5952        }
5953        spin_unlock(&page->mapping->private_lock);
5954
5955        /*
5956         * If tree ref isn't set then we know the ref on this eb is a real ref,
5957         * so just return; this page will likely be freed soon anyway.
5958         */
5959        if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
5960                spin_unlock(&eb->refs_lock);
5961                return 0;
5962        }
5963
5964        return release_extent_buffer(eb);
5965}
5966