linux/fs/btrfs/extent_io.c
   1// SPDX-License-Identifier: GPL-2.0
   2
   3#include <linux/bitops.h>
   4#include <linux/slab.h>
   5#include <linux/bio.h>
   6#include <linux/mm.h>
   7#include <linux/pagemap.h>
   8#include <linux/page-flags.h>
   9#include <linux/spinlock.h>
  10#include <linux/blkdev.h>
  11#include <linux/swap.h>
  12#include <linux/writeback.h>
  13#include <linux/pagevec.h>
  14#include <linux/prefetch.h>
  15#include <linux/cleancache.h>
  16#include "extent_io.h"
  17#include "extent-io-tree.h"
  18#include "extent_map.h"
  19#include "ctree.h"
  20#include "btrfs_inode.h"
  21#include "volumes.h"
  22#include "check-integrity.h"
  23#include "locking.h"
  24#include "rcu-string.h"
  25#include "backref.h"
  26#include "disk-io.h"
  27
  28static struct kmem_cache *extent_state_cache;
  29static struct kmem_cache *extent_buffer_cache;
  30static struct bio_set btrfs_bioset;
  31
  32static inline bool extent_state_in_tree(const struct extent_state *state)
  33{
  34        return !RB_EMPTY_NODE(&state->rb_node);
  35}
  36
  37#ifdef CONFIG_BTRFS_DEBUG
  38static LIST_HEAD(states);
  39static DEFINE_SPINLOCK(leak_lock);
  40
  41static inline void btrfs_leak_debug_add(spinlock_t *lock,
  42                                        struct list_head *new,
  43                                        struct list_head *head)
  44{
  45        unsigned long flags;
  46
  47        spin_lock_irqsave(lock, flags);
  48        list_add(new, head);
  49        spin_unlock_irqrestore(lock, flags);
  50}
  51
  52static inline void btrfs_leak_debug_del(spinlock_t *lock,
  53                                        struct list_head *entry)
  54{
  55        unsigned long flags;
  56
  57        spin_lock_irqsave(lock, flags);
  58        list_del(entry);
  59        spin_unlock_irqrestore(lock, flags);
  60}
  61
  62void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
  63{
  64        struct extent_buffer *eb;
  65        unsigned long flags;
  66
  67        /*
  68         * If we didn't get into open_ctree our allocated_ebs will not be
  69         * initialized, so just skip this.
  70         */
  71        if (!fs_info->allocated_ebs.next)
  72                return;
  73
  74        spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
  75        while (!list_empty(&fs_info->allocated_ebs)) {
  76                eb = list_first_entry(&fs_info->allocated_ebs,
  77                                      struct extent_buffer, leak_list);
  78                pr_err(
  79        "BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
  80                       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
  81                       btrfs_header_owner(eb));
  82                list_del(&eb->leak_list);
  83                kmem_cache_free(extent_buffer_cache, eb);
  84        }
  85        spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
  86}
  87
  88static inline void btrfs_extent_state_leak_debug_check(void)
  89{
  90        struct extent_state *state;
  91
  92        while (!list_empty(&states)) {
  93                state = list_entry(states.next, struct extent_state, leak_list);
  94                pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
  95                       state->start, state->end, state->state,
  96                       extent_state_in_tree(state),
  97                       refcount_read(&state->refs));
  98                list_del(&state->leak_list);
  99                kmem_cache_free(extent_state_cache, state);
 100        }
 101}
 102
 103#define btrfs_debug_check_extent_io_range(tree, start, end)             \
 104        __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
 105static inline void __btrfs_debug_check_extent_io_range(const char *caller,
 106                struct extent_io_tree *tree, u64 start, u64 end)
 107{
 108        struct inode *inode = tree->private_data;
 109        u64 isize;
 110
 111        if (!inode || !is_data_inode(inode))
 112                return;
 113
 114        isize = i_size_read(inode);
 115        if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
 116                btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
 117                    "%s: ino %llu isize %llu odd range [%llu,%llu]",
 118                        caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
 119        }
 120}
 121#else
 122#define btrfs_leak_debug_add(lock, new, head)   do {} while (0)
 123#define btrfs_leak_debug_del(lock, entry)       do {} while (0)
 124#define btrfs_extent_state_leak_debug_check()   do {} while (0)
 125#define btrfs_debug_check_extent_io_range(c, s, e)      do {} while (0)
 126#endif
 127
 128struct tree_entry {
 129        u64 start;
 130        u64 end;
 131        struct rb_node rb_node;
 132};
 133
 134struct extent_page_data {
 135        struct bio *bio;
 136        /* Tells writepage not to lock the state bits for this range;
 137         * it still does the unlocking.
 138         */
 139        unsigned int extent_locked:1;
 140
 141        /* tells the submit_bio code to use REQ_SYNC */
 142        unsigned int sync_io:1;
 143};
 144
 145static int add_extent_changeset(struct extent_state *state, u32 bits,
 146                                 struct extent_changeset *changeset,
 147                                 int set)
 148{
 149        int ret;
 150
 151        if (!changeset)
 152                return 0;
 153        if (set && (state->state & bits) == bits)
 154                return 0;
 155        if (!set && (state->state & bits) == 0)
 156                return 0;
 157        changeset->bytes_changed += state->end - state->start + 1;
 158        ret = ulist_add(&changeset->range_changed, state->start, state->end,
 159                        GFP_ATOMIC);
 160        return ret;
 161}
 162
 163int __must_check submit_one_bio(struct bio *bio, int mirror_num,
 164                                unsigned long bio_flags)
 165{
 166        blk_status_t ret = 0;
 167        struct extent_io_tree *tree = bio->bi_private;
 168
 169        bio->bi_private = NULL;
 170
 171        if (is_data_inode(tree->private_data))
 172                ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
 173                                            bio_flags);
 174        else
 175                ret = btrfs_submit_metadata_bio(tree->private_data, bio,
 176                                                mirror_num, bio_flags);
 177
 178        return blk_status_to_errno(ret);
 179}
 180
 181/* Cleanup unsubmitted bios */
 182static void end_write_bio(struct extent_page_data *epd, int ret)
 183{
 184        if (epd->bio) {
 185                epd->bio->bi_status = errno_to_blk_status(ret);
 186                bio_endio(epd->bio);
 187                epd->bio = NULL;
 188        }
 189}
 190
 191/*
 192 * Submit bio from extent page data via submit_one_bio
 193 *
 194 * Return 0 if everything is OK.
 195 * Return <0 for error.
 196 */
 197static int __must_check flush_write_bio(struct extent_page_data *epd)
 198{
 199        int ret = 0;
 200
 201        if (epd->bio) {
 202                ret = submit_one_bio(epd->bio, 0, 0);
 203                /*
 204                 * Clean up of epd->bio is handled by its endio function.
 205                 * And endio is either triggered by successful bio execution
 206                 * or the error handler of submit bio hook.
 207                 * So at this point, no matter what happened, we don't need
 208                 * to clean up epd->bio.
 209                 */
 210                epd->bio = NULL;
 211        }
 212        return ret;
 213}
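
/*
 * Editor's illustrative sketch (not part of the original file): how a
 * hypothetical write path would use the two helpers above.  Pages are batched
 * into epd->bio and the bio must either be flushed or failed before
 * returning; do_some_writepages() is a made-up placeholder.
 *
 *	struct extent_page_data epd = { .bio = NULL };
 *	int ret;
 *
 *	epd.sync_io = (wbc->sync_mode == WB_SYNC_ALL);
 *	ret = do_some_writepages(&epd);
 *	if (ret)
 *		end_write_bio(&epd, ret);	// fail any unsubmitted bio
 *	else
 *		ret = flush_write_bio(&epd);	// submit what was batched
 */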
 214
 215int __init extent_state_cache_init(void)
 216{
 217        extent_state_cache = kmem_cache_create("btrfs_extent_state",
 218                        sizeof(struct extent_state), 0,
 219                        SLAB_MEM_SPREAD, NULL);
 220        if (!extent_state_cache)
 221                return -ENOMEM;
 222        return 0;
 223}
 224
 225int __init extent_io_init(void)
 226{
 227        extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
 228                        sizeof(struct extent_buffer), 0,
 229                        SLAB_MEM_SPREAD, NULL);
 230        if (!extent_buffer_cache)
 231                return -ENOMEM;
 232
 233        if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
 234                        offsetof(struct btrfs_io_bio, bio),
 235                        BIOSET_NEED_BVECS))
 236                goto free_buffer_cache;
 237
 238        if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
 239                goto free_bioset;
 240
 241        return 0;
 242
 243free_bioset:
 244        bioset_exit(&btrfs_bioset);
 245
 246free_buffer_cache:
 247        kmem_cache_destroy(extent_buffer_cache);
 248        extent_buffer_cache = NULL;
 249        return -ENOMEM;
 250}
 251
 252void __cold extent_state_cache_exit(void)
 253{
 254        btrfs_extent_state_leak_debug_check();
 255        kmem_cache_destroy(extent_state_cache);
 256}
 257
 258void __cold extent_io_exit(void)
 259{
 260        /*
 261         * Make sure all delayed rcu free are flushed before we
 262         * destroy caches.
 263         */
 264        rcu_barrier();
 265        kmem_cache_destroy(extent_buffer_cache);
 266        bioset_exit(&btrfs_bioset);
 267}
 268
 269/*
 270 * For the file_extent_tree, we want to hold the inode lock when we lookup and
 271 * update the disk_i_size, but lockdep will complain because with our io_tree we hold
 272 * the tree lock and get the inode lock when setting delalloc.  These two things
 273 * are unrelated, so make a class for the file_extent_tree so we don't get the
 274 * two locking patterns mixed up.
 275 */
 276static struct lock_class_key file_extent_tree_class;
 277
 278void extent_io_tree_init(struct btrfs_fs_info *fs_info,
 279                         struct extent_io_tree *tree, unsigned int owner,
 280                         void *private_data)
 281{
 282        tree->fs_info = fs_info;
 283        tree->state = RB_ROOT;
 284        tree->dirty_bytes = 0;
 285        spin_lock_init(&tree->lock);
 286        tree->private_data = private_data;
 287        tree->owner = owner;
 288        if (owner == IO_TREE_INODE_FILE_EXTENT)
 289                lockdep_set_class(&tree->lock, &file_extent_tree_class);
 290}
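
/*
 * Editor's minimal sketch: setting up and tearing down a private io tree.
 * IO_TREE_SELFTEST is the owner value used by the btrfs self tests; passing a
 * NULL private_data keeps the tree independent of any inode.
 *
 *	struct extent_io_tree tree;
 *
 *	extent_io_tree_init(fs_info, &tree, IO_TREE_SELFTEST, NULL);
 *	// ... set and clear bits on byte ranges ...
 *	extent_io_tree_release(&tree);
 */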
 291
 292void extent_io_tree_release(struct extent_io_tree *tree)
 293{
 294        spin_lock(&tree->lock);
 295        /*
 296         * Do a single barrier for the waitqueue_active check here, the state
 297         * of the waitqueue should not change once extent_io_tree_release is
 298         * called.
 299         */
 300        smp_mb();
 301        while (!RB_EMPTY_ROOT(&tree->state)) {
 302                struct rb_node *node;
 303                struct extent_state *state;
 304
 305                node = rb_first(&tree->state);
 306                state = rb_entry(node, struct extent_state, rb_node);
 307                rb_erase(&state->rb_node, &tree->state);
 308                RB_CLEAR_NODE(&state->rb_node);
 309                /*
 310                 * btree io trees aren't supposed to have tasks waiting for
 311                 * changes in the flags of extent states ever.
 312                 */
 313                ASSERT(!waitqueue_active(&state->wq));
 314                free_extent_state(state);
 315
 316                cond_resched_lock(&tree->lock);
 317        }
 318        spin_unlock(&tree->lock);
 319}
 320
 321static struct extent_state *alloc_extent_state(gfp_t mask)
 322{
 323        struct extent_state *state;
 324
 325        /*
 326         * The given mask might not be appropriate for the slab allocator;
 327         * drop the unsupported bits.
 328         */
 329        mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
 330        state = kmem_cache_alloc(extent_state_cache, mask);
 331        if (!state)
 332                return state;
 333        state->state = 0;
 334        state->failrec = NULL;
 335        RB_CLEAR_NODE(&state->rb_node);
 336        btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
 337        refcount_set(&state->refs, 1);
 338        init_waitqueue_head(&state->wq);
 339        trace_alloc_extent_state(state, mask, _RET_IP_);
 340        return state;
 341}
 342
 343void free_extent_state(struct extent_state *state)
 344{
 345        if (!state)
 346                return;
 347        if (refcount_dec_and_test(&state->refs)) {
 348                WARN_ON(extent_state_in_tree(state));
 349                btrfs_leak_debug_del(&leak_lock, &state->leak_list);
 350                trace_free_extent_state(state, _RET_IP_);
 351                kmem_cache_free(extent_state_cache, state);
 352        }
 353}
 354
 355static struct rb_node *tree_insert(struct rb_root *root,
 356                                   struct rb_node *search_start,
 357                                   u64 offset,
 358                                   struct rb_node *node,
 359                                   struct rb_node ***p_in,
 360                                   struct rb_node **parent_in)
 361{
 362        struct rb_node **p;
 363        struct rb_node *parent = NULL;
 364        struct tree_entry *entry;
 365
 366        if (p_in && parent_in) {
 367                p = *p_in;
 368                parent = *parent_in;
 369                goto do_insert;
 370        }
 371
 372        p = search_start ? &search_start : &root->rb_node;
 373        while (*p) {
 374                parent = *p;
 375                entry = rb_entry(parent, struct tree_entry, rb_node);
 376
 377                if (offset < entry->start)
 378                        p = &(*p)->rb_left;
 379                else if (offset > entry->end)
 380                        p = &(*p)->rb_right;
 381                else
 382                        return parent;
 383        }
 384
 385do_insert:
 386        rb_link_node(node, parent, p);
 387        rb_insert_color(node, root);
 388        return NULL;
 389}
 390
 391/**
 392 * __etree_search - search @tree for an entry that contains @offset. Such
 393 * entry would have entry->start <= offset && entry->end >= offset.
 394 *
 395 * @tree - the tree to search
 396 * @offset - offset that should fall within an entry in @tree
 397 * @next_ret - pointer to the first entry whose range ends after @offset
 398 * @prev_ret - pointer to the first entry whose range begins before @offset
 399 * @p_ret - pointer where new node should be anchored (used when inserting an
 400 *          entry in the tree)
 401 * @parent_ret - points to entry which would have been the parent of the entry,
 402 *               containing @offset
 403 *
 404 * This function returns a pointer to the entry that contains @offset byte
 405 * address. If no such entry exists, then NULL is returned and the other
 406 * pointer arguments to the function are filled, otherwise the found entry is
 407 * returned and other pointers are left untouched.
 408 */
 409static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 410                                      struct rb_node **next_ret,
 411                                      struct rb_node **prev_ret,
 412                                      struct rb_node ***p_ret,
 413                                      struct rb_node **parent_ret)
 414{
 415        struct rb_root *root = &tree->state;
 416        struct rb_node **n = &root->rb_node;
 417        struct rb_node *prev = NULL;
 418        struct rb_node *orig_prev = NULL;
 419        struct tree_entry *entry;
 420        struct tree_entry *prev_entry = NULL;
 421
 422        while (*n) {
 423                prev = *n;
 424                entry = rb_entry(prev, struct tree_entry, rb_node);
 425                prev_entry = entry;
 426
 427                if (offset < entry->start)
 428                        n = &(*n)->rb_left;
 429                else if (offset > entry->end)
 430                        n = &(*n)->rb_right;
 431                else
 432                        return *n;
 433        }
 434
 435        if (p_ret)
 436                *p_ret = n;
 437        if (parent_ret)
 438                *parent_ret = prev;
 439
 440        if (next_ret) {
 441                orig_prev = prev;
 442                while (prev && offset > prev_entry->end) {
 443                        prev = rb_next(prev);
 444                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 445                }
 446                *next_ret = prev;
 447                prev = orig_prev;
 448        }
 449
 450        if (prev_ret) {
 451                prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 452                while (prev && offset < prev_entry->start) {
 453                        prev = rb_prev(prev);
 454                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 455                }
 456                *prev_ret = prev;
 457        }
 458        return NULL;
 459}
 460
 461static inline struct rb_node *
 462tree_search_for_insert(struct extent_io_tree *tree,
 463                       u64 offset,
 464                       struct rb_node ***p_ret,
 465                       struct rb_node **parent_ret)
 466{
 467        struct rb_node *next = NULL;
 468        struct rb_node *ret;
 469
 470        ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
 471        if (!ret)
 472                return next;
 473        return ret;
 474}
 475
 476static inline struct rb_node *tree_search(struct extent_io_tree *tree,
 477                                          u64 offset)
 478{
 479        return tree_search_for_insert(tree, offset, NULL, NULL);
 480}
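
/*
 * Editor's worked example of the search semantics above: with states
 * [0, 4095] and [8192, 12287] in the tree, searching inside the hole returns
 * the next state, so callers must still compare state->start with their end.
 *
 *	struct rb_node *node;
 *	struct extent_state *state;
 *
 *	node = tree_search(tree, 5000);		// tree->lock must be held
 *	if (node) {
 *		state = rb_entry(node, struct extent_state, rb_node);
 *		// state covers [8192, 12287]; since 5000 < state->start,
 *		// no state at all is recorded for [5000, 8191].
 *	}
 */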
 481
 482/*
 483 * utility function to look for merge candidates inside a given range.
 484 * Any extents with matching state are merged together into a single
 485 * extent in the tree.  Extents with EXTENT_LOCKED or EXTENT_BOUNDARY in
 486 * their state field are not merged because the end_io handlers need to be
 487 * able to do operations on them without sleeping (or doing allocations/splits).
 488 *
 489 * This should be called with the tree lock held.
 490 */
 491static void merge_state(struct extent_io_tree *tree,
 492                        struct extent_state *state)
 493{
 494        struct extent_state *other;
 495        struct rb_node *other_node;
 496
 497        if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
 498                return;
 499
 500        other_node = rb_prev(&state->rb_node);
 501        if (other_node) {
 502                other = rb_entry(other_node, struct extent_state, rb_node);
 503                if (other->end == state->start - 1 &&
 504                    other->state == state->state) {
 505                        if (tree->private_data &&
 506                            is_data_inode(tree->private_data))
 507                                btrfs_merge_delalloc_extent(tree->private_data,
 508                                                            state, other);
 509                        state->start = other->start;
 510                        rb_erase(&other->rb_node, &tree->state);
 511                        RB_CLEAR_NODE(&other->rb_node);
 512                        free_extent_state(other);
 513                }
 514        }
 515        other_node = rb_next(&state->rb_node);
 516        if (other_node) {
 517                other = rb_entry(other_node, struct extent_state, rb_node);
 518                if (other->start == state->end + 1 &&
 519                    other->state == state->state) {
 520                        if (tree->private_data &&
 521                            is_data_inode(tree->private_data))
 522                                btrfs_merge_delalloc_extent(tree->private_data,
 523                                                            state, other);
 524                        state->end = other->end;
 525                        rb_erase(&other->rb_node, &tree->state);
 526                        RB_CLEAR_NODE(&other->rb_node);
 527                        free_extent_state(other);
 528                }
 529        }
 530}
 531
 532static void set_state_bits(struct extent_io_tree *tree,
 533                           struct extent_state *state, u32 *bits,
 534                           struct extent_changeset *changeset);
 535
 536/*
 537 * insert an extent_state struct into the tree.  'bits' are set on the
 538 * struct before it is inserted.
 539 *
 540 * This may return -EEXIST if the extent is already there, in which case the
 541 * state struct is freed.
 542 *
 543 * The tree lock is not taken internally.  This is a utility function and
 544 * probably isn't what you want to call (see set/clear_extent_bit).
 545 */
 546static int insert_state(struct extent_io_tree *tree,
 547                        struct extent_state *state, u64 start, u64 end,
 548                        struct rb_node ***p,
 549                        struct rb_node **parent,
 550                        u32 *bits, struct extent_changeset *changeset)
 551{
 552        struct rb_node *node;
 553
 554        if (end < start) {
 555                btrfs_err(tree->fs_info,
 556                        "insert state: end < start %llu %llu", end, start);
 557                WARN_ON(1);
 558        }
 559        state->start = start;
 560        state->end = end;
 561
 562        set_state_bits(tree, state, bits, changeset);
 563
 564        node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
 565        if (node) {
 566                struct extent_state *found;
 567                found = rb_entry(node, struct extent_state, rb_node);
 568                btrfs_err(tree->fs_info,
 569                       "found node %llu %llu on insert of %llu %llu",
 570                       found->start, found->end, start, end);
 571                return -EEXIST;
 572        }
 573        merge_state(tree, state);
 574        return 0;
 575}
 576
 577/*
 578 * split a given extent state struct in two, inserting the preallocated
 579 * struct 'prealloc' as the newly created second half.  'split' indicates an
 580 * offset inside 'orig' where it should be split.
 581 *
 582 * Before calling, the tree has 'orig' at [orig->start, orig->end].  After
 583 * calling, there are two extent state structs in the tree:
 584 *
 585 * prealloc: [orig->start, split - 1]
 586 * orig:     [split, orig->end]
 587 *
 588 * The tree locks are not taken by this function. They need to be held
 589 * by the caller.
 590 */
 591static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 592                       struct extent_state *prealloc, u64 split)
 593{
 594        struct rb_node *node;
 595
 596        if (tree->private_data && is_data_inode(tree->private_data))
 597                btrfs_split_delalloc_extent(tree->private_data, orig, split);
 598
 599        prealloc->start = orig->start;
 600        prealloc->end = split - 1;
 601        prealloc->state = orig->state;
 602        orig->start = split;
 603
 604        node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
 605                           &prealloc->rb_node, NULL, NULL);
 606        if (node) {
 607                free_extent_state(prealloc);
 608                return -EEXIST;
 609        }
 610        return 0;
 611}
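
/*
 * Editor's worked example: splitting a state that covers [0, 8191] at offset
 * 4096, with tree->lock held and 'prealloc' allocated by the caller.
 *
 *	err = split_state(tree, orig, prealloc, 4096);
 *	// On success the tree now contains:
 *	//   prealloc: [0, 4095]     (same state bits as orig)
 *	//   orig:     [4096, 8191]
 */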
 612
 613static struct extent_state *next_state(struct extent_state *state)
 614{
 615        struct rb_node *next = rb_next(&state->rb_node);
 616        if (next)
 617                return rb_entry(next, struct extent_state, rb_node);
 618        else
 619                return NULL;
 620}
 621
 622/*
 623 * utility function to clear some bits in an extent state struct.
 624 * it will optionally wake up anyone waiting on this state (wake == 1).
 625 *
 626 * If no bits are set on the state struct after clearing things, the
 627 * struct is freed and removed from the tree
 628 */
 629static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
 630                                            struct extent_state *state,
 631                                            u32 *bits, int wake,
 632                                            struct extent_changeset *changeset)
 633{
 634        struct extent_state *next;
 635        u32 bits_to_clear = *bits & ~EXTENT_CTLBITS;
 636        int ret;
 637
 638        if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
 639                u64 range = state->end - state->start + 1;
 640                WARN_ON(range > tree->dirty_bytes);
 641                tree->dirty_bytes -= range;
 642        }
 643
 644        if (tree->private_data && is_data_inode(tree->private_data))
 645                btrfs_clear_delalloc_extent(tree->private_data, state, bits);
 646
 647        ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
 648        BUG_ON(ret < 0);
 649        state->state &= ~bits_to_clear;
 650        if (wake)
 651                wake_up(&state->wq);
 652        if (state->state == 0) {
 653                next = next_state(state);
 654                if (extent_state_in_tree(state)) {
 655                        rb_erase(&state->rb_node, &tree->state);
 656                        RB_CLEAR_NODE(&state->rb_node);
 657                        free_extent_state(state);
 658                } else {
 659                        WARN_ON(1);
 660                }
 661        } else {
 662                merge_state(tree, state);
 663                next = next_state(state);
 664        }
 665        return next;
 666}
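
/*
 * Editor's illustrative sketch: clear_state_bit() may free 'state', so a
 * caller iterates using the returned next pointer, as __clear_extent_bit()
 * does below.  This simplified loop assumes tree->lock is held and that every
 * state lies fully inside [start, end] (otherwise it must be split first).
 *
 *	u32 bits = EXTENT_DIRTY;
 *
 *	while (state && state->start <= end)
 *		state = clear_state_bit(tree, state, &bits, 0, NULL);
 */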
 667
 668static struct extent_state *
 669alloc_extent_state_atomic(struct extent_state *prealloc)
 670{
 671        if (!prealloc)
 672                prealloc = alloc_extent_state(GFP_ATOMIC);
 673
 674        return prealloc;
 675}
 676
 677static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
 678{
 679        btrfs_panic(tree->fs_info, err,
 680        "locking error: extent tree was modified by another thread while locked");
 681}
 682
 683/*
 684 * clear some bits on a range in the tree.  This may require splitting
 685 * or inserting elements in the tree, so the gfp mask is used to
 686 * indicate which allocations or sleeping are allowed.
 687 *
 688 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 689 * the given range from the tree regardless of state (ie for truncate).
 690 *
 691 * the range [start, end] is inclusive.
 692 *
 693 * This takes the tree lock, and returns 0 on success and < 0 on error.
 694 */
 695int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 696                       u32 bits, int wake, int delete,
 697                       struct extent_state **cached_state,
 698                       gfp_t mask, struct extent_changeset *changeset)
 699{
 700        struct extent_state *state;
 701        struct extent_state *cached;
 702        struct extent_state *prealloc = NULL;
 703        struct rb_node *node;
 704        u64 last_end;
 705        int err;
 706        int clear = 0;
 707
 708        btrfs_debug_check_extent_io_range(tree, start, end);
 709        trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
 710
 711        if (bits & EXTENT_DELALLOC)
 712                bits |= EXTENT_NORESERVE;
 713
 714        if (delete)
 715                bits |= ~EXTENT_CTLBITS;
 716
 717        if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
 718                clear = 1;
 719again:
 720        if (!prealloc && gfpflags_allow_blocking(mask)) {
 721                /*
 722                 * Don't care for allocation failure here because we might end
 723                 * up not needing the pre-allocated extent state at all, which
 724                 * is the case if we only have in the tree extent states that
 725                 * cover our input range and don't cover any other range.
 726                 * If we end up needing a new extent state we allocate it later.
 727                 */
 728                prealloc = alloc_extent_state(mask);
 729        }
 730
 731        spin_lock(&tree->lock);
 732        if (cached_state) {
 733                cached = *cached_state;
 734
 735                if (clear) {
 736                        *cached_state = NULL;
 737                        cached_state = NULL;
 738                }
 739
 740                if (cached && extent_state_in_tree(cached) &&
 741                    cached->start <= start && cached->end > start) {
 742                        if (clear)
 743                                refcount_dec(&cached->refs);
 744                        state = cached;
 745                        goto hit_next;
 746                }
 747                if (clear)
 748                        free_extent_state(cached);
 749        }
 750        /*
 751         * this search will find the extents that end after
 752         * our range starts
 753         */
 754        node = tree_search(tree, start);
 755        if (!node)
 756                goto out;
 757        state = rb_entry(node, struct extent_state, rb_node);
 758hit_next:
 759        if (state->start > end)
 760                goto out;
 761        WARN_ON(state->end < start);
 762        last_end = state->end;
 763
 764        /* the state doesn't have the wanted bits, go ahead */
 765        if (!(state->state & bits)) {
 766                state = next_state(state);
 767                goto next;
 768        }
 769
 770        /*
 771         *     | ---- desired range ---- |
 772         *  | state | or
 773         *  | ------------- state -------------- |
 774         *
 775         * We need to split the extent we found, and may flip
 776         * bits on second half.
 777         *
 778         * If the extent we found extends past our range, we
 779         * just split and search again.  It'll get split again
 780         * the next time though.
 781         *
 782         * If the extent we found is inside our range, we clear
 783         * the desired bit on it.
 784         */
 785
 786        if (state->start < start) {
 787                prealloc = alloc_extent_state_atomic(prealloc);
 788                BUG_ON(!prealloc);
 789                err = split_state(tree, state, prealloc, start);
 790                if (err)
 791                        extent_io_tree_panic(tree, err);
 792
 793                prealloc = NULL;
 794                if (err)
 795                        goto out;
 796                if (state->end <= end) {
 797                        state = clear_state_bit(tree, state, &bits, wake,
 798                                                changeset);
 799                        goto next;
 800                }
 801                goto search_again;
 802        }
 803        /*
 804         * | ---- desired range ---- |
 805         *                        | state |
 806         * We need to split the extent, and clear the bit
 807         * on the first half
 808         */
 809        if (state->start <= end && state->end > end) {
 810                prealloc = alloc_extent_state_atomic(prealloc);
 811                BUG_ON(!prealloc);
 812                err = split_state(tree, state, prealloc, end + 1);
 813                if (err)
 814                        extent_io_tree_panic(tree, err);
 815
 816                if (wake)
 817                        wake_up(&state->wq);
 818
 819                clear_state_bit(tree, prealloc, &bits, wake, changeset);
 820
 821                prealloc = NULL;
 822                goto out;
 823        }
 824
 825        state = clear_state_bit(tree, state, &bits, wake, changeset);
 826next:
 827        if (last_end == (u64)-1)
 828                goto out;
 829        start = last_end + 1;
 830        if (start <= end && state && !need_resched())
 831                goto hit_next;
 832
 833search_again:
 834        if (start > end)
 835                goto out;
 836        spin_unlock(&tree->lock);
 837        if (gfpflags_allow_blocking(mask))
 838                cond_resched();
 839        goto again;
 840
 841out:
 842        spin_unlock(&tree->lock);
 843        if (prealloc)
 844                free_extent_state(prealloc);
 845
 846        return 0;
 847
 848}
 849
 850static void wait_on_state(struct extent_io_tree *tree,
 851                          struct extent_state *state)
 852                __releases(tree->lock)
 853                __acquires(tree->lock)
 854{
 855        DEFINE_WAIT(wait);
 856        prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
 857        spin_unlock(&tree->lock);
 858        schedule();
 859        spin_lock(&tree->lock);
 860        finish_wait(&state->wq, &wait);
 861}
 862
 863/*
 864 * waits for one or more bits to clear on a range in the state tree.
 865 * The range [start, end] is inclusive.
 866 * The tree lock is taken by this function
 867 */
 868static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 869                            u32 bits)
 870{
 871        struct extent_state *state;
 872        struct rb_node *node;
 873
 874        btrfs_debug_check_extent_io_range(tree, start, end);
 875
 876        spin_lock(&tree->lock);
 877again:
 878        while (1) {
 879                /*
 880                 * this search will find all the extents that end after
 881                 * our range starts
 882                 */
 883                node = tree_search(tree, start);
 884process_node:
 885                if (!node)
 886                        break;
 887
 888                state = rb_entry(node, struct extent_state, rb_node);
 889
 890                if (state->start > end)
 891                        goto out;
 892
 893                if (state->state & bits) {
 894                        start = state->start;
 895                        refcount_inc(&state->refs);
 896                        wait_on_state(tree, state);
 897                        free_extent_state(state);
 898                        goto again;
 899                }
 900                start = state->end + 1;
 901
 902                if (start > end)
 903                        break;
 904
 905                if (!cond_resched_lock(&tree->lock)) {
 906                        node = rb_next(node);
 907                        goto process_node;
 908                }
 909        }
 910out:
 911        spin_unlock(&tree->lock);
 912}
 913
 914static void set_state_bits(struct extent_io_tree *tree,
 915                           struct extent_state *state,
 916                           u32 *bits, struct extent_changeset *changeset)
 917{
 918        u32 bits_to_set = *bits & ~EXTENT_CTLBITS;
 919        int ret;
 920
 921        if (tree->private_data && is_data_inode(tree->private_data))
 922                btrfs_set_delalloc_extent(tree->private_data, state, bits);
 923
 924        if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
 925                u64 range = state->end - state->start + 1;
 926                tree->dirty_bytes += range;
 927        }
 928        ret = add_extent_changeset(state, bits_to_set, changeset, 1);
 929        BUG_ON(ret < 0);
 930        state->state |= bits_to_set;
 931}
 932
 933static void cache_state_if_flags(struct extent_state *state,
 934                                 struct extent_state **cached_ptr,
 935                                 unsigned flags)
 936{
 937        if (cached_ptr && !(*cached_ptr)) {
 938                if (!flags || (state->state & flags)) {
 939                        *cached_ptr = state;
 940                        refcount_inc(&state->refs);
 941                }
 942        }
 943}
 944
 945static void cache_state(struct extent_state *state,
 946                        struct extent_state **cached_ptr)
 947{
 948        return cache_state_if_flags(state, cached_ptr,
 949                                    EXTENT_LOCKED | EXTENT_BOUNDARY);
 950}
 951
 952/*
 953 * set some bits on a range in the tree.  This may require allocations or
 954 * sleeping, so the gfp mask is used to indicate what is allowed.
 955 *
 956 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 957 * part of the range already has the desired bits set.  The start of the
 958 * existing range is returned in failed_start in this case.
 959 *
 960 * [start, end] is inclusive.  This takes the tree lock.
 961 */
 962int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
 963                   u32 exclusive_bits, u64 *failed_start,
 964                   struct extent_state **cached_state, gfp_t mask,
 965                   struct extent_changeset *changeset)
 966{
 967        struct extent_state *state;
 968        struct extent_state *prealloc = NULL;
 969        struct rb_node *node;
 970        struct rb_node **p;
 971        struct rb_node *parent;
 972        int err = 0;
 973        u64 last_start;
 974        u64 last_end;
 975
 976        btrfs_debug_check_extent_io_range(tree, start, end);
 977        trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
 978
 979        if (exclusive_bits)
 980                ASSERT(failed_start);
 981        else
 982                ASSERT(failed_start == NULL);
 983again:
 984        if (!prealloc && gfpflags_allow_blocking(mask)) {
 985                /*
 986                 * Don't care for allocation failure here because we might end
 987                 * up not needing the pre-allocated extent state at all, which
 988                 * is the case if we only have in the tree extent states that
 989                 * cover our input range and don't cover any other range.
 990                 * If we end up needing a new extent state we allocate it later.
 991                 */
 992                prealloc = alloc_extent_state(mask);
 993        }
 994
 995        spin_lock(&tree->lock);
 996        if (cached_state && *cached_state) {
 997                state = *cached_state;
 998                if (state->start <= start && state->end > start &&
 999                    extent_state_in_tree(state)) {
1000                        node = &state->rb_node;
1001                        goto hit_next;
1002                }
1003        }
1004        /*
1005         * this search will find all the extents that end after
1006         * our range starts.
1007         */
1008        node = tree_search_for_insert(tree, start, &p, &parent);
1009        if (!node) {
1010                prealloc = alloc_extent_state_atomic(prealloc);
1011                BUG_ON(!prealloc);
1012                err = insert_state(tree, prealloc, start, end,
1013                                   &p, &parent, &bits, changeset);
1014                if (err)
1015                        extent_io_tree_panic(tree, err);
1016
1017                cache_state(prealloc, cached_state);
1018                prealloc = NULL;
1019                goto out;
1020        }
1021        state = rb_entry(node, struct extent_state, rb_node);
1022hit_next:
1023        last_start = state->start;
1024        last_end = state->end;
1025
1026        /*
1027         * | ---- desired range ---- |
1028         * | state |
1029         *
1030         * Just lock what we found and keep going
1031         */
1032        if (state->start == start && state->end <= end) {
1033                if (state->state & exclusive_bits) {
1034                        *failed_start = state->start;
1035                        err = -EEXIST;
1036                        goto out;
1037                }
1038
1039                set_state_bits(tree, state, &bits, changeset);
1040                cache_state(state, cached_state);
1041                merge_state(tree, state);
1042                if (last_end == (u64)-1)
1043                        goto out;
1044                start = last_end + 1;
1045                state = next_state(state);
1046                if (start < end && state && state->start == start &&
1047                    !need_resched())
1048                        goto hit_next;
1049                goto search_again;
1050        }
1051
1052        /*
1053         *     | ---- desired range ---- |
1054         * | state |
1055         *   or
1056         * | ------------- state -------------- |
1057         *
1058         * We need to split the extent we found, and may flip bits on
1059         * second half.
1060         *
1061         * If the extent we found extends past our
1062         * range, we just split and search again.  It'll get split
1063         * again the next time though.
1064         *
1065         * If the extent we found is inside our range, we set the
1066         * desired bit on it.
1067         */
1068        if (state->start < start) {
1069                if (state->state & exclusive_bits) {
1070                        *failed_start = start;
1071                        err = -EEXIST;
1072                        goto out;
1073                }
1074
1075                /*
1076                 * If this extent already has all the bits we want set, then
1077                 * skip it, not necessary to split it or do anything with it.
1078                 */
1079                if ((state->state & bits) == bits) {
1080                        start = state->end + 1;
1081                        cache_state(state, cached_state);
1082                        goto search_again;
1083                }
1084
1085                prealloc = alloc_extent_state_atomic(prealloc);
1086                BUG_ON(!prealloc);
1087                err = split_state(tree, state, prealloc, start);
1088                if (err)
1089                        extent_io_tree_panic(tree, err);
1090
1091                prealloc = NULL;
1092                if (err)
1093                        goto out;
1094                if (state->end <= end) {
1095                        set_state_bits(tree, state, &bits, changeset);
1096                        cache_state(state, cached_state);
1097                        merge_state(tree, state);
1098                        if (last_end == (u64)-1)
1099                                goto out;
1100                        start = last_end + 1;
1101                        state = next_state(state);
1102                        if (start < end && state && state->start == start &&
1103                            !need_resched())
1104                                goto hit_next;
1105                }
1106                goto search_again;
1107        }
1108        /*
1109         * | ---- desired range ---- |
1110         *     | state | or               | state |
1111         *
1112         * There's a hole, we need to insert something in it and
1113         * ignore the extent we found.
1114         */
1115        if (state->start > start) {
1116                u64 this_end;
1117                if (end < last_start)
1118                        this_end = end;
1119                else
1120                        this_end = last_start - 1;
1121
1122                prealloc = alloc_extent_state_atomic(prealloc);
1123                BUG_ON(!prealloc);
1124
1125                /*
1126                 * Avoid freeing 'prealloc' if it can be merged with
1127                 * the later extent.
1128                 */
1129                err = insert_state(tree, prealloc, start, this_end,
1130                                   NULL, NULL, &bits, changeset);
1131                if (err)
1132                        extent_io_tree_panic(tree, err);
1133
1134                cache_state(prealloc, cached_state);
1135                prealloc = NULL;
1136                start = this_end + 1;
1137                goto search_again;
1138        }
1139        /*
1140         * | ---- desired range ---- |
1141         *                        | state |
1142         * We need to split the extent, and set the bit
1143         * on the first half
1144         */
1145        if (state->start <= end && state->end > end) {
1146                if (state->state & exclusive_bits) {
1147                        *failed_start = start;
1148                        err = -EEXIST;
1149                        goto out;
1150                }
1151
1152                prealloc = alloc_extent_state_atomic(prealloc);
1153                BUG_ON(!prealloc);
1154                err = split_state(tree, state, prealloc, end + 1);
1155                if (err)
1156                        extent_io_tree_panic(tree, err);
1157
1158                set_state_bits(tree, prealloc, &bits, changeset);
1159                cache_state(prealloc, cached_state);
1160                merge_state(tree, prealloc);
1161                prealloc = NULL;
1162                goto out;
1163        }
1164
1165search_again:
1166        if (start > end)
1167                goto out;
1168        spin_unlock(&tree->lock);
1169        if (gfpflags_allow_blocking(mask))
1170                cond_resched();
1171        goto again;
1172
1173out:
1174        spin_unlock(&tree->lock);
1175        if (prealloc)
1176                free_extent_state(prealloc);
1177
1178        return err;
1179
1180}
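
/*
 * Editor's usage sketch of the exclusive_bits contract described above: if
 * any part of the range already has EXTENT_LOCKED set, the call fails with
 * -EEXIST and failed_start reports where the conflict begins.  'io_tree' is a
 * hypothetical data inode io tree.
 *
 *	u64 failed_start;
 *	int err;
 *
 *	err = set_extent_bit(io_tree, start, end, EXTENT_LOCKED,
 *			     EXTENT_LOCKED, &failed_start, NULL,
 *			     GFP_NOFS, NULL);
 *	if (err == -EEXIST) {
 *		// [failed_start, end] is (partially) locked by someone else;
 *		// lock_extent_bits() below handles this by waiting and
 *		// retrying from failed_start.
 *	}
 */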
1181
1182/**
1183 * convert_extent_bit - convert all bits in a given range from one bit to
1184 *                      another
1185 * @tree:       the io tree to search
1186 * @start:      the start offset in bytes
1187 * @end:        the end offset in bytes (inclusive)
1188 * @bits:       the bits to set in this range
1189 * @clear_bits: the bits to clear in this range
1190 * @cached_state:       state that we're going to cache
1191 *
1192 * This will go through and set bits for the given range.  If any states exist
1193 * already in this range they are set with the given bit and cleared of the
1194 * clear_bits.  This is only meant to be used by things that are mergeable, ie
1195 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
1196 * boundary bits like LOCK.
1197 *
1198 * All allocations are done with GFP_NOFS.
1199 */
1200int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1201                       u32 bits, u32 clear_bits,
1202                       struct extent_state **cached_state)
1203{
1204        struct extent_state *state;
1205        struct extent_state *prealloc = NULL;
1206        struct rb_node *node;
1207        struct rb_node **p;
1208        struct rb_node *parent;
1209        int err = 0;
1210        u64 last_start;
1211        u64 last_end;
1212        bool first_iteration = true;
1213
1214        btrfs_debug_check_extent_io_range(tree, start, end);
1215        trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
1216                                       clear_bits);
1217
1218again:
1219        if (!prealloc) {
1220                /*
1221                 * Best effort, don't worry if extent state allocation fails
1222                 * here for the first iteration. We might have a cached state
1223                 * that matches exactly the target range, in which case no
1224                 * extent state allocations are needed. We'll only know this
1225                 * after locking the tree.
1226                 */
1227                prealloc = alloc_extent_state(GFP_NOFS);
1228                if (!prealloc && !first_iteration)
1229                        return -ENOMEM;
1230        }
1231
1232        spin_lock(&tree->lock);
1233        if (cached_state && *cached_state) {
1234                state = *cached_state;
1235                if (state->start <= start && state->end > start &&
1236                    extent_state_in_tree(state)) {
1237                        node = &state->rb_node;
1238                        goto hit_next;
1239                }
1240        }
1241
1242        /*
1243         * this search will find all the extents that end after
1244         * our range starts.
1245         */
1246        node = tree_search_for_insert(tree, start, &p, &parent);
1247        if (!node) {
1248                prealloc = alloc_extent_state_atomic(prealloc);
1249                if (!prealloc) {
1250                        err = -ENOMEM;
1251                        goto out;
1252                }
1253                err = insert_state(tree, prealloc, start, end,
1254                                   &p, &parent, &bits, NULL);
1255                if (err)
1256                        extent_io_tree_panic(tree, err);
1257                cache_state(prealloc, cached_state);
1258                prealloc = NULL;
1259                goto out;
1260        }
1261        state = rb_entry(node, struct extent_state, rb_node);
1262hit_next:
1263        last_start = state->start;
1264        last_end = state->end;
1265
1266        /*
1267         * | ---- desired range ---- |
1268         * | state |
1269         *
1270         * Just lock what we found and keep going
1271         */
1272        if (state->start == start && state->end <= end) {
1273                set_state_bits(tree, state, &bits, NULL);
1274                cache_state(state, cached_state);
1275                state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
1276                if (last_end == (u64)-1)
1277                        goto out;
1278                start = last_end + 1;
1279                if (start < end && state && state->start == start &&
1280                    !need_resched())
1281                        goto hit_next;
1282                goto search_again;
1283        }
1284
1285        /*
1286         *     | ---- desired range ---- |
1287         * | state |
1288         *   or
1289         * | ------------- state -------------- |
1290         *
1291         * We need to split the extent we found, and may flip bits on
1292         * second half.
1293         *
1294         * If the extent we found extends past our
1295         * range, we just split and search again.  It'll get split
1296         * again the next time though.
1297         *
1298         * If the extent we found is inside our range, we set the
1299         * desired bit on it.
1300         */
1301        if (state->start < start) {
1302                prealloc = alloc_extent_state_atomic(prealloc);
1303                if (!prealloc) {
1304                        err = -ENOMEM;
1305                        goto out;
1306                }
1307                err = split_state(tree, state, prealloc, start);
1308                if (err)
1309                        extent_io_tree_panic(tree, err);
1310                prealloc = NULL;
1311                if (err)
1312                        goto out;
1313                if (state->end <= end) {
1314                        set_state_bits(tree, state, &bits, NULL);
1315                        cache_state(state, cached_state);
1316                        state = clear_state_bit(tree, state, &clear_bits, 0,
1317                                                NULL);
1318                        if (last_end == (u64)-1)
1319                                goto out;
1320                        start = last_end + 1;
1321                        if (start < end && state && state->start == start &&
1322                            !need_resched())
1323                                goto hit_next;
1324                }
1325                goto search_again;
1326        }
1327        /*
1328         * | ---- desired range ---- |
1329         *     | state | or               | state |
1330         *
1331         * There's a hole, we need to insert something in it and
1332         * ignore the extent we found.
1333         */
1334        if (state->start > start) {
1335                u64 this_end;
1336                if (end < last_start)
1337                        this_end = end;
1338                else
1339                        this_end = last_start - 1;
1340
1341                prealloc = alloc_extent_state_atomic(prealloc);
1342                if (!prealloc) {
1343                        err = -ENOMEM;
1344                        goto out;
1345                }
1346
1347                /*
1348                 * Avoid freeing 'prealloc' if it can be merged with
1349                 * the later extent.
1350                 */
1351                err = insert_state(tree, prealloc, start, this_end,
1352                                   NULL, NULL, &bits, NULL);
1353                if (err)
1354                        extent_io_tree_panic(tree, err);
1355                cache_state(prealloc, cached_state);
1356                prealloc = NULL;
1357                start = this_end + 1;
1358                goto search_again;
1359        }
1360        /*
1361         * | ---- desired range ---- |
1362         *                        | state |
1363         * We need to split the extent, and set the bit
1364         * on the first half
1365         */
1366        if (state->start <= end && state->end > end) {
1367                prealloc = alloc_extent_state_atomic(prealloc);
1368                if (!prealloc) {
1369                        err = -ENOMEM;
1370                        goto out;
1371                }
1372
1373                err = split_state(tree, state, prealloc, end + 1);
1374                if (err)
1375                        extent_io_tree_panic(tree, err);
1376
1377                set_state_bits(tree, prealloc, &bits, NULL);
1378                cache_state(prealloc, cached_state);
1379                clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
1380                prealloc = NULL;
1381                goto out;
1382        }
1383
1384search_again:
1385        if (start > end)
1386                goto out;
1387        spin_unlock(&tree->lock);
1388        cond_resched();
1389        first_iteration = false;
1390        goto again;
1391
1392out:
1393        spin_unlock(&tree->lock);
1394        if (prealloc)
1395                free_extent_state(prealloc);
1396
1397        return err;
1398}
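
/*
 * Editor's usage sketch for convert_extent_bit(): atomically set one group of
 * bits while clearing another over the same range, e.g. converting DELALLOC
 * to DIRTY as the comment above suggests.  The bit choice here is purely
 * illustrative; EXTENT_LOCKED must never be converted this way.
 *
 *	struct extent_state *cached = NULL;
 *
 *	convert_extent_bit(io_tree, start, end, EXTENT_DIRTY,
 *			   EXTENT_DELALLOC, &cached);
 *	free_extent_state(cached);
 */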
1399
1400/* wrappers around set/clear extent bit */
1401int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1402                           u32 bits, struct extent_changeset *changeset)
1403{
1404        /*
1405         * We don't support EXTENT_LOCKED yet, as the current changeset will
1406         * record any bits changed, so for the EXTENT_LOCKED case it will
1407         * either fail with -EEXIST or the changeset will record the whole
1408         * range.
1409         */
1410        BUG_ON(bits & EXTENT_LOCKED);
1411
1412        return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
1413                              changeset);
1414}
1415
1416int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
1417                           u32 bits)
1418{
1419        return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
1420                              GFP_NOWAIT, NULL);
1421}
1422
1423int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1424                     u32 bits, int wake, int delete,
1425                     struct extent_state **cached)
1426{
1427        return __clear_extent_bit(tree, start, end, bits, wake, delete,
1428                                  cached, GFP_NOFS, NULL);
1429}
1430
1431int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1432                u32 bits, struct extent_changeset *changeset)
1433{
1434        /*
1435         * Don't support EXTENT_LOCKED case, same reason as
1436         * set_record_extent_bits().
1437         */
1438        BUG_ON(bits & EXTENT_LOCKED);
1439
1440        return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
1441                                  changeset);
1442}
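
/*
 * Editor's sketch of the changeset variants: the caller learns exactly which
 * bytes changed state.  It assumes the extent_changeset_alloc() and
 * extent_changeset_free() helpers declared next to struct extent_changeset;
 * the EXTENT_DELALLOC bit is only an example (EXTENT_LOCKED is rejected).
 *
 *	struct extent_changeset *cs = extent_changeset_alloc();
 *
 *	if (cs) {
 *		set_record_extent_bits(io_tree, start, end,
 *				       EXTENT_DELALLOC, cs);
 *		// cs->bytes_changed: number of bytes that newly gained the bit
 *		// cs->range_changed: ulist of the [start, end] ranges affected
 *		extent_changeset_free(cs);
 *	}
 */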
1443
1444/*
1445 * Either insert or lock the state struct between start and end.  If the range
1446 * is already locked, wait for it to be unlocked and then retry.
1447 */
1448int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1449                     struct extent_state **cached_state)
1450{
1451        int err;
1452        u64 failed_start;
1453
1454        while (1) {
1455                err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
1456                                     EXTENT_LOCKED, &failed_start,
1457                                     cached_state, GFP_NOFS, NULL);
1458                if (err == -EEXIST) {
1459                        wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1460                        start = failed_start;
1461                } else
1462                        break;
1463                WARN_ON(start > end);
1464        }
1465        return err;
1466}
1467
1468int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1469{
1470        int err;
1471        u64 failed_start;
1472
1473        err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1474                             &failed_start, NULL, GFP_NOFS, NULL);
1475        if (err == -EEXIST) {
1476                if (failed_start > start)
1477                        clear_extent_bit(tree, start, failed_start - 1,
1478                                         EXTENT_LOCKED, 1, 0, NULL);
1479                return 0;
1480        }
1481        return 1;
1482}
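
/*
 * Illustrative sketch only, not part of the original file: the usual calling
 * pattern is to lock a byte range, operate on it, and unlock it again using
 * the cached state that lock_extent_bits() filled in, just as
 * find_lock_delalloc_range() below does.  The helper name is hypothetical.
 */
static inline void __maybe_unused lock_range_usage_sketch(
                                struct extent_io_tree *tree, u64 start, u64 end)
{
        struct extent_state *cached_state = NULL;

        lock_extent_bits(tree, start, end, &cached_state);
        /* ... operate on the locked byte range [start, end] ... */
        unlock_extent_cached(tree, start, end, &cached_state);
}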
1483
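/*
 * Clear the dirty-for-writeback flag on every page covering the byte range
 * [start, end] of the inode.  All pages in the range must be present.
 */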
1484void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
1485{
1486        unsigned long index = start >> PAGE_SHIFT;
1487        unsigned long end_index = end >> PAGE_SHIFT;
1488        struct page *page;
1489
1490        while (index <= end_index) {
1491                page = find_get_page(inode->i_mapping, index);
1492                BUG_ON(!page); /* Pages should be in the extent_io_tree */
1493                clear_page_dirty_for_io(page);
1494                put_page(page);
1495                index++;
1496        }
1497}
1498
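/*
 * Mark every page covering the byte range [start, end] of the inode dirty
 * again and account for the redirty, the counterpart of
 * extent_range_clear_dirty_for_io() above.
 */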
1499void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1500{
1501        unsigned long index = start >> PAGE_SHIFT;
1502        unsigned long end_index = end >> PAGE_SHIFT;
1503        struct page *page;
1504
1505        while (index <= end_index) {
1506                page = find_get_page(inode->i_mapping, index);
1507                BUG_ON(!page); /* Pages should be in the extent_io_tree */
1508                __set_page_dirty_nobuffers(page);
1509                account_page_redirty(page);
1510                put_page(page);
1511                index++;
1512        }
1513}
1514
1515/* find the first state struct with 'bits' set after 'start', and
1516 * return it.  tree->lock must be held.  NULL will be returned if
1517 * nothing was found after 'start'.
1518 */
1519static struct extent_state *
1520find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
1521{
1522        struct rb_node *node;
1523        struct extent_state *state;
1524
1525        /*
1526         * this search will find all the extents that end after
1527         * our range starts.
1528         */
1529        node = tree_search(tree, start);
1530        if (!node)
1531                goto out;
1532
1533        while (1) {
1534                state = rb_entry(node, struct extent_state, rb_node);
1535                if (state->end >= start && (state->state & bits))
1536                        return state;
1537
1538                node = rb_next(node);
1539                if (!node)
1540                        break;
1541        }
1542out:
1543        return NULL;
1544}
1545
1546/*
1547 * Find the first offset in the io tree with one or more @bits set.
1548 *
1549 * Note: If there are multiple bits set in @bits, any of them will match.
1550 *
1551 * Return 0 if we find something, and update @start_ret and @end_ret.
1552 * Return 1 if we found nothing.
1553 */
1554int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1555                          u64 *start_ret, u64 *end_ret, u32 bits,
1556                          struct extent_state **cached_state)
1557{
1558        struct extent_state *state;
1559        int ret = 1;
1560
1561        spin_lock(&tree->lock);
1562        if (cached_state && *cached_state) {
1563                state = *cached_state;
1564                if (state->end == start - 1 && extent_state_in_tree(state)) {
1565                        while ((state = next_state(state)) != NULL) {
1566                                if (state->state & bits)
1567                                        goto got_it;
1568                        }
1569                        free_extent_state(*cached_state);
1570                        *cached_state = NULL;
1571                        goto out;
1572                }
1573                free_extent_state(*cached_state);
1574                *cached_state = NULL;
1575        }
1576
1577        state = find_first_extent_bit_state(tree, start, bits);
1578got_it:
1579        if (state) {
1580                cache_state_if_flags(state, cached_state, 0);
1581                *start_ret = state->start;
1582                *end_ret = state->end;
1583                ret = 0;
1584        }
1585out:
1586        spin_unlock(&tree->lock);
1587        return ret;
1588}
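
/*
 * Illustrative sketch only, not part of the original file: callers typically
 * walk every range that has a given bit set by advancing the search offset
 * past each range returned by find_first_extent_bit().  The helper name is
 * hypothetical.
 */
static inline u64 __maybe_unused count_bytes_with_bit_sketch(
                                struct extent_io_tree *tree, u32 bit)
{
        u64 start = 0;
        u64 found_start;
        u64 found_end;
        u64 bytes = 0;

        while (!find_first_extent_bit(tree, start, &found_start, &found_end,
                                      bit, NULL)) {
                bytes += found_end - found_start + 1;
                if (found_end == (u64)-1)
                        break;
                start = found_end + 1;
        }
        return bytes;
}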
1589
1590/**
1591 * find_contiguous_extent_bit - find a contiguous area of bits
1592 * @tree:      io tree to check
1593 * @start:     offset to start the search from
1594 * @start_ret: the first offset we found with the bits set
1595 * @end_ret:   the end of the contiguous range of bits that were set
1596 * @bits:      bits to look for
1597 *
1598 * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
1599 * to set bits appropriately, and then merge them again.  During this time it
1600 * will drop the tree->lock, so use this helper if you want to find the actual
1601 * contiguous area for given bits.  We will search to the first bit we find, and
1602 * then walk down the tree until we find a non-contiguous area.  The area
1603 * returned will be the full contiguous area with the bits set.
1604 */
1605int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
1606                               u64 *start_ret, u64 *end_ret, u32 bits)
1607{
1608        struct extent_state *state;
1609        int ret = 1;
1610
1611        spin_lock(&tree->lock);
1612        state = find_first_extent_bit_state(tree, start, bits);
1613        if (state) {
1614                *start_ret = state->start;
1615                *end_ret = state->end;
1616                while ((state = next_state(state)) != NULL) {
1617                        if (state->start > (*end_ret + 1))
1618                                break;
1619                        *end_ret = state->end;
1620                }
1621                ret = 0;
1622        }
1623        spin_unlock(&tree->lock);
1624        return ret;
1625}
1626
1627/**
1628 * find_first_clear_extent_bit - find the first range that has @bits not set.
1629 * This range could start before @start.
1630 *
1631 * @tree:      the tree to search
1632 * @start:     the offset at/after which the found extent should start
1633 * @start_ret: records the beginning of the range
1634 * @end_ret:   records the end of the range (inclusive)
1635 * @bits:      the set of bits which must be unset
1636 *
1637 * Since an unallocated range is also considered one which doesn't have the
1638 * bits set, it's possible that @end_ret contains -1; this happens in case the
1639 * range spans (last_range_end, end of device]. In this case it's up to the
1640 * caller to trim @end_ret to the appropriate size.
1641 */
1642void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
1643                                 u64 *start_ret, u64 *end_ret, u32 bits)
1644{
1645        struct extent_state *state;
1646        struct rb_node *node, *prev = NULL, *next;
1647
1648        spin_lock(&tree->lock);
1649
1650        /* Find first extent with bits cleared */
1651        while (1) {
1652                node = __etree_search(tree, start, &next, &prev, NULL, NULL);
1653                if (!node && !next && !prev) {
1654                        /*
1655                         * Tree is completely empty, send full range and let
1656                         * caller deal with it
1657                         */
1658                        *start_ret = 0;
1659                        *end_ret = -1;
1660                        goto out;
1661                } else if (!node && !next) {
1662                        /*
1663                         * We are past the last allocated chunk, set start at
1664                         * the end of the last extent.
1665                         */
1666                        state = rb_entry(prev, struct extent_state, rb_node);
1667                        *start_ret = state->end + 1;
1668                        *end_ret = -1;
1669                        goto out;
1670                } else if (!node) {
1671                        node = next;
1672                }
1673                /*
1674                 * At this point 'node' either contains 'start' or start is
1675                 * before 'node'
1676                 */
1677                state = rb_entry(node, struct extent_state, rb_node);
1678
1679                if (in_range(start, state->start, state->end - state->start + 1)) {
1680                        if (state->state & bits) {
1681                                /*
1682                                 * |--range with bits sets--|
1683                                 *    |
1684                                 *    start
1685                                 */
1686                                start = state->end + 1;
1687                        } else {
1688                                /*
1689                                 * 'start' falls within a range that doesn't
1690                                 * have the bits set, so take its start as
1691                                 * the beginning of the desired range
1692                                 *
1693                                 * |--range with bits cleared----|
1694                                 *      |
1695                                 *      start
1696                                 */
1697                                *start_ret = state->start;
1698                                break;
1699                        }
1700                } else {
1701                        /*
1702                         * |---prev range---|---hole/unset---|---node range---|
1703                         *                          |
1704                         *                        start
1705                         *
1706                         *                        or
1707                         *
1708                         * |---hole/unset--||--first node--|
1709                         * 0   |
1710                         *    start
1711                         */
1712                        if (prev) {
1713                                state = rb_entry(prev, struct extent_state,
1714                                                 rb_node);
1715                                *start_ret = state->end + 1;
1716                        } else {
1717                                *start_ret = 0;
1718                        }
1719                        break;
1720                }
1721        }
1722
1723        /*
1724         * Find the longest stretch from start until an entry which has the
1725         * bits set
1726         */
1727        while (1) {
1728                state = rb_entry(node, struct extent_state, rb_node);
1729                if (state->end >= start && !(state->state & bits)) {
1730                        *end_ret = state->end;
1731                } else {
1732                        *end_ret = state->start - 1;
1733                        break;
1734                }
1735
1736                node = rb_next(node);
1737                if (!node)
1738                        break;
1739        }
1740out:
1741        spin_unlock(&tree->lock);
1742}
1743
1744/*
1745 * find a contiguous range of bytes in the file marked as delalloc, not
1746 * more than 'max_bytes'.  start and end are used to return the range.
1747 *
1748 * true is returned if we find something, false if nothing was in the tree
1749 */
1750bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
1751                               u64 *end, u64 max_bytes,
1752                               struct extent_state **cached_state)
1753{
1754        struct rb_node *node;
1755        struct extent_state *state;
1756        u64 cur_start = *start;
1757        bool found = false;
1758        u64 total_bytes = 0;
1759
1760        spin_lock(&tree->lock);
1761
1762        /*
1763         * this search will find all the extents that end after
1764         * our range starts.
1765         */
1766        node = tree_search(tree, cur_start);
1767        if (!node) {
1768                *end = (u64)-1;
1769                goto out;
1770        }
1771
1772        while (1) {
1773                state = rb_entry(node, struct extent_state, rb_node);
1774                if (found && (state->start != cur_start ||
1775                              (state->state & EXTENT_BOUNDARY))) {
1776                        goto out;
1777                }
1778                if (!(state->state & EXTENT_DELALLOC)) {
1779                        if (!found)
1780                                *end = state->end;
1781                        goto out;
1782                }
1783                if (!found) {
1784                        *start = state->start;
1785                        *cached_state = state;
1786                        refcount_inc(&state->refs);
1787                }
1788                found = true;
1789                *end = state->end;
1790                cur_start = state->end + 1;
1791                node = rb_next(node);
1792                total_bytes += state->end - state->start + 1;
1793                if (total_bytes >= max_bytes)
1794                        break;
1795                if (!node)
1796                        break;
1797        }
1798out:
1799        spin_unlock(&tree->lock);
1800        return found;
1801}
1802
1803static int __process_pages_contig(struct address_space *mapping,
1804                                  struct page *locked_page,
1805                                  pgoff_t start_index, pgoff_t end_index,
1806                                  unsigned long page_ops, pgoff_t *index_ret);
1807
1808static noinline void __unlock_for_delalloc(struct inode *inode,
1809                                           struct page *locked_page,
1810                                           u64 start, u64 end)
1811{
1812        unsigned long index = start >> PAGE_SHIFT;
1813        unsigned long end_index = end >> PAGE_SHIFT;
1814
1815        ASSERT(locked_page);
1816        if (index == locked_page->index && end_index == index)
1817                return;
1818
1819        __process_pages_contig(inode->i_mapping, locked_page, index, end_index,
1820                               PAGE_UNLOCK, NULL);
1821}
1822
1823static noinline int lock_delalloc_pages(struct inode *inode,
1824                                        struct page *locked_page,
1825                                        u64 delalloc_start,
1826                                        u64 delalloc_end)
1827{
1828        unsigned long index = delalloc_start >> PAGE_SHIFT;
1829        unsigned long index_ret = index;
1830        unsigned long end_index = delalloc_end >> PAGE_SHIFT;
1831        int ret;
1832
1833        ASSERT(locked_page);
1834        if (index == locked_page->index && index == end_index)
1835                return 0;
1836
1837        ret = __process_pages_contig(inode->i_mapping, locked_page, index,
1838                                     end_index, PAGE_LOCK, &index_ret);
1839        if (ret == -EAGAIN)
1840                __unlock_for_delalloc(inode, locked_page, delalloc_start,
1841                                      (u64)index_ret << PAGE_SHIFT);
1842        return ret;
1843}
1844
1845/*
1846 * Find and lock a contiguous range of bytes in the file marked as delalloc, no
1847 * more than @max_bytes.  @start and @end are used to return the range.
1848 *
1849 * Return: true if we find something
1850 *         false if nothing was in the tree
1851 */
1852EXPORT_FOR_TESTS
1853noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
1854                                    struct page *locked_page, u64 *start,
1855                                    u64 *end)
1856{
1857        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1858        u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
1859        u64 delalloc_start;
1860        u64 delalloc_end;
1861        bool found;
1862        struct extent_state *cached_state = NULL;
1863        int ret;
1864        int loops = 0;
1865
1866again:
1867        /* step one, find a bunch of delalloc bytes starting at start */
1868        delalloc_start = *start;
1869        delalloc_end = 0;
1870        found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1871                                          max_bytes, &cached_state);
1872        if (!found || delalloc_end <= *start) {
1873                *start = delalloc_start;
1874                *end = delalloc_end;
1875                free_extent_state(cached_state);
1876                return false;
1877        }
1878
1879        /*
1880         * start comes from the offset of locked_page.  We have to lock
1881         * pages in order, so we can't process delalloc bytes before
1882         * locked_page
1883         */
1884        if (delalloc_start < *start)
1885                delalloc_start = *start;
1886
1887        /*
1888         * make sure to limit the number of pages we try to lock down
1889         */
1890        if (delalloc_end + 1 - delalloc_start > max_bytes)
1891                delalloc_end = delalloc_start + max_bytes - 1;
1892
1893        /* step two, lock all the pages after the page that has start */
1894        ret = lock_delalloc_pages(inode, locked_page,
1895                                  delalloc_start, delalloc_end);
1896        ASSERT(!ret || ret == -EAGAIN);
1897        if (ret == -EAGAIN) {
1898                /* some of the pages are gone, let's avoid looping by
1899                 * shortening the size of the delalloc range we're searching
1900                 */
1901                free_extent_state(cached_state);
1902                cached_state = NULL;
1903                if (!loops) {
1904                        max_bytes = PAGE_SIZE;
1905                        loops = 1;
1906                        goto again;
1907                } else {
1908                        found = false;
1909                        goto out_failed;
1910                }
1911        }
1912
1913        /* step three, lock the state bits for the whole range */
1914        lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
1915
1916        /* then test to make sure it is all still delalloc */
1917        ret = test_range_bit(tree, delalloc_start, delalloc_end,
1918                             EXTENT_DELALLOC, 1, cached_state);
1919        if (!ret) {
1920                unlock_extent_cached(tree, delalloc_start, delalloc_end,
1921                                     &cached_state);
1922                __unlock_for_delalloc(inode, locked_page,
1923                              delalloc_start, delalloc_end);
1924                cond_resched();
1925                goto again;
1926        }
1927        free_extent_state(cached_state);
1928        *start = delalloc_start;
1929        *end = delalloc_end;
1930out_failed:
1931        return found;
1932}
1933
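/*
 * Walk the pages of @mapping in the index range [start_index, end_index] in
 * batches and apply the requested @page_ops to each page, skipping
 * @locked_page.  For PAGE_LOCK, if a page cannot be locked down (it was
 * released from the mapping or is no longer dirty), -EAGAIN is returned and
 * @index_ret records the last page index that was processed.
 */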
1934static int __process_pages_contig(struct address_space *mapping,
1935                                  struct page *locked_page,
1936                                  pgoff_t start_index, pgoff_t end_index,
1937                                  unsigned long page_ops, pgoff_t *index_ret)
1938{
1939        unsigned long nr_pages = end_index - start_index + 1;
1940        unsigned long pages_processed = 0;
1941        pgoff_t index = start_index;
1942        struct page *pages[16];
1943        unsigned ret;
1944        int err = 0;
1945        int i;
1946
1947        if (page_ops & PAGE_LOCK) {
1948                ASSERT(page_ops == PAGE_LOCK);
1949                ASSERT(index_ret && *index_ret == start_index);
1950        }
1951
1952        if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
1953                mapping_set_error(mapping, -EIO);
1954
1955        while (nr_pages > 0) {
1956                ret = find_get_pages_contig(mapping, index,
1957                                     min_t(unsigned long,
1958                                     nr_pages, ARRAY_SIZE(pages)), pages);
1959                if (ret == 0) {
1960                        /*
1961                         * Only if we're going to lock these pages can we
1962                         * possibly find nothing at @index.
1963                         */
1964                        ASSERT(page_ops & PAGE_LOCK);
1965                        err = -EAGAIN;
1966                        goto out;
1967                }
1968
1969                for (i = 0; i < ret; i++) {
1970                        if (page_ops & PAGE_SET_PRIVATE2)
1971                                SetPagePrivate2(pages[i]);
1972
1973                        if (locked_page && pages[i] == locked_page) {
1974                                put_page(pages[i]);
1975                                pages_processed++;
1976                                continue;
1977                        }
1978                        if (page_ops & PAGE_CLEAR_DIRTY)
1979                                clear_page_dirty_for_io(pages[i]);
1980                        if (page_ops & PAGE_SET_WRITEBACK)
1981                                set_page_writeback(pages[i]);
1982                        if (page_ops & PAGE_SET_ERROR)
1983                                SetPageError(pages[i]);
1984                        if (page_ops & PAGE_END_WRITEBACK)
1985                                end_page_writeback(pages[i]);
1986                        if (page_ops & PAGE_UNLOCK)
1987                                unlock_page(pages[i]);
1988                        if (page_ops & PAGE_LOCK) {
1989                                lock_page(pages[i]);
1990                                if (!PageDirty(pages[i]) ||
1991                                    pages[i]->mapping != mapping) {
1992                                        unlock_page(pages[i]);
1993                                        for (; i < ret; i++)
1994                                                put_page(pages[i]);
1995                                        err = -EAGAIN;
1996                                        goto out;
1997                                }
1998                        }
1999                        put_page(pages[i]);
2000                        pages_processed++;
2001                }
2002                nr_pages -= ret;
2003                index += ret;
2004                cond_resched();
2005        }
2006out:
2007        if (err && index_ret)
2008                *index_ret = start_index + pages_processed - 1;
2009        return err;
2010}
2011
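/*
 * Clear the given bits in the io tree for the byte range [start, end] and
 * then apply @page_ops to every page in the range, except @locked_page.
 */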
2012void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2013                                  struct page *locked_page,
2014                                  u32 clear_bits, unsigned long page_ops)
2015{
2016        clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
2017
2018        __process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
2019                               start >> PAGE_SHIFT, end >> PAGE_SHIFT,
2020                               page_ops, NULL);
2021}
2022
2023/*
2024 * count the number of bytes in the tree that have a given bit(s)
2025 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
2026 * cached.  The total number found is returned.
2027 */
2028u64 count_range_bits(struct extent_io_tree *tree,
2029                     u64 *start, u64 search_end, u64 max_bytes,
2030                     u32 bits, int contig)
2031{
2032        struct rb_node *node;
2033        struct extent_state *state;
2034        u64 cur_start = *start;
2035        u64 total_bytes = 0;
2036        u64 last = 0;
2037        int found = 0;
2038
2039        if (WARN_ON(search_end <= cur_start))
2040                return 0;
2041
2042        spin_lock(&tree->lock);
2043        if (cur_start == 0 && bits == EXTENT_DIRTY) {
2044                total_bytes = tree->dirty_bytes;
2045                goto out;
2046        }
2047        /*
2048         * this search will find all the extents that end after
2049         * our range starts.
2050         */
2051        node = tree_search(tree, cur_start);
2052        if (!node)
2053                goto out;
2054
2055        while (1) {
2056                state = rb_entry(node, struct extent_state, rb_node);
2057                if (state->start > search_end)
2058                        break;
2059                if (contig && found && state->start > last + 1)
2060                        break;
2061                if (state->end >= cur_start && (state->state & bits) == bits) {
2062                        total_bytes += min(search_end, state->end) + 1 -
2063                                       max(cur_start, state->start);
2064                        if (total_bytes >= max_bytes)
2065                                break;
2066                        if (!found) {
2067                                *start = max(cur_start, state->start);
2068                                found = 1;
2069                        }
2070                        last = state->end;
2071                } else if (contig && found) {
2072                        break;
2073                }
2074                node = rb_next(node);
2075                if (!node)
2076                        break;
2077        }
2078out:
2079        spin_unlock(&tree->lock);
2080        return total_bytes;
2081}
2082
2083/*
2084 * set the failrec field for a given byte offset in the tree.  If there isn't
2085 * an extent_state starting exactly at that offset, -ENOENT is returned.
2086 */
2087int set_state_failrec(struct extent_io_tree *tree, u64 start,
2088                      struct io_failure_record *failrec)
2089{
2090        struct rb_node *node;
2091        struct extent_state *state;
2092        int ret = 0;
2093
2094        spin_lock(&tree->lock);
2095        /*
2096         * this search will find all the extents that end after
2097         * our range starts.
2098         */
2099        node = tree_search(tree, start);
2100        if (!node) {
2101                ret = -ENOENT;
2102                goto out;
2103        }
2104        state = rb_entry(node, struct extent_state, rb_node);
2105        if (state->start != start) {
2106                ret = -ENOENT;
2107                goto out;
2108        }
2109        state->failrec = failrec;
2110out:
2111        spin_unlock(&tree->lock);
2112        return ret;
2113}
2114
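/*
 * Look up the io_failure_record stored in the extent_state that starts
 * exactly at @start; returns ERR_PTR(-ENOENT) if there is no such state.
 */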
2115struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
2116{
2117        struct rb_node *node;
2118        struct extent_state *state;
2119        struct io_failure_record *failrec;
2120
2121        spin_lock(&tree->lock);
2122        /*
2123         * this search will find all the extents that end after
2124         * our range starts.
2125         */
2126        node = tree_search(tree, start);
2127        if (!node) {
2128                failrec = ERR_PTR(-ENOENT);
2129                goto out;
2130        }
2131        state = rb_entry(node, struct extent_state, rb_node);
2132        if (state->start != start) {
2133                failrec = ERR_PTR(-ENOENT);
2134                goto out;
2135        }
2136
2137        failrec = state->failrec;
2138out:
2139        spin_unlock(&tree->lock);
2140        return failrec;
2141}
2142
2143/*
2144 * searches a range in the state tree for a given mask.
2145 * If 'filled' == 1, this returns 1 only if every extent covering the
2146 * range has the bits set.  Otherwise, 1 is returned if any bit in the
2147 * range is found set.
2148 */
2149int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
2150                   u32 bits, int filled, struct extent_state *cached)
2151{
2152        struct extent_state *state = NULL;
2153        struct rb_node *node;
2154        int bitset = 0;
2155
2156        spin_lock(&tree->lock);
2157        if (cached && extent_state_in_tree(cached) && cached->start <= start &&
2158            cached->end > start)
2159                node = &cached->rb_node;
2160        else
2161                node = tree_search(tree, start);
2162        while (node && start <= end) {
2163                state = rb_entry(node, struct extent_state, rb_node);
2164
2165                if (filled && state->start > start) {
2166                        bitset = 0;
2167                        break;
2168                }
2169
2170                if (state->start > end)
2171                        break;
2172
2173                if (state->state & bits) {
2174                        bitset = 1;
2175                        if (!filled)
2176                                break;
2177                } else if (filled) {
2178                        bitset = 0;
2179                        break;
2180                }
2181
2182                if (state->end == (u64)-1)
2183                        break;
2184
2185                start = state->end + 1;
2186                if (start > end)
2187                        break;
2188                node = rb_next(node);
2189                if (!node) {
2190                        if (filled)
2191                                bitset = 0;
2192                        break;
2193                }
2194        }
2195        spin_unlock(&tree->lock);
2196        return bitset;
2197}
2198
2199/*
2200 * helper function to set a given page up to date if all the
2201 * extents in the tree for that page are up to date
2202 */
2203static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
2204{
2205        u64 start = page_offset(page);
2206        u64 end = start + PAGE_SIZE - 1;
2207        if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
2208                SetPageUptodate(page);
2209}
2210
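/*
 * Drop an io_failure_record: clear its failrec pointer and the tracking bits
 * from the failure tree, clear EXTENT_DAMAGED from the io tree and free the
 * record.  Returns the first error hit while clearing bits, or 0.
 */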
2211int free_io_failure(struct extent_io_tree *failure_tree,
2212                    struct extent_io_tree *io_tree,
2213                    struct io_failure_record *rec)
2214{
2215        int ret;
2216        int err = 0;
2217
2218        set_state_failrec(failure_tree, rec->start, NULL);
2219        ret = clear_extent_bits(failure_tree, rec->start,
2220                                rec->start + rec->len - 1,
2221                                EXTENT_LOCKED | EXTENT_DIRTY);
2222        if (ret)
2223                err = ret;
2224
2225        ret = clear_extent_bits(io_tree, rec->start,
2226                                rec->start + rec->len - 1,
2227                                EXTENT_DAMAGED);
2228        if (ret && !err)
2229                err = ret;
2230
2231        kfree(rec);
2232        return err;
2233}
2234
2235/*
2236 * this bypasses the standard btrfs submit functions deliberately, as
2237 * the standard behavior is to write all copies in a raid setup. here we only
2238 * want to write the one bad copy. so we do the mapping for ourselves and issue
2239 * submit_bio directly.
2240 * to avoid any synchronization issues, wait for the data after writing, which
2241 * actually prevents the read that triggered the error from finishing.
2242 * currently, there can be no more than two copies of every data bit. thus,
2243 * exactly one rewrite is required.
2244 */
2245int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
2246                      u64 length, u64 logical, struct page *page,
2247                      unsigned int pg_offset, int mirror_num)
2248{
2249        struct bio *bio;
2250        struct btrfs_device *dev;
2251        u64 map_length = 0;
2252        u64 sector;
2253        struct btrfs_bio *bbio = NULL;
2254        int ret;
2255
2256        ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
2257        BUG_ON(!mirror_num);
2258
2259        bio = btrfs_io_bio_alloc(1);
2260        bio->bi_iter.bi_size = 0;
2261        map_length = length;
2262
2263        /*
2264         * Avoid races with device replace and make sure our bbio has devices
2265         * associated to its stripes that don't go away while we are doing the
2266         * read repair operation.
2267         */
2268        btrfs_bio_counter_inc_blocked(fs_info);
2269        if (btrfs_is_parity_mirror(fs_info, logical, length)) {
2270                /*
2271                 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
2272                 * to update all raid stripes, but here we just want to correct
2273                 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
2274                 * stripe's dev and sector.
2275                 */
2276                ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
2277                                      &map_length, &bbio, 0);
2278                if (ret) {
2279                        btrfs_bio_counter_dec(fs_info);
2280                        bio_put(bio);
2281                        return -EIO;
2282                }
2283                ASSERT(bbio->mirror_num == 1);
2284        } else {
2285                ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
2286                                      &map_length, &bbio, mirror_num);
2287                if (ret) {
2288                        btrfs_bio_counter_dec(fs_info);
2289                        bio_put(bio);
2290                        return -EIO;
2291                }
2292                BUG_ON(mirror_num != bbio->mirror_num);
2293        }
2294
2295        sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
2296        bio->bi_iter.bi_sector = sector;
2297        dev = bbio->stripes[bbio->mirror_num - 1].dev;
2298        btrfs_put_bbio(bbio);
2299        if (!dev || !dev->bdev ||
2300            !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
2301                btrfs_bio_counter_dec(fs_info);
2302                bio_put(bio);
2303                return -EIO;
2304        }
2305        bio_set_dev(bio, dev->bdev);
2306        bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
2307        bio_add_page(bio, page, length, pg_offset);
2308
2309        if (btrfsic_submit_bio_wait(bio)) {
2310                /* try to remap that extent elsewhere? */
2311                btrfs_bio_counter_dec(fs_info);
2312                bio_put(bio);
2313                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
2314                return -EIO;
2315        }
2316
2317        btrfs_info_rl_in_rcu(fs_info,
2318                "read error corrected: ino %llu off %llu (dev %s sector %llu)",
2319                                  ino, start,
2320                                  rcu_str_deref(dev->name), sector);
2321        btrfs_bio_counter_dec(fs_info);
2322        bio_put(bio);
2323        return 0;
2324}
2325
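/*
 * Write the in-memory content of a metadata extent buffer back to the given
 * mirror to repair a bad on-disk copy.  Returns -EROFS on a read-only
 * filesystem, otherwise 0 or the first repair_io_failure() error.
 */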
2326int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
2327{
2328        struct btrfs_fs_info *fs_info = eb->fs_info;
2329        u64 start = eb->start;
2330        int i, num_pages = num_extent_pages(eb);
2331        int ret = 0;
2332
2333        if (sb_rdonly(fs_info->sb))
2334                return -EROFS;
2335
2336        for (i = 0; i < num_pages; i++) {
2337                struct page *p = eb->pages[i];
2338
2339                ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
2340                                        start - page_offset(p), mirror_num);
2341                if (ret)
2342                        break;
2343                start += PAGE_SIZE;
2344        }
2345
2346        return ret;
2347}
2348
2349/*
2350 * each time an IO finishes, we do a fast check in the IO failure tree
2351 * to see if we need to process or clean up an io_failure_record
2352 */
2353int clean_io_failure(struct btrfs_fs_info *fs_info,
2354                     struct extent_io_tree *failure_tree,
2355                     struct extent_io_tree *io_tree, u64 start,
2356                     struct page *page, u64 ino, unsigned int pg_offset)
2357{
2358        u64 private;
2359        struct io_failure_record *failrec;
2360        struct extent_state *state;
2361        int num_copies;
2362        int ret;
2363
2364        private = 0;
2365        ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
2366                               EXTENT_DIRTY, 0);
2367        if (!ret)
2368                return 0;
2369
2370        failrec = get_state_failrec(failure_tree, start);
2371        if (IS_ERR(failrec))
2372                return 0;
2373
2374        BUG_ON(!failrec->this_mirror);
2375
2376        if (failrec->in_validation) {
2377                /* there was no real error, just free the record */
2378                btrfs_debug(fs_info,
2379                        "clean_io_failure: freeing dummy error at %llu",
2380                        failrec->start);
2381                goto out;
2382        }
2383        if (sb_rdonly(fs_info->sb))
2384                goto out;
2385
2386        spin_lock(&io_tree->lock);
2387        state = find_first_extent_bit_state(io_tree,
2388                                            failrec->start,
2389                                            EXTENT_LOCKED);
2390        spin_unlock(&io_tree->lock);
2391
2392        if (state && state->start <= failrec->start &&
2393            state->end >= failrec->start + failrec->len - 1) {
2394                num_copies = btrfs_num_copies(fs_info, failrec->logical,
2395                                              failrec->len);
2396                if (num_copies > 1)  {
2397                        repair_io_failure(fs_info, ino, start, failrec->len,
2398                                          failrec->logical, page, pg_offset,
2399                                          failrec->failed_mirror);
2400                }
2401        }
2402
2403out:
2404        free_io_failure(failure_tree, io_tree, failrec);
2405
2406        return 0;
2407}
2408
2409/*
2410 * Can be called when
2411 * - holding the extent lock
2412 * - under an ordered extent
2413 * - the inode is being freed
2414 */
2415void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
2416{
2417        struct extent_io_tree *failure_tree = &inode->io_failure_tree;
2418        struct io_failure_record *failrec;
2419        struct extent_state *state, *next;
2420
2421        if (RB_EMPTY_ROOT(&failure_tree->state))
2422                return;
2423
2424        spin_lock(&failure_tree->lock);
2425        state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2426        while (state) {
2427                if (state->start > end)
2428                        break;
2429
2430                ASSERT(state->end <= end);
2431
2432                next = next_state(state);
2433
2434                failrec = state->failrec;
2435                free_extent_state(state);
2436                kfree(failrec);
2437
2438                state = next;
2439        }
2440        spin_unlock(&failure_tree->lock);
2441}
2442
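/*
 * Return the io_failure_record already tracked for @start, or create and
 * track a new one: the extent map is looked up to record the logical address
 * (and compression type), EXTENT_LOCKED | EXTENT_DIRTY is set in the failure
 * tree and EXTENT_DAMAGED in the io tree.  Returns an ERR_PTR on failure.
 */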
2443static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
2444                                                             u64 start, u64 end)
2445{
2446        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2447        struct io_failure_record *failrec;
2448        struct extent_map *em;
2449        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2450        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2451        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2452        int ret;
2453        u64 logical;
2454
2455        failrec = get_state_failrec(failure_tree, start);
2456        if (!IS_ERR(failrec)) {
2457                btrfs_debug(fs_info,
2458                        "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
2459                        failrec->logical, failrec->start, failrec->len,
2460                        failrec->in_validation);
2461                /*
2462                 * when data can be on disk more than twice, add to failrec here
2463                 * (e.g. with a list for failed_mirror) to make
2464                 * clean_io_failure() clean all those errors at once.
2465                 */
2466
2467                return failrec;
2468        }
2469
2470        failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2471        if (!failrec)
2472                return ERR_PTR(-ENOMEM);
2473
2474        failrec->start = start;
2475        failrec->len = end - start + 1;
2476        failrec->this_mirror = 0;
2477        failrec->bio_flags = 0;
2478        failrec->in_validation = 0;
2479
2480        read_lock(&em_tree->lock);
2481        em = lookup_extent_mapping(em_tree, start, failrec->len);
2482        if (!em) {
2483                read_unlock(&em_tree->lock);
2484                kfree(failrec);
2485                return ERR_PTR(-EIO);
2486        }
2487
2488        if (em->start > start || em->start + em->len <= start) {
2489                free_extent_map(em);
2490                em = NULL;
2491        }
2492        read_unlock(&em_tree->lock);
2493        if (!em) {
2494                kfree(failrec);
2495                return ERR_PTR(-EIO);
2496        }
2497
2498        logical = start - em->start;
2499        logical = em->block_start + logical;
2500        if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2501                logical = em->block_start;
2502                failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2503                extent_set_compress_type(&failrec->bio_flags, em->compress_type);
2504        }
2505
2506        btrfs_debug(fs_info,
2507                    "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
2508                    logical, start, failrec->len);
2509
2510        failrec->logical = logical;
2511        free_extent_map(em);
2512
2513        /* Set the bits in the private failure tree */
2514        ret = set_extent_bits(failure_tree, start, end,
2515                              EXTENT_LOCKED | EXTENT_DIRTY);
2516        if (ret >= 0) {
2517                ret = set_state_failrec(failure_tree, start, failrec);
2518                /* Set the bits in the inode's tree */
2519                ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
2520        } else if (ret < 0) {
2521                kfree(failrec);
2522                return ERR_PTR(ret);
2523        }
2524
2525        return failrec;
2526}
2527
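/*
 * Decide whether a failed read can be repaired from another copy.  Returns
 * false if there is only a single copy or no untried mirror is left;
 * otherwise records which mirror to read next in failrec->this_mirror (or,
 * when validation is needed, re-reads the failed mirror to pinpoint the bad
 * sector) and returns true.
 */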
2528static bool btrfs_check_repairable(struct inode *inode, bool needs_validation,
2529                                   struct io_failure_record *failrec,
2530                                   int failed_mirror)
2531{
2532        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2533        int num_copies;
2534
2535        num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
2536        if (num_copies == 1) {
2537                /*
2538                 * we only have a single copy of the data, so don't bother with
2539                 * all the retry and error correction code that follows. no
2540                 * matter what the error is, it is very likely to persist.
2541                 */
2542                btrfs_debug(fs_info,
2543                        "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
2544                        num_copies, failrec->this_mirror, failed_mirror);
2545                return false;
2546        }
2547
2548        /*
2549         * there are two premises:
2550         *      a) deliver good data to the caller
2551         *      b) correct the bad sectors on disk
2552         */
2553        if (needs_validation) {
2554                /*
2555                 * to fulfill b), we need to know the exact failing sectors, as
2556                 * we don't want to rewrite any more than the failed ones. thus,
2557                 * we need separate read requests for the failed bio
2558                 *
2559                 * if the following BUG_ON triggers, our validation request got
2560                 * merged. we need separate requests for our algorithm to work.
2561                 */
2562                BUG_ON(failrec->in_validation);
2563                failrec->in_validation = 1;
2564                failrec->this_mirror = failed_mirror;
2565        } else {
2566                /*
2567                 * we're ready to fulfill a) and b) alongside. get a good copy
2568                 * of the failed sector and if we succeed, we have setup
2569                 * everything for repair_io_failure to do the rest for us.
2570                 */
2571                if (failrec->in_validation) {
2572                        BUG_ON(failrec->this_mirror != failed_mirror);
2573                        failrec->in_validation = 0;
2574                        failrec->this_mirror = 0;
2575                }
2576                failrec->failed_mirror = failed_mirror;
2577                failrec->this_mirror++;
2578                if (failrec->this_mirror == failed_mirror)
2579                        failrec->this_mirror++;
2580        }
2581
2582        if (failrec->this_mirror > num_copies) {
2583                btrfs_debug(fs_info,
2584                        "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
2585                        num_copies, failrec->this_mirror, failed_mirror);
2586                return false;
2587        }
2588
2589        return true;
2590}
2591
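/*
 * Decide whether the failed read has to be re-issued sector by sector to find
 * the bad sector: only needed when the failure was a real I/O error (not a
 * checksum mismatch) and the bio spanned more than one sector.
 */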
2592static bool btrfs_io_needs_validation(struct inode *inode, struct bio *bio)
2593{
2594        u64 len = 0;
2595        const u32 blocksize = inode->i_sb->s_blocksize;
2596
2597        /*
2598         * If bi_status is BLK_STS_OK, then this was a checksum error, not an
2599         * I/O error. In this case, we already know exactly which sector was
2600         * bad, so we don't need to validate.
2601         */
2602        if (bio->bi_status == BLK_STS_OK)
2603                return false;
2604
2605        /*
2606         * We need to validate each sector individually if the failed I/O was
2607         * for multiple sectors.
2608         *
2609         * There are a few possible bios that can end up here:
2610         * 1. A buffered read bio, which is not cloned.
2611         * 2. A direct I/O read bio, which is cloned.
2612         * 3. A (buffered or direct) repair bio, which is not cloned.
2613         *
2614         * For cloned bios (case 2), we can get the size from
2615         * btrfs_io_bio->iter; for non-cloned bios (cases 1 and 3), we can get
2616         * it from the bvecs.
2617         */
2618        if (bio_flagged(bio, BIO_CLONED)) {
2619                if (btrfs_io_bio(bio)->iter.bi_size > blocksize)
2620                        return true;
2621        } else {
2622                struct bio_vec *bvec;
2623                int i;
2624
2625                bio_for_each_bvec_all(bvec, bio, i) {
2626                        len += bvec->bv_len;
2627                        if (len > blocksize)
2628                                return true;
2629                }
2630        }
2631        return false;
2632}
2633
2634blk_status_t btrfs_submit_read_repair(struct inode *inode,
2635                                      struct bio *failed_bio, u32 bio_offset,
2636                                      struct page *page, unsigned int pgoff,
2637                                      u64 start, u64 end, int failed_mirror,
2638                                      submit_bio_hook_t *submit_bio_hook)
2639{
2640        struct io_failure_record *failrec;
2641        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2642        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2643        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2644        struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio);
2645        const int icsum = bio_offset >> fs_info->sectorsize_bits;
2646        bool need_validation;
2647        struct bio *repair_bio;
2648        struct btrfs_io_bio *repair_io_bio;
2649        blk_status_t status;
2650
2651        btrfs_debug(fs_info,
2652                   "repair read error: read error at %llu", start);
2653
2654        BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2655
2656        failrec = btrfs_get_io_failure_record(inode, start, end);
2657        if (IS_ERR(failrec))
2658                return errno_to_blk_status(PTR_ERR(failrec));
2659
2660        need_validation = btrfs_io_needs_validation(inode, failed_bio);
2661
2662        if (!btrfs_check_repairable(inode, need_validation, failrec,
2663                                    failed_mirror)) {
2664                free_io_failure(failure_tree, tree, failrec);
2665                return BLK_STS_IOERR;
2666        }
2667
2668        repair_bio = btrfs_io_bio_alloc(1);
2669        repair_io_bio = btrfs_io_bio(repair_bio);
2670        repair_bio->bi_opf = REQ_OP_READ;
2671        if (need_validation)
2672                repair_bio->bi_opf |= REQ_FAILFAST_DEV;
2673        repair_bio->bi_end_io = failed_bio->bi_end_io;
2674        repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
2675        repair_bio->bi_private = failed_bio->bi_private;
2676
2677        if (failed_io_bio->csum) {
2678                const u32 csum_size = fs_info->csum_size;
2679
2680                repair_io_bio->csum = repair_io_bio->csum_inline;
2681                memcpy(repair_io_bio->csum,
2682                       failed_io_bio->csum + csum_size * icsum, csum_size);
2683        }
2684
2685        bio_add_page(repair_bio, page, failrec->len, pgoff);
2686        repair_io_bio->logical = failrec->start;
2687        repair_io_bio->iter = repair_bio->bi_iter;
2688
2689        btrfs_debug(btrfs_sb(inode->i_sb),
2690"repair read error: submitting new read to mirror %d, in_validation=%d",
2691                    failrec->this_mirror, failrec->in_validation);
2692
2693        status = submit_bio_hook(inode, repair_bio, failrec->this_mirror,
2694                                 failrec->bio_flags);
2695        if (status) {
2696                free_io_failure(failure_tree, tree, failrec);
2697                bio_put(repair_bio);
2698        }
2699        return status;
2700}
2701
2702/* lots and lots of room for performance fixes in the end_bio funcs */
2703
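/*
 * Finish the ordered extent accounting for a written page range and, on
 * error, mark the page and its mapping with the error.
 */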
2704void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2705{
2706        int uptodate = (err == 0);
2707        int ret = 0;
2708
2709        btrfs_writepage_endio_finish_ordered(page, start, end, uptodate);
2710
2711        if (!uptodate) {
2712                ClearPageUptodate(page);
2713                SetPageError(page);
2714                ret = err < 0 ? err : -EIO;
2715                mapping_set_error(page->mapping, ret);
2716        }
2717}
2718
2719/*
2720 * after a writepage IO is done, we need to:
2721 * clear the uptodate bits on error
2722 * clear the writeback bits in the extent tree for this IO
2723 * end_page_writeback if the page has no more pending IO
2724 *
2725 * Scheduling is not allowed, so the extent state tree is expected
2726 * to have one and only one object corresponding to this IO.
2727 */
2728static void end_bio_extent_writepage(struct bio *bio)
2729{
2730        int error = blk_status_to_errno(bio->bi_status);
2731        struct bio_vec *bvec;
2732        u64 start;
2733        u64 end;
2734        struct bvec_iter_all iter_all;
2735
2736        ASSERT(!bio_flagged(bio, BIO_CLONED));
2737        bio_for_each_segment_all(bvec, bio, iter_all) {
2738                struct page *page = bvec->bv_page;
2739                struct inode *inode = page->mapping->host;
2740                struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2741
2742                /* We always issue full-page writes, but if some block
2743                 * in a page fails to write, blk_update_request() will
2744                 * advance bv_offset and adjust bv_len to compensate.
2745                 * Print a warning for nonzero offsets, and an error
2746                 * if they don't add up to a full page.  */
2747                if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
2748                        if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
2749                                btrfs_err(fs_info,
2750                                   "partial page write in btrfs with offset %u and length %u",
2751                                        bvec->bv_offset, bvec->bv_len);
2752                        else
2753                                btrfs_info(fs_info,
2754                                   "incomplete page write in btrfs with offset %u and length %u",
2755                                        bvec->bv_offset, bvec->bv_len);
2756                }
2757
2758                start = page_offset(page);
2759                end = start + bvec->bv_offset + bvec->bv_len - 1;
2760
2761                end_extent_writepage(page, error, start, end);
2762                end_page_writeback(page);
2763        }
2764
2765        bio_put(bio);
2766}
2767
2768/*
2769 * Record previously processed extent range
2770 *
2771 * Used by endio_readpage_release_extent() to batch up a full extent range,
2772 * reducing the number of extent io tree operations.
2773 */
2774struct processed_extent {
2775        struct btrfs_inode *inode;
2776        /* Start of the range in @inode */
2777        u64 start;
2778        /* End of the range in @inode */
2779        u64 end;
2780        bool uptodate;
2781};
2782
2783/*
2784 * Try to release processed extent range
2785 *
2786 * May not release the extent range right now if the current range is
2787 * contiguous to processed extent.
2788 *
2789 * Will release the processed extent when @inode or @uptodate differs from
2790 * the processed one, or when the range is no longer contiguous to it.
2791 *
2792 * Passing @inode == NULL will force processed extent to be released.
2793 */
2794static void endio_readpage_release_extent(struct processed_extent *processed,
2795                              struct btrfs_inode *inode, u64 start, u64 end,
2796                              bool uptodate)
2797{
2798        struct extent_state *cached = NULL;
2799        struct extent_io_tree *tree;
2800
2801        /* The first extent, initialize @processed */
2802        if (!processed->inode)
2803                goto update;
2804
2805        /*
2806         * Contiguous to the processed extent, just update the end.
2807         *
2808         * Several things to notice:
2809         *
2810         * - bio can be merged as long as the on-disk bytenr is contiguous
2811         *   This means we can have pages belonging to other inodes, thus we
2812         *   need to check if the inode still matches.
2813         * - a bvec can contain a range beyond the current page for multi-page bvecs
2814         *   Thus we need the processed->end + 1 >= start check
2815         */
2816        if (processed->inode == inode && processed->uptodate == uptodate &&
2817            processed->end + 1 >= start && end >= processed->end) {
2818                processed->end = end;
2819                return;
2820        }
2821
2822        tree = &processed->inode->io_tree;
2823        /*
2824         * Now we don't have range contiguous to the processed range, release
2825         * the processed range now.
2826         */
2827        if (processed->uptodate && tree->track_uptodate)
2828                set_extent_uptodate(tree, processed->start, processed->end,
2829                                    &cached, GFP_ATOMIC);
2830        unlock_extent_cached_atomic(tree, processed->start, processed->end,
2831                                    &cached);
2832
2833update:
2834        /* Update processed to current range */
2835        processed->inode = inode;
2836        processed->start = start;
2837        processed->end = end;
2838        processed->uptodate = uptodate;
2839}
2840
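/*
 * Set the per-page uptodate or error status after a read completes and
 * unlock the page.
 */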
2841static void endio_readpage_update_page_status(struct page *page, bool uptodate)
2842{
2843        if (uptodate) {
2844                SetPageUptodate(page);
2845        } else {
2846                ClearPageUptodate(page);
2847                SetPageError(page);
2848        }
2849        unlock_page(page);
2850}
2851
2852/*
2853 * after a readpage IO is done, we need to:
2854 * clear the uptodate bits on error
2855 * set the uptodate bits if things worked
2856 * set the page up to date if all extents in the tree are uptodate
2857 * clear the lock bit in the extent tree
2858 * unlock the page if there are no other extents locked for it
2859 *
2860 * Scheduling is not allowed, so the extent state tree is expected
2861 * to have one and only one object corresponding to this IO.
2862 */
2863static void end_bio_extent_readpage(struct bio *bio)
2864{
2865        struct bio_vec *bvec;
2866        int uptodate = !bio->bi_status;
2867        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
2868        struct extent_io_tree *tree, *failure_tree;
2869        struct processed_extent processed = { 0 };
2870        /*
2871         * The offset to the beginning of a bio, since one bio can never be
2872         * larger than UINT_MAX, u32 here is enough.
2873         */
2874        u32 bio_offset = 0;
2875        int mirror;
2876        int ret;
2877        struct bvec_iter_all iter_all;
2878
2879        ASSERT(!bio_flagged(bio, BIO_CLONED));
2880        bio_for_each_segment_all(bvec, bio, iter_all) {
2881                struct page *page = bvec->bv_page;
2882                struct inode *inode = page->mapping->host;
2883                struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2884                const u32 sectorsize = fs_info->sectorsize;
2885                u64 start;
2886                u64 end;
2887                u32 len;
2888
2889                btrfs_debug(fs_info,
2890                        "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
2891                        bio->bi_iter.bi_sector, bio->bi_status,
2892                        io_bio->mirror_num);
2893                tree = &BTRFS_I(inode)->io_tree;
2894                failure_tree = &BTRFS_I(inode)->io_failure_tree;
2895
2896                /*
2897                 * We always issue full-sector reads, but if some block in a
2898                 * page fails to read, blk_update_request() will advance
2899                 * bv_offset and adjust bv_len to compensate.  Print an error
2900                 * for unaligned offsets, and an informational message if the
2901                 * range doesn't end on a sector boundary.
2902                 */
2903                if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
2904                        btrfs_err(fs_info,
2905                "partial page read in btrfs with offset %u and length %u",
2906                                  bvec->bv_offset, bvec->bv_len);
2907                else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
2908                                     sectorsize))
2909                        btrfs_info(fs_info,
2910                "incomplete page read with offset %u and length %u",
2911                                   bvec->bv_offset, bvec->bv_len);
2912
2913                start = page_offset(page) + bvec->bv_offset;
2914                end = start + bvec->bv_len - 1;
2915                len = bvec->bv_len;
2916
2917                mirror = io_bio->mirror_num;
2918                if (likely(uptodate)) {
2919                        if (is_data_inode(inode))
2920                                ret = btrfs_verify_data_csum(io_bio,
2921                                                bio_offset, page, start, end,
2922                                                mirror);
2923                        else
2924                                ret = btrfs_validate_metadata_buffer(io_bio,
2925                                        page, start, end, mirror);
2926                        if (ret)
2927                                uptodate = 0;
2928                        else
2929                                clean_io_failure(BTRFS_I(inode)->root->fs_info,
2930                                                 failure_tree, tree, start,
2931                                                 page,
2932                                                 btrfs_ino(BTRFS_I(inode)), 0);
2933                }
2934
2935                if (likely(uptodate))
2936                        goto readpage_ok;
2937
2938                if (is_data_inode(inode)) {
2939
2940                        /*
2941                         * btrfs_submit_read_repair() handles errors the
2942                         * following way: if possible, new read requests are
2943                         * created and submitted and will end up in
2944                         * end_bio_extent_readpage as well (if we're lucky,
2945                         * not in the !uptodate case). In that case it returns
2946                         * 0 and we just go on with the next page in our bio.
2947                         * If it can't handle the error it will return an
2948                         * error and we remain responsible for that page.
2949                         */
2950                        if (!btrfs_submit_read_repair(inode, bio, bio_offset,
2951                                                page,
2952                                                start - page_offset(page),
2953                                                start, end, mirror,
2954                                                btrfs_submit_data_bio)) {
2955                                uptodate = !bio->bi_status;
2956                                ASSERT(bio_offset + len > bio_offset);
2957                                bio_offset += len;
2958                                continue;
2959                        }
2960                } else {
2961                        struct extent_buffer *eb;
2962
2963                        eb = (struct extent_buffer *)page->private;
2964                        set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
2965                        eb->read_mirror = mirror;
2966                        atomic_dec(&eb->io_pages);
2967                        if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
2968                                               &eb->bflags))
2969                                btree_readahead_hook(eb, -EIO);
2970                }
2971readpage_ok:
2972                if (likely(uptodate)) {
2973                        loff_t i_size = i_size_read(inode);
2974                        pgoff_t end_index = i_size >> PAGE_SHIFT;
2975                        unsigned off;
2976
2977                        /* Zero out the end if this page straddles i_size */
2978                        off = offset_in_page(i_size);
2979                        if (page->index == end_index && off)
2980                                zero_user_segment(page, off, PAGE_SIZE);
2981                }
2982                ASSERT(bio_offset + len > bio_offset);
2983                bio_offset += len;
2984
2985                /* Update page status and unlock */
2986                endio_readpage_update_page_status(page, uptodate);
2987                endio_readpage_release_extent(&processed, BTRFS_I(inode),
2988                                              start, end, uptodate);
2989        }
2990        /* Release the last extent */
2991        endio_readpage_release_extent(&processed, NULL, 0, 0, false);
2992        btrfs_io_bio_free_csum(io_bio);
2993        bio_put(bio);
2994}
2995
2996/*
2997 * Initialize the members up to but not including 'bio'. Use this after
2998 * allocating a new bio with bio_alloc_bioset, which does not initialize the
2999 * bytes before 'bio' because __GFP_ZERO is not supported there.
3000 */
3001static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
3002{
3003        memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
3004}
3005
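/*
 * Illustrative sketch, not part of the original file: the offsetof() trick
 * used above, applied to a made-up wrapper struct.  Only the members in front
 * of the embedded 'bio' are zeroed; the bio itself is already initialized by
 * the bioset allocation.
 */
struct example_bio_wrapper {
	u64 logical;
	unsigned int mirror;
	struct bio bio;		/* must stay the last member */
};

static inline void __maybe_unused example_bio_wrapper_init(struct example_bio_wrapper *w)
{
	memset(w, 0, offsetof(struct example_bio_wrapper, bio));
}
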
3006/*
3007 * The following helpers allocate a bio. As they are backed by a bioset, they
3008 * never fail.  We return a struct bio, but callers can use btrfs_io_bio() for
3009 * the appropriate container_of() conversion.
3010 */
3011struct bio *btrfs_bio_alloc(u64 first_byte)
3012{
3013        struct bio *bio;
3014
3015        bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset);
3016        bio->bi_iter.bi_sector = first_byte >> 9;
3017        btrfs_io_bio_init(btrfs_io_bio(bio));
3018        return bio;
3019}
3020
3021struct bio *btrfs_bio_clone(struct bio *bio)
3022{
3023        struct btrfs_io_bio *btrfs_bio;
3024        struct bio *new;
3025
3026        /* Bio allocation backed by a bioset does not fail */
3027        new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
3028        btrfs_bio = btrfs_io_bio(new);
3029        btrfs_io_bio_init(btrfs_bio);
3030        btrfs_bio->iter = bio->bi_iter;
3031        return new;
3032}
3033
3034struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
3035{
3036        struct bio *bio;
3037
3038        /* Bio allocation backed by a bioset does not fail */
3039        bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
3040        btrfs_io_bio_init(btrfs_io_bio(bio));
3041        return bio;
3042}
3043
3044struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
3045{
3046        struct bio *bio;
3047        struct btrfs_io_bio *btrfs_bio;
3048
3049        /* this will never fail when it's backed by a bioset */
3050        bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
3051        ASSERT(bio);
3052
3053        btrfs_bio = btrfs_io_bio(bio);
3054        btrfs_io_bio_init(btrfs_bio);
3055
3056        bio_trim(bio, offset >> 9, size >> 9);
3057        btrfs_bio->iter = bio->bi_iter;
3058        return bio;
3059}
3060
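/*
 * Illustrative sketch, not part of the original file: the minimal use of the
 * allocation helpers above.  'disk_bytenr' and 'page' are assumed to be
 * supplied by the caller; the returned bio still needs an end_io callback and
 * submission.
 */
static __maybe_unused struct bio *example_alloc_read_bio(u64 disk_bytenr,
							  struct page *page)
{
	struct bio *bio = btrfs_bio_alloc(disk_bytenr);

	bio->bi_opf = REQ_OP_READ;
	/* A freshly allocated bio always has room for the first page. */
	bio_add_page(bio, page, PAGE_SIZE, 0);
	return bio;
}
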
3061/*
3062 * @opf:        bio REQ_OP_* and REQ_* flags as one value
3063 * @wbc:        optional writeback control for io accounting
3064 * @page:       page to add to the bio
3065 * @pg_offset:  offset within @page where the data starts
3066 * @size:       number of bytes of the page to submit
3067 * @offset:     logical (disk bytenr) offset of the range being submitted
3068 * @bio_ret:    must be valid pointer, newly allocated bio will be stored there
3069 * @end_io_func:     end_io callback for new bio
3070 * @mirror_num:      desired mirror to read/write
3071 * @prev_bio_flags:  flags of previous bio to see if we can merge the current one
3072 * @bio_flags:  flags of the current bio to see if we can merge them
3073 * @force_bio_submit: submit the current bio even if it could be merged
3074 */
3075static int submit_extent_page(unsigned int opf,
3076                              struct writeback_control *wbc,
3077                              struct page *page, u64 offset,
3078                              size_t size, unsigned long pg_offset,
3079                              struct bio **bio_ret,
3080                              bio_end_io_t end_io_func,
3081                              int mirror_num,
3082                              unsigned long prev_bio_flags,
3083                              unsigned long bio_flags,
3084                              bool force_bio_submit)
3085{
3086        int ret = 0;
3087        struct bio *bio;
3088        size_t io_size = min_t(size_t, size, PAGE_SIZE);
3089        sector_t sector = offset >> 9;
3090        struct extent_io_tree *tree = &BTRFS_I(page->mapping->host)->io_tree;
3091
3092        ASSERT(bio_ret);
3093
3094        if (*bio_ret) {
3095                bool contig;
3096                bool can_merge = true;
3097
3098                bio = *bio_ret;
3099                if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
3100                        contig = bio->bi_iter.bi_sector == sector;
3101                else
3102                        contig = bio_end_sector(bio) == sector;
3103
3104                if (btrfs_bio_fits_in_stripe(page, io_size, bio, bio_flags))
3105                        can_merge = false;
3106
3107                if (prev_bio_flags != bio_flags || !contig || !can_merge ||
3108                    force_bio_submit ||
3109                    bio_add_page(bio, page, io_size, pg_offset) < io_size) {
3110                        ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
3111                        if (ret < 0) {
3112                                *bio_ret = NULL;
3113                                return ret;
3114                        }
3115                        bio = NULL;
3116                } else {
3117                        if (wbc)
3118                                wbc_account_cgroup_owner(wbc, page, io_size);
3119                        return 0;
3120                }
3121        }
3122
3123        bio = btrfs_bio_alloc(offset);
3124        bio_add_page(bio, page, io_size, pg_offset);
3125        bio->bi_end_io = end_io_func;
3126        bio->bi_private = tree;
3127        bio->bi_write_hint = page->mapping->host->i_write_hint;
3128        bio->bi_opf = opf;
3129        if (wbc) {
3130                struct block_device *bdev;
3131
3132                bdev = BTRFS_I(page->mapping->host)->root->fs_info->fs_devices->latest_bdev;
3133                bio_set_dev(bio, bdev);
3134                wbc_init_bio(wbc, bio);
3135                wbc_account_cgroup_owner(wbc, page, io_size);
3136        }
3137
3138        *bio_ret = bio;
3139
3140        return ret;
3141}
3142
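/*
 * Illustrative sketch, not part of the original file: the calling convention
 * of submit_extent_page().  The caller owns *bio_ret across calls so that
 * contiguous pages keep being added to one bio; whatever bio is still being
 * built at the end has to be submitted explicitly via submit_one_bio(), the
 * file-local helper submit_extent_page() itself uses.  The disk bytenr and
 * pages are assumed to be provided by the caller.
 */
static int __maybe_unused example_submit_two_read_pages(struct page *pages[2],
							 u64 disk_bytenr)
{
	struct bio *bio = NULL;
	int ret;

	ret = submit_extent_page(REQ_OP_READ, NULL, pages[0], disk_bytenr,
				 PAGE_SIZE, 0, &bio, end_bio_extent_readpage,
				 0, 0, 0, false);
	if (ret)
		return ret;
	ret = submit_extent_page(REQ_OP_READ, NULL, pages[1],
				 disk_bytenr + PAGE_SIZE, PAGE_SIZE, 0, &bio,
				 end_bio_extent_readpage, 0, 0, 0, false);
	if (ret)
		return ret;
	/* Flush the bio that is still being built. */
	if (bio)
		ret = submit_one_bio(bio, 0, 0);
	return ret;
}
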
3143static void attach_extent_buffer_page(struct extent_buffer *eb,
3144                                      struct page *page)
3145{
3146        /*
3147         * If the page is mapped to the btree inode, we should hold the private
3148         * lock to prevent races.
3149         * For cloned or dummy extent buffers, their pages are not mapped and
3150         * will not race with any other ebs.
3151         */
3152        if (page->mapping)
3153                lockdep_assert_held(&page->mapping->private_lock);
3154
3155        if (!PagePrivate(page))
3156                attach_page_private(page, eb);
3157        else
3158                WARN_ON(page->private != (unsigned long)eb);
3159}
3160
3161void set_page_extent_mapped(struct page *page)
3162{
3163        if (!PagePrivate(page))
3164                attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
3165}
3166
3167static struct extent_map *
3168__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
3169                 u64 start, u64 len, struct extent_map **em_cached)
3170{
3171        struct extent_map *em;
3172
3173        if (em_cached && *em_cached) {
3174                em = *em_cached;
3175                if (extent_map_in_tree(em) && start >= em->start &&
3176                    start < extent_map_end(em)) {
3177                        refcount_inc(&em->refs);
3178                        return em;
3179                }
3180
3181                free_extent_map(em);
3182                *em_cached = NULL;
3183        }
3184
3185        em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
3186        if (em_cached && !IS_ERR_OR_NULL(em)) {
3187                BUG_ON(*em_cached);
3188                refcount_inc(&em->refs);
3189                *em_cached = em;
3190        }
3191        return em;
3192}
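
/*
 * Illustrative sketch, not part of the original file: how the @em_cached
 * argument above is meant to be used by a caller walking a file range.  The
 * cached extent map is reused while the offset stays inside it, and the
 * caller must drop both the per-iteration reference and the cached one.
 */
static void __maybe_unused example_walk_extent_maps(struct inode *inode,
						    u64 start, u64 len)
{
	struct extent_map *em_cached = NULL;
	u64 cur = start;

	while (cur < start + len) {
		struct extent_map *em;

		em = __get_extent_map(inode, NULL, 0, cur,
				      start + len - cur, &em_cached);
		if (IS_ERR_OR_NULL(em))
			break;
		cur = extent_map_end(em);
		free_extent_map(em);
	}
	if (em_cached)
		free_extent_map(em_cached);
}
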
3193/*
3194 * Basic readpage implementation.  Locked extent state structs are inserted
3195 * into the tree and removed when the IO is done (by the end_io handlers).
3196 *
3197 * XXX JDM: This needs looking at to ensure proper page locking
3198 * Return 0 on success, otherwise return an error.
3199 */
3200int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
3201                      struct bio **bio, unsigned long *bio_flags,
3202                      unsigned int read_flags, u64 *prev_em_start)
3203{
3204        struct inode *inode = page->mapping->host;
3205        u64 start = page_offset(page);
3206        const u64 end = start + PAGE_SIZE - 1;
3207        u64 cur = start;
3208        u64 extent_offset;
3209        u64 last_byte = i_size_read(inode);
3210        u64 block_start;
3211        u64 cur_end;
3212        struct extent_map *em;
3213        int ret = 0;
3214        int nr = 0;
3215        size_t pg_offset = 0;
3216        size_t iosize;
3217        size_t blocksize = inode->i_sb->s_blocksize;
3218        unsigned long this_bio_flag = 0;
3219        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
3220
3221        set_page_extent_mapped(page);
3222
3223        if (!PageUptodate(page)) {
3224                if (cleancache_get_page(page) == 0) {
3225                        BUG_ON(blocksize != PAGE_SIZE);
3226                        unlock_extent(tree, start, end);
3227                        goto out;
3228                }
3229        }
3230
3231        if (page->index == last_byte >> PAGE_SHIFT) {
3232                char *userpage;
3233                size_t zero_offset = offset_in_page(last_byte);
3234
3235                if (zero_offset) {
3236                        iosize = PAGE_SIZE - zero_offset;
3237                        userpage = kmap_atomic(page);
3238                        memset(userpage + zero_offset, 0, iosize);
3239                        flush_dcache_page(page);
3240                        kunmap_atomic(userpage);
3241                }
3242        }
3243        while (cur <= end) {
3244                bool force_bio_submit = false;
3245                u64 offset;
3246
3247                if (cur >= last_byte) {
3248                        char *userpage;
3249                        struct extent_state *cached = NULL;
3250
3251                        iosize = PAGE_SIZE - pg_offset;
3252                        userpage = kmap_atomic(page);
3253                        memset(userpage + pg_offset, 0, iosize);
3254                        flush_dcache_page(page);
3255                        kunmap_atomic(userpage);
3256                        set_extent_uptodate(tree, cur, cur + iosize - 1,
3257                                            &cached, GFP_NOFS);
3258                        unlock_extent_cached(tree, cur,
3259                                             cur + iosize - 1, &cached);
3260                        break;
3261                }
3262                em = __get_extent_map(inode, page, pg_offset, cur,
3263                                      end - cur + 1, em_cached);
3264                if (IS_ERR_OR_NULL(em)) {
3265                        SetPageError(page);
3266                        unlock_extent(tree, cur, end);
3267                        break;
3268                }
3269                extent_offset = cur - em->start;
3270                BUG_ON(extent_map_end(em) <= cur);
3271                BUG_ON(end < cur);
3272
3273                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
3274                        this_bio_flag |= EXTENT_BIO_COMPRESSED;
3275                        extent_set_compress_type(&this_bio_flag,
3276                                                 em->compress_type);
3277                }
3278
3279                iosize = min(extent_map_end(em) - cur, end - cur + 1);
3280                cur_end = min(extent_map_end(em) - 1, end);
3281                iosize = ALIGN(iosize, blocksize);
3282                if (this_bio_flag & EXTENT_BIO_COMPRESSED)
3283                        offset = em->block_start;
3284                else
3285                        offset = em->block_start + extent_offset;
3286                block_start = em->block_start;
3287                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3288                        block_start = EXTENT_MAP_HOLE;
3289
3290                /*
3291                 * If we have a file range that points to a compressed extent
3292                 * and it's followed by a consecutive file range that points
3293                 * to the same compressed extent (possibly with a different
3294                 * offset and/or length, so it either points to the whole extent
3295                 * or only part of it), we must make sure we do not submit a
3296                 * single bio to populate the pages for the 2 ranges because
3297                 * this makes the compressed extent read zero out the pages
3298                 * belonging to the 2nd range. Imagine the following scenario:
3299                 *
3300                 *  File layout
3301                 *  [0 - 8K]                     [8K - 24K]
3302                 *    |                               |
3303                 *    |                               |
3304                 * points to extent X,         points to extent X,
3305                 * offset 4K, length of 8K     offset 0, length 16K
3306                 *
3307                 * [extent X, compressed length = 4K uncompressed length = 16K]
3308                 *
3309                 * If the bio to read the compressed extent covers both ranges,
3310                 * it will decompress extent X into the pages belonging to the
3311                 * first range and then it will stop, zeroing out the remaining
3312                 * pages that belong to the other range that points to extent X.
3313                 * So here we make sure we submit 2 bios, one for the first
3314                 * range and another one for the second range. Both will target
3315                 * the same physical extent from disk, but we can't currently
3316                 * make the compressed bio endio callback populate the pages
3317                 * for both ranges because each compressed bio is tightly
3318                 * coupled with a single extent map, and each range can have
3319                 * an extent map with a different offset value relative to the
3320                 * uncompressed data of our extent and different lengths. This
3321                 * is a corner case so we prioritize correctness over
3322                 * non-optimal behavior (submitting 2 bios for the same extent).
3323                 */
3324                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
3325                    prev_em_start && *prev_em_start != (u64)-1 &&
3326                    *prev_em_start != em->start)
3327                        force_bio_submit = true;
3328
3329                if (prev_em_start)
3330                        *prev_em_start = em->start;
3331
3332                free_extent_map(em);
3333                em = NULL;
3334
3335                /* we've found a hole, just zero and go on */
3336                if (block_start == EXTENT_MAP_HOLE) {
3337                        char *userpage;
3338                        struct extent_state *cached = NULL;
3339
3340                        userpage = kmap_atomic(page);
3341                        memset(userpage + pg_offset, 0, iosize);
3342                        flush_dcache_page(page);
3343                        kunmap_atomic(userpage);
3344
3345                        set_extent_uptodate(tree, cur, cur + iosize - 1,
3346                                            &cached, GFP_NOFS);
3347                        unlock_extent_cached(tree, cur,
3348                                             cur + iosize - 1, &cached);
3349                        cur = cur + iosize;
3350                        pg_offset += iosize;
3351                        continue;
3352                }
3353                /* the get_extent function already copied into the page */
3354                if (test_range_bit(tree, cur, cur_end,
3355                                   EXTENT_UPTODATE, 1, NULL)) {
3356                        check_page_uptodate(tree, page);
3357                        unlock_extent(tree, cur, cur + iosize - 1);
3358                        cur = cur + iosize;
3359                        pg_offset += iosize;
3360                        continue;
3361                }
3362                /* we have an inline extent but it didn't get marked up
3363                 * to date.  Error out
3364                 */
3365                if (block_start == EXTENT_MAP_INLINE) {
3366                        SetPageError(page);
3367                        unlock_extent(tree, cur, cur + iosize - 1);
3368                        cur = cur + iosize;
3369                        pg_offset += iosize;
3370                        continue;
3371                }
3372
3373                ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
3374                                         page, offset, iosize,
3375                                         pg_offset, bio,
3376                                         end_bio_extent_readpage, 0,
3377                                         *bio_flags,
3378                                         this_bio_flag,
3379                                         force_bio_submit);
3380                if (!ret) {
3381                        nr++;
3382                        *bio_flags = this_bio_flag;
3383                } else {
3384                        SetPageError(page);
3385                        unlock_extent(tree, cur, cur + iosize - 1);
3386                        goto out;
3387                }
3388                cur = cur + iosize;
3389                pg_offset += iosize;
3390        }
3391out:
3392        if (!nr) {
3393                if (!PageError(page))
3394                        SetPageUptodate(page);
3395                unlock_page(page);
3396        }
3397        return ret;
3398}
3399
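/*
 * Illustrative sketch, not part of the original file: roughly what a
 * single-page read entry point has to do around btrfs_do_readpage().  Any
 * ordered extent covering the page is flushed first, and a bio still being
 * built when the helper returns must be submitted explicitly.
 */
static int __maybe_unused example_read_one_page(struct page *page)
{
	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
	u64 start = page_offset(page);
	u64 end = start + PAGE_SIZE - 1;
	unsigned long bio_flags = 0;
	struct bio *bio = NULL;
	int ret;

	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);

	ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL);
	if (bio)
		ret = submit_one_bio(bio, 0, bio_flags);
	return ret;
}
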
3400static inline void contiguous_readpages(struct page *pages[], int nr_pages,
3401                                             u64 start, u64 end,
3402                                             struct extent_map **em_cached,
3403                                             struct bio **bio,
3404                                             unsigned long *bio_flags,
3405                                             u64 *prev_em_start)
3406{
3407        struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
3408        int index;
3409
3410        btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
3411
3412        for (index = 0; index < nr_pages; index++) {
3413                btrfs_do_readpage(pages[index], em_cached, bio, bio_flags,
3414                                  REQ_RAHEAD, prev_em_start);
3415                put_page(pages[index]);
3416        }
3417}
3418
3419static void update_nr_written(struct writeback_control *wbc,
3420                              unsigned long nr_written)
3421{
3422        wbc->nr_to_write -= nr_written;
3423}
3424
3425/*
3426 * helper for __extent_writepage, doing all of the delayed allocation setup.
3427 *
3428 * This returns 1 if btrfs_run_delalloc_range() did all the work required
3429 * to write the page (copy into inline extent).  In this case the IO has
3430 * been started and the page is already unlocked.
3431 *
3432 * This returns 0 if all went well (page still locked)
3433 * This returns < 0 if there were errors (page still locked)
3434 */
3435static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
3436                struct page *page, struct writeback_control *wbc,
3437                u64 delalloc_start, unsigned long *nr_written)
3438{
3439        u64 page_end = delalloc_start + PAGE_SIZE - 1;
3440        bool found;
3441        u64 delalloc_to_write = 0;
3442        u64 delalloc_end = 0;
3443        int ret;
3444        int page_started = 0;
3445
3446
3447        while (delalloc_end < page_end) {
3448                found = find_lock_delalloc_range(&inode->vfs_inode, page,
3449                                               &delalloc_start,
3450                                               &delalloc_end);
3451                if (!found) {
3452                        delalloc_start = delalloc_end + 1;
3453                        continue;
3454                }
3455                ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
3456                                delalloc_end, &page_started, nr_written, wbc);
3457                if (ret) {
3458                        SetPageError(page);
3459                        /*
3460                         * btrfs_run_delalloc_range should return < 0 on error,
3461                         * but just in case: a return of > 0 means the IO has
3462                         * started, so don't return > 0 unless things really
3463                         * went well.
3464                         */
3465                        return ret < 0 ? ret : -EIO;
3466                }
3467                /*
3468                 * delalloc_end is already one less than the total length, so
3469                 * we don't subtract one from PAGE_SIZE
3470                 */
3471                delalloc_to_write += (delalloc_end - delalloc_start +
3472                                      PAGE_SIZE) >> PAGE_SHIFT;
3473                delalloc_start = delalloc_end + 1;
3474        }
3475        if (wbc->nr_to_write < delalloc_to_write) {
3476                int thresh = 8192;
3477
3478                if (delalloc_to_write < thresh * 2)
3479                        thresh = delalloc_to_write;
3480                wbc->nr_to_write = min_t(u64, delalloc_to_write,
3481                                         thresh);
3482        }
3483
3484        /* Did btrfs_run_delalloc_range() already unlock the page and
3485         * start the IO?
3486         */
3487        if (page_started) {
3488                /*
3489                 * we've unlocked the page, so we can't update
3490                 * the mapping's writeback index, just update
3491                 * nr_to_write.
3492                 */
3493                wbc->nr_to_write -= *nr_written;
3494                return 1;
3495        }
3496
3497        return 0;
3498}
3499
3500/*
3501 * helper for __extent_writepage.  This calls the writepage start hooks,
3502 * and does the loop to map the page into extents and bios.
3503 *
3504 * We return 1 if the IO is started and the page is unlocked,
3505 * 0 if all went well (page still locked)
3506 * < 0 if there were errors (page still locked)
3507 */
3508static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
3509                                 struct page *page,
3510                                 struct writeback_control *wbc,
3511                                 struct extent_page_data *epd,
3512                                 loff_t i_size,
3513                                 unsigned long nr_written,
3514                                 int *nr_ret)
3515{
3516        struct extent_io_tree *tree = &inode->io_tree;
3517        u64 start = page_offset(page);
3518        u64 page_end = start + PAGE_SIZE - 1;
3519        u64 end;
3520        u64 cur = start;
3521        u64 extent_offset;
3522        u64 block_start;
3523        u64 iosize;
3524        struct extent_map *em;
3525        size_t pg_offset = 0;
3526        size_t blocksize;
3527        int ret = 0;
3528        int nr = 0;
3529        const unsigned int write_flags = wbc_to_write_flags(wbc);
3530        bool compressed;
3531
3532        ret = btrfs_writepage_cow_fixup(page, start, page_end);
3533        if (ret) {
3534                /* Fixup worker will requeue */
3535                redirty_page_for_writepage(wbc, page);
3536                update_nr_written(wbc, nr_written);
3537                unlock_page(page);
3538                return 1;
3539        }
3540
3541        /*
3542         * we don't want to touch the inode after unlocking the page,
3543         * so we update the mapping writeback index now
3544         */
3545        update_nr_written(wbc, nr_written + 1);
3546
3547        end = page_end;
3548        blocksize = inode->vfs_inode.i_sb->s_blocksize;
3549
3550        while (cur <= end) {
3551                u64 em_end;
3552                u64 offset;
3553
3554                if (cur >= i_size) {
3555                        btrfs_writepage_endio_finish_ordered(page, cur,
3556                                                             page_end, 1);
3557                        break;
3558                }
3559                em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
3560                if (IS_ERR_OR_NULL(em)) {
3561                        SetPageError(page);
3562                        ret = PTR_ERR_OR_ZERO(em);
3563                        break;
3564                }
3565
3566                extent_offset = cur - em->start;
3567                em_end = extent_map_end(em);
3568                BUG_ON(em_end <= cur);
3569                BUG_ON(end < cur);
3570                iosize = min(em_end - cur, end - cur + 1);
3571                iosize = ALIGN(iosize, blocksize);
3572                offset = em->block_start + extent_offset;
3573                block_start = em->block_start;
3574                compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
3575                free_extent_map(em);
3576                em = NULL;
3577
3578                /*
3579                 * compressed and inline extents are written through other
3580                 * paths in the FS
3581                 */
3582                if (compressed || block_start == EXTENT_MAP_HOLE ||
3583                    block_start == EXTENT_MAP_INLINE) {
3584                        if (compressed)
3585                                nr++;
3586                        else
3587                                btrfs_writepage_endio_finish_ordered(page, cur,
3588                                                        cur + iosize - 1, 1);
3589                        cur += iosize;
3590                        pg_offset += iosize;
3591                        continue;
3592                }
3593
3594                btrfs_set_range_writeback(tree, cur, cur + iosize - 1);
3595                if (!PageWriteback(page)) {
3596                        btrfs_err(inode->root->fs_info,
3597                                   "page %lu not writeback, cur %llu end %llu",
3598                               page->index, cur, end);
3599                }
3600
3601                ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
3602                                         page, offset, iosize, pg_offset,
3603                                         &epd->bio,
3604                                         end_bio_extent_writepage,
3605                                         0, 0, 0, false);
3606                if (ret) {
3607                        SetPageError(page);
3608                        if (PageWriteback(page))
3609                                end_page_writeback(page);
3610                }
3611
3612                cur = cur + iosize;
3613                pg_offset += iosize;
3614                nr++;
3615        }
3616        *nr_ret = nr;
3617        return ret;
3618}
3619
3620/*
3621 * The writepage semantics are similar to regular writepage.  Extent
3622 * records are inserted to lock ranges in the tree, and as dirty areas
3623 * are found, they are marked writeback.  Then the lock bits are removed
3624 * and the end_io handler clears the writeback ranges.
3625 *
3626 * Return 0 if everything goes well.
3627 * Return <0 for error.
3628 */
3629static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3630                              struct extent_page_data *epd)
3631{
3632        struct inode *inode = page->mapping->host;
3633        u64 start = page_offset(page);
3634        u64 page_end = start + PAGE_SIZE - 1;
3635        int ret;
3636        int nr = 0;
3637        size_t pg_offset;
3638        loff_t i_size = i_size_read(inode);
3639        unsigned long end_index = i_size >> PAGE_SHIFT;
3640        unsigned long nr_written = 0;
3641
3642        trace___extent_writepage(page, inode, wbc);
3643
3644        WARN_ON(!PageLocked(page));
3645
3646        ClearPageError(page);
3647
3648        pg_offset = offset_in_page(i_size);
3649        if (page->index > end_index ||
3650           (page->index == end_index && !pg_offset)) {
3651                page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
3652                unlock_page(page);
3653                return 0;
3654        }
3655
3656        if (page->index == end_index) {
3657                char *userpage;
3658
3659                userpage = kmap_atomic(page);
3660                memset(userpage + pg_offset, 0,
3661                       PAGE_SIZE - pg_offset);
3662                kunmap_atomic(userpage);
3663                flush_dcache_page(page);
3664        }
3665
3666        set_page_extent_mapped(page);
3667
3668        if (!epd->extent_locked) {
3669                ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start,
3670                                         &nr_written);
3671                if (ret == 1)
3672                        return 0;
3673                if (ret)
3674                        goto done;
3675        }
3676
3677        ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
3678                                    nr_written, &nr);
3679        if (ret == 1)
3680                return 0;
3681
3682done:
3683        if (nr == 0) {
3684                /* make sure the mapping tag for page dirty gets cleared */
3685                set_page_writeback(page);
3686                end_page_writeback(page);
3687        }
3688        if (PageError(page)) {
3689                ret = ret < 0 ? ret : -EIO;
3690                end_extent_writepage(page, ret, start, page_end);
3691        }
3692        unlock_page(page);
3693        ASSERT(ret <= 0);
3694        return ret;
3695}
3696
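/*
 * Illustrative sketch, not part of the original file: the minimal bookkeeping
 * a caller of __extent_writepage() has to do.  The bio accumulated in the
 * extent_page_data must either be flushed on success or failed on error,
 * mirroring what the real writepage entry points in this file do.
 */
static int __maybe_unused example_write_one_page(struct page *page,
						 struct writeback_control *wbc)
{
	struct extent_page_data epd = {
		.bio = NULL,
		.extent_locked = 0,
		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
	};
	int ret;

	ret = __extent_writepage(page, wbc, &epd);
	if (ret < 0) {
		end_write_bio(&epd, ret);
		return ret;
	}
	return flush_write_bio(&epd);
}
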
3697void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
3698{
3699        wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
3700                       TASK_UNINTERRUPTIBLE);
3701}
3702
3703static void end_extent_buffer_writeback(struct extent_buffer *eb)
3704{
3705        clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3706        smp_mb__after_atomic();
3707        wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3708}
3709
3710/*
3711 * Lock extent buffer status and pages for writeback.
3712 *
3713 * May try to flush write bio if we can't get the lock.
3714 *
3715 * Return  0 if the extent buffer doesn't need to be submitted.
3716 *           (E.g. the extent buffer is not dirty)
3717 * Return >0 if the extent buffer is submitted to bio.
3718 * Return <0 if something went wrong, no page is locked.
3719 */
3720static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
3721                          struct extent_page_data *epd)
3722{
3723        struct btrfs_fs_info *fs_info = eb->fs_info;
3724        int i, num_pages, failed_page_nr;
3725        int flush = 0;
3726        int ret = 0;
3727
3728        if (!btrfs_try_tree_write_lock(eb)) {
3729                ret = flush_write_bio(epd);
3730                if (ret < 0)
3731                        return ret;
3732                flush = 1;
3733                btrfs_tree_lock(eb);
3734        }
3735
3736        if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
3737                btrfs_tree_unlock(eb);
3738                if (!epd->sync_io)
3739                        return 0;
3740                if (!flush) {
3741                        ret = flush_write_bio(epd);
3742                        if (ret < 0)
3743                                return ret;
3744                        flush = 1;
3745                }
3746                while (1) {
3747                        wait_on_extent_buffer_writeback(eb);
3748                        btrfs_tree_lock(eb);
3749                        if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
3750                                break;
3751                        btrfs_tree_unlock(eb);
3752                }
3753        }
3754
3755        /*
3756         * We need to do this to prevent races with anyone checking whether the
3757         * eb is under IO, since we can end up having no IO bits set for a short
3758         * period of time.
3759         */
3760        spin_lock(&eb->refs_lock);
3761        if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3762                set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3763                spin_unlock(&eb->refs_lock);
3764                btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3765                percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
3766                                         -eb->len,
3767                                         fs_info->dirty_metadata_batch);
3768                ret = 1;
3769        } else {
3770                spin_unlock(&eb->refs_lock);
3771        }
3772
3773        btrfs_tree_unlock(eb);
3774
3775        if (!ret)
3776                return ret;
3777
3778        num_pages = num_extent_pages(eb);
3779        for (i = 0; i < num_pages; i++) {
3780                struct page *p = eb->pages[i];
3781
3782                if (!trylock_page(p)) {
3783                        if (!flush) {
3784                                int err;
3785
3786                                err = flush_write_bio(epd);
3787                                if (err < 0) {
3788                                        ret = err;
3789                                        failed_page_nr = i;
3790                                        goto err_unlock;
3791                                }
3792                                flush = 1;
3793                        }
3794                        lock_page(p);
3795                }
3796        }
3797
3798        return ret;
3799err_unlock:
3800        /* Unlock already locked pages */
3801        for (i = 0; i < failed_page_nr; i++)
3802                unlock_page(eb->pages[i]);
3803        /*
3804         * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
3805         * Also set EXTENT_BUFFER_DIRTY back so future write attempts on this
3806         * eb can be made, undoing everything done before.
3807         */
3808        btrfs_tree_lock(eb);
3809        spin_lock(&eb->refs_lock);
3810        set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3811        end_extent_buffer_writeback(eb);
3812        spin_unlock(&eb->refs_lock);
3813        percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
3814                                 fs_info->dirty_metadata_batch);
3815        btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3816        btrfs_tree_unlock(eb);
3817        return ret;
3818}
3819
3820static void set_btree_ioerr(struct page *page)
3821{
3822        struct extent_buffer *eb = (struct extent_buffer *)page->private;
3823        struct btrfs_fs_info *fs_info;
3824
3825        SetPageError(page);
3826        if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
3827                return;
3828
3829        /*
3830         * If we error out, we should add back the dirty_metadata_bytes
3831         * to make it consistent.
3832         */
3833        fs_info = eb->fs_info;
3834        percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
3835                                 eb->len, fs_info->dirty_metadata_batch);
3836
3837        /*
3838         * If writeback for a btree extent that doesn't belong to a log tree
3839         * failed, increment the counter transaction->eb_write_errors.
3840         * We do this because while the transaction is running and before it's
3841         * committing (when we call filemap_fdata[write|wait]_range against
3842         * the btree inode), we might have
3843         * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
3844         * returns an error or an error happens during writeback, when we're
3845         * committing the transaction we wouldn't know about it, since the pages
3846         * can be no longer dirty nor marked anymore for writeback (if a
3847         * subsequent modification to the extent buffer didn't happen before the
3848         * transaction commit), which makes filemap_fdata[write|wait]_range not
3849         * able to find the pages tagged with SetPageError at transaction
3850         * commit time. So if this happens we must abort the transaction,
3851         * otherwise we commit a super block with btree roots that point to
3852         * btree nodes/leafs whose content on disk is invalid - either garbage
3853         * or the content of some node/leaf from a past generation that got
3854         * cowed or deleted and is no longer valid.
3855         *
3856         * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
3857         * not be enough - we need to distinguish between log tree extents vs
3858         * non-log tree extents, and the next filemap_fdatawait_range() call
3859         * will catch and clear such errors in the mapping - and that call might
3860         * be from a log sync and not from a transaction commit. Also, checking
3861         * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
3862         * not done and would not be reliable - the eb might have been released
3863         * from memory and reading it back again means that flag would not be
3864         * set (since it's a runtime flag, not persisted on disk).
3865         *
3866         * Using the fs_info flags below also covers the case where
3867         * writepages() returns success because writeback was started for
3868         * all dirty pages, but the writeback finishes with errors before
3869         * filemap_fdatawait_range() is called.  Without AS_EIO/AS_ENOSPC,
3870         * filemap_fdatawait_range() would then return success, as it could
3871         * not know that writeback errors happened (the pages were no
3872         * longer tagged for writeback), and the errors would be silently
3873         * lost.
3874         */
3875        switch (eb->log_index) {
3876        case -1:
3877                set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
3878                break;
3879        case 0:
3880                set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
3881                break;
3882        case 1:
3883                set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
3884                break;
3885        default:
3886                BUG(); /* unexpected, logic error */
3887        }
3888}
3889
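/*
 * Illustrative sketch, not part of the original file: the kind of commit-time
 * check that the error bits set above enable.  The real consumers live in the
 * transaction and tree-log code; this only shows the test_bit() pattern on
 * fs_info->flags.
 */
static bool __maybe_unused example_btree_write_errors_seen(struct btrfs_fs_info *fs_info)
{
	return test_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags) ||
	       test_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags) ||
	       test_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
}
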
3890static void end_bio_extent_buffer_writepage(struct bio *bio)
3891{
3892        struct bio_vec *bvec;
3893        struct extent_buffer *eb;
3894        int done;
3895        struct bvec_iter_all iter_all;
3896
3897        ASSERT(!bio_flagged(bio, BIO_CLONED));
3898        bio_for_each_segment_all(bvec, bio, iter_all) {
3899                struct page *page = bvec->bv_page;
3900
3901                eb = (struct extent_buffer *)page->private;
3902                BUG_ON(!eb);
3903                done = atomic_dec_and_test(&eb->io_pages);
3904
3905                if (bio->bi_status ||
3906                    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
3907                        ClearPageUptodate(page);
3908                        set_btree_ioerr(page);
3909                }
3910
3911                end_page_writeback(page);
3912
3913                if (!done)
3914                        continue;
3915
3916                end_extent_buffer_writeback(eb);
3917        }
3918
3919        bio_put(bio);
3920}
3921
3922static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
3923                        struct writeback_control *wbc,
3924                        struct extent_page_data *epd)
3925{
3926        u64 offset = eb->start;
3927        u32 nritems;
3928        int i, num_pages;
3929        unsigned long start, end;
3930        unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
3931        int ret = 0;
3932
3933        clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
3934        num_pages = num_extent_pages(eb);
3935        atomic_set(&eb->io_pages, num_pages);
3936
3937        /* Set btree blocks beyond nritems to 0 to avoid stale content. */
3938        nritems = btrfs_header_nritems(eb);
3939        if (btrfs_header_level(eb) > 0) {
3940                end = btrfs_node_key_ptr_offset(nritems);
3941
3942                memzero_extent_buffer(eb, end, eb->len - end);
3943        } else {
3944                /*
3945                 * leaf:
3946                 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
3947                 */
3948                start = btrfs_item_nr_offset(nritems);
3949                end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
3950                memzero_extent_buffer(eb, start, end - start);
3951        }
3952
3953        for (i = 0; i < num_pages; i++) {
3954                struct page *p = eb->pages[i];
3955
3956                clear_page_dirty_for_io(p);
3957                set_page_writeback(p);
3958                ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
3959                                         p, offset, PAGE_SIZE, 0,
3960                                         &epd->bio,
3961                                         end_bio_extent_buffer_writepage,
3962                                         0, 0, 0, false);
3963                if (ret) {
3964                        set_btree_ioerr(p);
3965                        if (PageWriteback(p))
3966                                end_page_writeback(p);
3967                        if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
3968                                end_extent_buffer_writeback(eb);
3969                        ret = -EIO;
3970                        break;
3971                }
3972                offset += PAGE_SIZE;
3973                update_nr_written(wbc, 1);
3974                unlock_page(p);
3975        }
3976
3977        if (unlikely(ret)) {
3978                for (; i < num_pages; i++) {
3979                        struct page *p = eb->pages[i];
3980                        clear_page_dirty_for_io(p);
3981                        unlock_page(p);
3982                }
3983        }
3984
3985        return ret;
3986}
3987
3988/*
3989 * Submit all page(s) of one extent buffer.
3990 *
3991 * @page:       the page of one extent buffer
3992 * @eb_context: used to determine whether we need to submit this page; if the
3993 *              current page belongs to this eb, we don't need to submit it
3994 *
3995 * The caller should pass each page in bytenr order, and here we use
3996 * @eb_context to determine if we have submitted pages of one extent buffer.
3997 *
3998 * If we have, we just skip until we hit a new page that doesn't belong to
3999 * current @eb_context.
4000 *
4001 * If not, we submit all the page(s) of the extent buffer.
4002 *
4003 * Return >0 if we have submitted the extent buffer successfully.
4004 * Return 0 if we don't need to submit the page, as it's already submitted by
4005 * previous call.
4006 * Return <0 for fatal error.
4007 */
4008static int submit_eb_page(struct page *page, struct writeback_control *wbc,
4009                          struct extent_page_data *epd,
4010                          struct extent_buffer **eb_context)
4011{
4012        struct address_space *mapping = page->mapping;
4013        struct extent_buffer *eb;
4014        int ret;
4015
4016        if (!PagePrivate(page))
4017                return 0;
4018
4019        spin_lock(&mapping->private_lock);
4020        if (!PagePrivate(page)) {
4021                spin_unlock(&mapping->private_lock);
4022                return 0;
4023        }
4024
4025        eb = (struct extent_buffer *)page->private;
4026
4027        /*
4028         * Shouldn't happen and normally this would be a BUG_ON but no point
4029         * crashing the machine for something we can survive anyway.
4030         */
4031        if (WARN_ON(!eb)) {
4032                spin_unlock(&mapping->private_lock);
4033                return 0;
4034        }
4035
4036        if (eb == *eb_context) {
4037                spin_unlock(&mapping->private_lock);
4038                return 0;
4039        }
4040        ret = atomic_inc_not_zero(&eb->refs);
4041        spin_unlock(&mapping->private_lock);
4042        if (!ret)
4043                return 0;
4044
4045        *eb_context = eb;
4046
4047        ret = lock_extent_buffer_for_io(eb, epd);
4048        if (ret <= 0) {
4049                free_extent_buffer(eb);
4050                return ret;
4051        }
4052        ret = write_one_eb(eb, wbc, epd);
4053        free_extent_buffer(eb);
4054        if (ret < 0)
4055                return ret;
4056        return 1;
4057}
4058
4059int btree_write_cache_pages(struct address_space *mapping,
4060                                   struct writeback_control *wbc)
4061{
4062        struct extent_buffer *eb_context = NULL;
4063        struct extent_page_data epd = {
4064                .bio = NULL,
4065                .extent_locked = 0,
4066                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4067        };
4068        struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
4069        int ret = 0;
4070        int done = 0;
4071        int nr_to_write_done = 0;
4072        struct pagevec pvec;
4073        int nr_pages;
4074        pgoff_t index;
4075        pgoff_t end;            /* Inclusive */
4076        int scanned = 0;
4077        xa_mark_t tag;
4078
4079        pagevec_init(&pvec);
4080        if (wbc->range_cyclic) {
4081                index = mapping->writeback_index; /* Start from prev offset */
4082                end = -1;
4083                /*
4084                 * Starting from the beginning does not need to cycle over
4085                 * the range, so mark it as scanned.
4086                 */
4087                scanned = (index == 0);
4088        } else {
4089                index = wbc->range_start >> PAGE_SHIFT;
4090                end = wbc->range_end >> PAGE_SHIFT;
4091                scanned = 1;
4092        }
4093        if (wbc->sync_mode == WB_SYNC_ALL)
4094                tag = PAGECACHE_TAG_TOWRITE;
4095        else
4096                tag = PAGECACHE_TAG_DIRTY;
4097retry:
4098        if (wbc->sync_mode == WB_SYNC_ALL)
4099                tag_pages_for_writeback(mapping, index, end);
4100        while (!done && !nr_to_write_done && (index <= end) &&
4101               (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
4102                        tag))) {
4103                unsigned i;
4104
4105                for (i = 0; i < nr_pages; i++) {
4106                        struct page *page = pvec.pages[i];
4107
4108                        ret = submit_eb_page(page, wbc, &epd, &eb_context);
4109                        if (ret == 0)
4110                                continue;
4111                        if (ret < 0) {
4112                                done = 1;
4113                                break;
4114                        }
4115
4116                        /*
4117                         * the filesystem may choose to bump up nr_to_write.
4118                         * We have to make sure to honor the new nr_to_write
4119                         * at any time
4120                         */
4121                        nr_to_write_done = wbc->nr_to_write <= 0;
4122                }
4123                pagevec_release(&pvec);
4124                cond_resched();
4125        }
4126        if (!scanned && !done) {
4127                /*
4128                 * We hit the last page and there is more work to be done: wrap
4129                 * back to the start of the file
4130                 */
4131                scanned = 1;
4132                index = 0;
4133                goto retry;
4134        }
4135        if (ret < 0) {
4136                end_write_bio(&epd, ret);
4137                return ret;
4138        }
4139        /*
4140         * If something went wrong, don't allow any metadata write bio to be
4141         * submitted.
4142         *
4143         * This would prevent use-after-free if we had dirty pages not
4144         * cleaned up, which can still happen with fuzzed images.
4145         *
4146         * - Bad extent tree
4147         *   Allowing existing tree block to be allocated for other trees.
4148         *
4149         * - Log tree operations
4150         *   Existing tree blocks get allocated to the log tree, have their
4151         *   generation bumped, and then get cleaned in tree re-balance.
4152         *   Such tree block will not be written back, since it's clean,
4153         *   thus no WRITTEN flag set.
4154         *   And after log writes back, this tree block is not traced by
4155         *   any dirty extent_io_tree.
4156         *
4157         * - Offending tree block gets re-dirtied from its original owner
4158         *   Since it has bumped generation, no WRITTEN flag, it can be
4159         *   reused without COWing. This tree block will not be traced
4160         *   by btrfs_transaction::dirty_pages.
4161         *
4162         *   Now such a dirty tree block will not be cleaned by any dirty
4163         *   extent io tree. Thus we don't want to submit such a wild eb
4164         *   if the fs already has errors.
4165         */
4166        if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
4167                ret = flush_write_bio(&epd);
4168        } else {
4169                ret = -EROFS;
4170                end_write_bio(&epd, ret);
4171        }
4172        return ret;
4173}
4174
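/*
 * Illustrative sketch, not part of the original file: a writeback_control set
 * up for a data-integrity flush of the btree inode's mapping, the mode in
 * which btree_write_cache_pages() above tags pages with PAGECACHE_TAG_TOWRITE
 * and waits on extent buffers already under writeback.
 */
static int __maybe_unused example_sync_btree_mapping(struct address_space *mapping)
{
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_ALL,
		.nr_to_write	= LONG_MAX,
		.range_start	= 0,
		.range_end	= LLONG_MAX,
	};

	return btree_write_cache_pages(mapping, &wbc);
}
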
4175/**
4176 * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
4177 * @mapping: address space structure to write
4178 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
4179 * @epd: extent_page_data that holds the bio being built
4180 *
4181 * If a page is already under I/O, extent_write_cache_pages() skips it, even
4182 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
4183 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
4184 * and msync() need to guarantee that all the data which was dirty at the time
4185 * the call was made get new I/O started against them.  If wbc->sync_mode is
4186 * WB_SYNC_ALL then we were called for data integrity and we must wait for
4187 * existing IO to complete.
4188 */
4189static int extent_write_cache_pages(struct address_space *mapping,
4190                             struct writeback_control *wbc,
4191                             struct extent_page_data *epd)
4192{
4193        struct inode *inode = mapping->host;
4194        int ret = 0;
4195        int done = 0;
4196        int nr_to_write_done = 0;
4197        struct pagevec pvec;
4198        int nr_pages;
4199        pgoff_t index;
4200        pgoff_t end;            /* Inclusive */
4201        pgoff_t done_index;
4202        int range_whole = 0;
4203        int scanned = 0;
4204        xa_mark_t tag;
4205
4206        /*
4207         * We have to hold onto the inode so that ordered extents can do their
4208         * work when the IO finishes.  The alternative to this is failing to add
4209         * an ordered extent if the igrab() fails there and that is a huge pain
4210         * to deal with, so instead just hold onto the inode throughout the
4211         * writepages operation.  If it fails here we are freeing up the inode
4212         * anyway and we'd rather not waste our time writing out stuff that is
4213         * going to be truncated anyway.
4214         */
4215        if (!igrab(inode))
4216                return 0;
4217
4218        pagevec_init(&pvec);
4219        if (wbc->range_cyclic) {
4220                index = mapping->writeback_index; /* Start from prev offset */
4221                end = -1;
4222                /*
4223                 * Starting from the beginning does not need to cycle over the
4224                 * range, so mark it as scanned.
4225                 */
4226                scanned = (index == 0);
4227        } else {
4228                index = wbc->range_start >> PAGE_SHIFT;
4229                end = wbc->range_end >> PAGE_SHIFT;
4230                if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
4231                        range_whole = 1;
4232                scanned = 1;
4233        }
4234
4235        /*
4236         * We do the tagged writepage as long as the snapshot flush bit is set
4237         * and we are the first one to do the filemap_flush() on this inode.
4238         *
4239         * The nr_to_write == LONG_MAX is needed to make sure other flushers do
4240         * not race in and drop the bit.
4241         */
4242        if (range_whole && wbc->nr_to_write == LONG_MAX &&
4243            test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
4244                               &BTRFS_I(inode)->runtime_flags))
4245                wbc->tagged_writepages = 1;
4246
4247        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
4248                tag = PAGECACHE_TAG_TOWRITE;
4249        else
4250                tag = PAGECACHE_TAG_DIRTY;
4251retry:
4252        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
4253                tag_pages_for_writeback(mapping, index, end);
4254        done_index = index;
4255        while (!done && !nr_to_write_done && (index <= end) &&
4256                        (nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
4257                                                &index, end, tag))) {
4258                unsigned i;
4259
4260                for (i = 0; i < nr_pages; i++) {
4261                        struct page *page = pvec.pages[i];
4262
4263                        done_index = page->index + 1;
4264                        /*
4265                         * At this point we hold neither the i_pages lock nor
4266                         * the page lock: the page may be truncated or
4267                         * invalidated (changing page->mapping to NULL),
4268                         * or even swizzled back from swapper_space to
4269                         * tmpfs file mapping
4270                         */
4271                        if (!trylock_page(page)) {
4272                                ret = flush_write_bio(epd);
4273                                BUG_ON(ret < 0);
4274                                lock_page(page);
4275                        }
4276
4277                        if (unlikely(page->mapping != mapping)) {
4278                                unlock_page(page);
4279                                continue;
4280                        }
4281
4282                        if (wbc->sync_mode != WB_SYNC_NONE) {
4283                                if (PageWriteback(page)) {
4284                                        ret = flush_write_bio(epd);
4285                                        BUG_ON(ret < 0);
4286                                }
4287                                wait_on_page_writeback(page);
4288                        }
4289
4290                        if (PageWriteback(page) ||
4291                            !clear_page_dirty_for_io(page)) {
4292                                unlock_page(page);
4293                                continue;
4294                        }
4295
4296                        ret = __extent_writepage(page, wbc, epd);
4297                        if (ret < 0) {
4298                                done = 1;
4299                                break;
4300                        }
4301
4302                        /*
4303                         * the filesystem may choose to bump up nr_to_write.
4304                         * We have to make sure to honor the new nr_to_write
4305                         * at any time
4306                         */
4307                        nr_to_write_done = wbc->nr_to_write <= 0;
4308                }
4309                pagevec_release(&pvec);
4310                cond_resched();
4311        }
4312        if (!scanned && !done) {
4313                /*
4314                 * We hit the last page and there is more work to be done: wrap
4315                 * back to the start of the file
4316                 */
4317                scanned = 1;
4318                index = 0;
4319
4320                /*
4321                 * If we're looping we could run into a page that is locked by a
4322                 * writer and that writer could be waiting on writeback for a
4323                 * page in our current bio, and thus deadlock, so flush the
4324                 * write bio here.
4325                 */
4326                ret = flush_write_bio(epd);
4327                if (!ret)
4328                        goto retry;
4329        }
4330
4331        if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
4332                mapping->writeback_index = done_index;
4333
4334        btrfs_add_delayed_iput(inode);
4335        return ret;
4336}
4337
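/*
 * Write a single page through __extent_writepage() and then submit the
 * resulting bio; on error the pending write bio is ended with that error
 * and the error is returned.
 */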
4338int extent_write_full_page(struct page *page, struct writeback_control *wbc)
4339{
4340        int ret;
4341        struct extent_page_data epd = {
4342                .bio = NULL,
4343                .extent_locked = 0,
4344                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4345        };
4346
4347        ret = __extent_writepage(page, wbc, &epd);
4348        ASSERT(ret <= 0);
4349        if (ret < 0) {
4350                end_write_bio(&epd, ret);
4351                return ret;
4352        }
4353
4354        ret = flush_write_bio(&epd);
4355        ASSERT(ret <= 0);
4356        return ret;
4357}
4358
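/*
 * Write out the range [start, end] page by page; the caller is expected to
 * already hold the extent range locked (note epd.extent_locked below).
 * Pages that are no longer dirty only get their ordered extent finished and
 * are unlocked, the rest go through __extent_writepage().
 */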
4359int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
4360                              int mode)
4361{
4362        int ret = 0;
4363        struct address_space *mapping = inode->i_mapping;
4364        struct page *page;
4365        unsigned long nr_pages = (end - start + PAGE_SIZE) >>
4366                PAGE_SHIFT;
4367
4368        struct extent_page_data epd = {
4369                .bio = NULL,
4370                .extent_locked = 1,
4371                .sync_io = mode == WB_SYNC_ALL,
4372        };
4373        struct writeback_control wbc_writepages = {
4374                .sync_mode      = mode,
4375                .nr_to_write    = nr_pages * 2,
4376                .range_start    = start,
4377                .range_end      = end + 1,
4378                /* We're called from an async helper function */
4379                .punt_to_cgroup = 1,
4380                .no_cgroup_owner = 1,
4381        };
4382
4383        wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
4384        while (start <= end) {
4385                page = find_get_page(mapping, start >> PAGE_SHIFT);
4386                if (clear_page_dirty_for_io(page))
4387                        ret = __extent_writepage(page, &wbc_writepages, &epd);
4388                else {
4389                        btrfs_writepage_endio_finish_ordered(page, start,
4390                                                    start + PAGE_SIZE - 1, 1);
4391                        unlock_page(page);
4392                }
4393                put_page(page);
4394                start += PAGE_SIZE;
4395        }
4396
4397        ASSERT(ret <= 0);
4398        if (ret == 0)
4399                ret = flush_write_bio(&epd);
4400        else
4401                end_write_bio(&epd, ret);
4402
4403        wbc_detach_inode(&wbc_writepages);
4404        return ret;
4405}
4406
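/*
 * Writepages entry point: walk the dirty pages of @mapping via
 * extent_write_cache_pages() and then submit or end the pending write bio.
 */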
4407int extent_writepages(struct address_space *mapping,
4408                      struct writeback_control *wbc)
4409{
4410        int ret = 0;
4411        struct extent_page_data epd = {
4412                .bio = NULL,
4413                .extent_locked = 0,
4414                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4415        };
4416
4417        ret = extent_write_cache_pages(mapping, wbc, &epd);
4418        ASSERT(ret <= 0);
4419        if (ret < 0) {
4420                end_write_bio(&epd, ret);
4421                return ret;
4422        }
4423        ret = flush_write_bio(&epd);
4424        return ret;
4425}
4426
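/*
 * Readahead entry point: pull pages from the readahead control in batches,
 * hand each file-contiguous batch to contiguous_readpages(), and submit any
 * bio that is still pending at the end.
 */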
4427void extent_readahead(struct readahead_control *rac)
4428{
4429        struct bio *bio = NULL;
4430        unsigned long bio_flags = 0;
4431        struct page *pagepool[16];
4432        struct extent_map *em_cached = NULL;
4433        u64 prev_em_start = (u64)-1;
4434        int nr;
4435
4436        while ((nr = readahead_page_batch(rac, pagepool))) {
4437                u64 contig_start = page_offset(pagepool[0]);
4438                u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1;
4439
4440                ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
4441
4442                contiguous_readpages(pagepool, nr, contig_start, contig_end,
4443                                &em_cached, &bio, &bio_flags, &prev_em_start);
4444        }
4445
4446        if (em_cached)
4447                free_extent_map(em_cached);
4448
4449        if (bio) {
4450                if (submit_one_bio(bio, 0, bio_flags))
4451                        return;
4452        }
4453}
4454
4455/*
4456 * basic invalidatepage code, this waits on any locked or writeback
4457 * ranges corresponding to the page, and then deletes any extent state
4458 * records from the tree
4459 */
4460int extent_invalidatepage(struct extent_io_tree *tree,
4461                          struct page *page, unsigned long offset)
4462{
4463        struct extent_state *cached_state = NULL;
4464        u64 start = page_offset(page);
4465        u64 end = start + PAGE_SIZE - 1;
4466        size_t blocksize = page->mapping->host->i_sb->s_blocksize;
4467
4468        /* This function is only called for the btree inode */
4469        ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
4470
4471        start += ALIGN(offset, blocksize);
4472        if (start > end)
4473                return 0;
4474
4475        lock_extent_bits(tree, start, end, &cached_state);
4476        wait_on_page_writeback(page);
4477
4478        /*
4479         * Currently for btree io tree, only EXTENT_LOCKED is utilized,
4480         * so here we only need to unlock the extent range to free any
4481         * existing extent state.
4482         */
4483        unlock_extent_cached(tree, start, end, &cached_state);
4484        return 0;
4485}
4486
4487/*
4488 * a helper for releasepage, this tests for areas of the page that
4489 * are locked or under IO and drops the related state bits if it is safe
4490 * to drop the page.
4491 */
4492static int try_release_extent_state(struct extent_io_tree *tree,
4493                                    struct page *page, gfp_t mask)
4494{
4495        u64 start = page_offset(page);
4496        u64 end = start + PAGE_SIZE - 1;
4497        int ret = 1;
4498
4499        if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
4500                ret = 0;
4501        } else {
4502                /*
4503                 * At this point we can safely clear everything except the
4504                 * locked bit, the nodatasum bit and the delalloc new bit.
4505                 * The delalloc new bit will be cleared by ordered extent
4506                 * completion.
4507                 */
4508                ret = __clear_extent_bit(tree, start, end,
4509                         ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
4510                         0, 0, NULL, mask, NULL);
4511
4512                /* If clear_extent_bit failed for ENOMEM reasons,
4513                 * we can't allow the release to continue.
4514                 */
4515                if (ret < 0)
4516                        ret = 0;
4517                else
4518                        ret = 1;
4519        }
4520        return ret;
4521}
4522
4523/*
4524 * a helper for releasepage.  As long as there are no locked extents
4525 * in the range corresponding to the page, both state records and extent
4526 * map records are removed
4527 */
4528int try_release_extent_mapping(struct page *page, gfp_t mask)
4529{
4530        struct extent_map *em;
4531        u64 start = page_offset(page);
4532        u64 end = start + PAGE_SIZE - 1;
4533        struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
4534        struct extent_io_tree *tree = &btrfs_inode->io_tree;
4535        struct extent_map_tree *map = &btrfs_inode->extent_tree;
4536
4537        if (gfpflags_allow_blocking(mask) &&
4538            page->mapping->host->i_size > SZ_16M) {
4539                u64 len;
4540                while (start <= end) {
4541                        struct btrfs_fs_info *fs_info;
4542                        u64 cur_gen;
4543
4544                        len = end - start + 1;
4545                        write_lock(&map->lock);
4546                        em = lookup_extent_mapping(map, start, len);
4547                        if (!em) {
4548                                write_unlock(&map->lock);
4549                                break;
4550                        }
4551                        if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
4552                            em->start != start) {
4553                                write_unlock(&map->lock);
4554                                free_extent_map(em);
4555                                break;
4556                        }
4557                        if (test_range_bit(tree, em->start,
4558                                           extent_map_end(em) - 1,
4559                                           EXTENT_LOCKED, 0, NULL))
4560                                goto next;
4561                        /*
4562                         * If it's not in the list of modified extents, used
4563                         * by a fast fsync, we can remove it. If it's being
4564                         * logged we can safely remove it since fsync took an
4565                         * extra reference on the em.
4566                         */
4567                        if (list_empty(&em->list) ||
4568                            test_bit(EXTENT_FLAG_LOGGING, &em->flags))
4569                                goto remove_em;
4570                        /*
4571                         * If it's in the list of modified extents, remove it
4572                         * only if its generation is older than the current one,
4573                         * in which case we don't need it for a fast fsync.
4574                         * Otherwise don't remove it, we could be racing with an
4575                         * ongoing fast fsync that could miss the new extent.
4576                         */
4577                        fs_info = btrfs_inode->root->fs_info;
4578                        spin_lock(&fs_info->trans_lock);
4579                        cur_gen = fs_info->generation;
4580                        spin_unlock(&fs_info->trans_lock);
4581                        if (em->generation >= cur_gen)
4582                                goto next;
4583remove_em:
4584                        /*
4585                         * We only remove extent maps that are not in the list of
4586                         * modified extents or that are in the list but with a
4587                         * generation lower than the current generation, so there
4588                         * is no need to set the full fsync flag on the inode (it
4589                         * hurts the fsync performance for workloads with a data
4590                         * size that exceeds or is close to the system's memory).
4591                         */
4592                        remove_extent_mapping(map, em);
4593                        /* once for the rb tree */
4594                        free_extent_map(em);
4595next:
4596                        start = extent_map_end(em);
4597                        write_unlock(&map->lock);
4598
4599                        /* once for us */
4600                        free_extent_map(em);
4601
4602                        cond_resched(); /* Allow large-extent preemption. */
4603                }
4604        }
4605        return try_release_extent_state(tree, page, mask);
4606}
4607
4608/*
4609 * helper function for fiemap, which doesn't want to see any holes.
4610 * This maps until we find something past 'last'
4611 */
4612static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
4613                                                u64 offset, u64 last)
4614{
4615        u64 sectorsize = btrfs_inode_sectorsize(inode);
4616        struct extent_map *em;
4617        u64 len;
4618
4619        if (offset >= last)
4620                return NULL;
4621
4622        while (1) {
4623                len = last - offset;
4624                if (len == 0)
4625                        break;
4626                len = ALIGN(len, sectorsize);
4627                em = btrfs_get_extent_fiemap(inode, offset, len);
4628                if (IS_ERR_OR_NULL(em))
4629                        return em;
4630
4631                /* if this isn't a hole return it */
4632                if (em->block_start != EXTENT_MAP_HOLE)
4633                        return em;
4634
4635                /* this is a hole, advance to the next extent */
4636                offset = extent_map_end(em);
4637                free_extent_map(em);
4638                if (offset >= last)
4639                        break;
4640        }
4641        return NULL;
4642}
4643
4644/*
4645 * To cache the previous fiemap extent
4646 *
4647 * Will be used for merging fiemap extents
4648 */
4649struct fiemap_cache {
4650        u64 offset;
4651        u64 phys;
4652        u64 len;
4653        u32 flags;
4654        bool cached;
4655};
4656
4657/*
4658 * Helper to submit fiemap extent.
4659 *
4660 * Will try to merge the current fiemap extent, specified by @offset, @phys,
4661 * @len and @flags, with the cached one.
4662 * Only when the merge fails is the cached one submitted as a
4663 * fiemap extent.
4664 *
4665 * Return value is the same as fiemap_fill_next_extent().
4666 */
4667static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
4668                                struct fiemap_cache *cache,
4669                                u64 offset, u64 phys, u64 len, u32 flags)
4670{
4671        int ret = 0;
4672
4673        if (!cache->cached)
4674                goto assign;
4675
4676        /*
4677         * Sanity check: extent_fiemap() should have ensured that the new
4678         * fiemap extent won't overlap with the cached one.
4679         * Not recoverable.
4680         *
4681         * NOTE: Physical address can overlap, due to compression
4682         */
4683        if (cache->offset + cache->len > offset) {
4684                WARN_ON(1);
4685                return -EINVAL;
4686        }
4687
4688        /*
4689         * Only merge fiemap extents if:
4690         * 1) Their logical addresses are contiguous
4691         *
4692         * 2) Their physical addresses are contiguous
4693         *    So truly compressed (physical size smaller than logical size)
4694         *    extents won't get merged with each other
4695         *
4696         * 3) They share the same flags except FIEMAP_EXTENT_LAST
4697         *    So a regular extent won't get merged with a prealloc extent
4698         */
4699        if (cache->offset + cache->len  == offset &&
4700            cache->phys + cache->len == phys  &&
4701            (cache->flags & ~FIEMAP_EXTENT_LAST) ==
4702                        (flags & ~FIEMAP_EXTENT_LAST)) {
4703                cache->len += len;
4704                cache->flags |= flags;
4705                goto try_submit_last;
4706        }
4707
4708        /* Not mergeable, need to submit cached one */
4709        ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
4710                                      cache->len, cache->flags);
4711        cache->cached = false;
4712        if (ret)
4713                return ret;
4714assign:
4715        cache->cached = true;
4716        cache->offset = offset;
4717        cache->phys = phys;
4718        cache->len = len;
4719        cache->flags = flags;
4720try_submit_last:
4721        if (cache->flags & FIEMAP_EXTENT_LAST) {
4722                ret = fiemap_fill_next_extent(fieinfo, cache->offset,
4723                                cache->phys, cache->len, cache->flags);
4724                cache->cached = false;
4725        }
4726        return ret;
4727}
4728
4729/*
4730 * Emit last fiemap cache
4731 *
4732 * The last fiemap cache may still be cached in the following case:
4733 * 0                  4k                    8k
4734 * |<- Fiemap range ->|
4735 * |<------------  First extent ----------->|
4736 *
4737 * In this case, the first extent range will be cached but not emitted.
4738 * So we must emit it before ending extent_fiemap().
4739 */
4740static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
4741                                  struct fiemap_cache *cache)
4742{
4743        int ret;
4744
4745        if (!cache->cached)
4746                return 0;
4747
4748        ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
4749                                      cache->len, cache->flags);
4750        cache->cached = false;
4751        if (ret > 0)
4752                ret = 0;
4753        return ret;
4754}
4755
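/*
 * Fiemap implementation: walk the inode's extent maps in the range
 * [start, start + len), skipping holes, and emit each extent through the
 * fiemap_cache so adjacent extents can be merged before being handed to
 * fiemap_fill_next_extent().
 */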
4756int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
4757                  u64 start, u64 len)
4758{
4759        int ret = 0;
4760        u64 off = start;
4761        u64 max = start + len;
4762        u32 flags = 0;
4763        u32 found_type;
4764        u64 last;
4765        u64 last_for_get_extent = 0;
4766        u64 disko = 0;
4767        u64 isize = i_size_read(&inode->vfs_inode);
4768        struct btrfs_key found_key;
4769        struct extent_map *em = NULL;
4770        struct extent_state *cached_state = NULL;
4771        struct btrfs_path *path;
4772        struct btrfs_root *root = inode->root;
4773        struct fiemap_cache cache = { 0 };
4774        struct ulist *roots;
4775        struct ulist *tmp_ulist;
4776        int end = 0;
4777        u64 em_start = 0;
4778        u64 em_len = 0;
4779        u64 em_end = 0;
4780
4781        if (len == 0)
4782                return -EINVAL;
4783
4784        path = btrfs_alloc_path();
4785        if (!path)
4786                return -ENOMEM;
4787
4788        roots = ulist_alloc(GFP_KERNEL);
4789        tmp_ulist = ulist_alloc(GFP_KERNEL);
4790        if (!roots || !tmp_ulist) {
4791                ret = -ENOMEM;
4792                goto out_free_ulist;
4793        }
4794
4795        start = round_down(start, btrfs_inode_sectorsize(inode));
4796        len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
4797
4798        /*
4799         * lookup the last file extent.  We're not using i_size here
4800         * because there might be preallocation past i_size
4801         */
4802        ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
4803                                       0);
4804        if (ret < 0) {
4805                goto out_free_ulist;
4806        } else {
4807                WARN_ON(!ret);
4808                if (ret == 1)
4809                        ret = 0;
4810        }
4811
4812        path->slots[0]--;
4813        btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
4814        found_type = found_key.type;
4815
4816        /* No extents, but there might be delalloc bits */
4817        if (found_key.objectid != btrfs_ino(inode) ||
4818            found_type != BTRFS_EXTENT_DATA_KEY) {
4819                /* have to trust i_size as the end */
4820                last = (u64)-1;
4821                last_for_get_extent = isize;
4822        } else {
4823                /*
4824                 * remember the start of the last extent.  There are a
4825                 * bunch of different factors that go into the length of the
4826                 * extent, so it's much less complex to remember where it started
4827                 */
4828                last = found_key.offset;
4829                last_for_get_extent = last + 1;
4830        }
4831        btrfs_release_path(path);
4832
4833        /*
4834         * we might have some extents allocated but more delalloc past those
4835         * extents.  So we trust isize unless the start of the last extent is
4836         * beyond isize
4837         */
4838        if (last < isize) {
4839                last = (u64)-1;
4840                last_for_get_extent = isize;
4841        }
4842
4843        lock_extent_bits(&inode->io_tree, start, start + len - 1,
4844                         &cached_state);
4845
4846        em = get_extent_skip_holes(inode, start, last_for_get_extent);
4847        if (!em)
4848                goto out;
4849        if (IS_ERR(em)) {
4850                ret = PTR_ERR(em);
4851                goto out;
4852        }
4853
4854        while (!end) {
4855                u64 offset_in_extent = 0;
4856
4857                /* break if the extent we found is outside the range */
4858                if (em->start >= max || extent_map_end(em) < off)
4859                        break;
4860
4861                /*
4862                 * get_extent may return an extent that starts before our
4863                 * requested range.  We have to make sure the ranges
4864                 * we return to fiemap always move forward and don't
4865                 * overlap, so adjust the offsets here
4866                 */
4867                em_start = max(em->start, off);
4868
4869                /*
4870                 * record the offset from the start of the extent
4871                 * for adjusting the disk offset below.  Only do this if the
4872                 * extent isn't compressed since our in ram offset may be past
4873                 * what we have actually allocated on disk.
4874                 */
4875                if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4876                        offset_in_extent = em_start - em->start;
4877                em_end = extent_map_end(em);
4878                em_len = em_end - em_start;
4879                flags = 0;
4880                if (em->block_start < EXTENT_MAP_LAST_BYTE)
4881                        disko = em->block_start + offset_in_extent;
4882                else
4883                        disko = 0;
4884
4885                /*
4886                 * bump off for our next call to get_extent
4887                 */
4888                off = extent_map_end(em);
4889                if (off >= max)
4890                        end = 1;
4891
4892                if (em->block_start == EXTENT_MAP_LAST_BYTE) {
4893                        end = 1;
4894                        flags |= FIEMAP_EXTENT_LAST;
4895                } else if (em->block_start == EXTENT_MAP_INLINE) {
4896                        flags |= (FIEMAP_EXTENT_DATA_INLINE |
4897                                  FIEMAP_EXTENT_NOT_ALIGNED);
4898                } else if (em->block_start == EXTENT_MAP_DELALLOC) {
4899                        flags |= (FIEMAP_EXTENT_DELALLOC |
4900                                  FIEMAP_EXTENT_UNKNOWN);
4901                } else if (fieinfo->fi_extents_max) {
4902                        u64 bytenr = em->block_start -
4903                                (em->start - em->orig_start);
4904
4905                        /*
4906                         * As btrfs supports shared space, this information
4907                         * can be exported to userspace tools via
4908                         * flag FIEMAP_EXTENT_SHARED.  If fi_extents_max == 0
4909                         * then we're just getting a count and we can skip the
4910                         * lookup stuff.
4911                         */
4912                        ret = btrfs_check_shared(root, btrfs_ino(inode),
4913                                                 bytenr, roots, tmp_ulist);
4914                        if (ret < 0)
4915                                goto out_free;
4916                        if (ret)
4917                                flags |= FIEMAP_EXTENT_SHARED;
4918                        ret = 0;
4919                }
4920                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4921                        flags |= FIEMAP_EXTENT_ENCODED;
4922                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4923                        flags |= FIEMAP_EXTENT_UNWRITTEN;
4924
4925                free_extent_map(em);
4926                em = NULL;
4927                if ((em_start >= last) || em_len == (u64)-1 ||
4928                   (last == (u64)-1 && isize <= em_end)) {
4929                        flags |= FIEMAP_EXTENT_LAST;
4930                        end = 1;
4931                }
4932
4933                /* now scan forward to see if this is really the last extent. */
4934                em = get_extent_skip_holes(inode, off, last_for_get_extent);
4935                if (IS_ERR(em)) {
4936                        ret = PTR_ERR(em);
4937                        goto out;
4938                }
4939                if (!em) {
4940                        flags |= FIEMAP_EXTENT_LAST;
4941                        end = 1;
4942                }
4943                ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
4944                                           em_len, flags);
4945                if (ret) {
4946                        if (ret == 1)
4947                                ret = 0;
4948                        goto out_free;
4949                }
4950        }
4951out_free:
4952        if (!ret)
4953                ret = emit_last_fiemap_cache(fieinfo, &cache);
4954        free_extent_map(em);
4955out:
4956        unlock_extent_cached(&inode->io_tree, start, start + len - 1,
4957                             &cached_state);
4958
4959out_free_ulist:
4960        btrfs_free_path(path);
4961        ulist_free(roots);
4962        ulist_free(tmp_ulist);
4963        return ret;
4964}
4965
4966static void __free_extent_buffer(struct extent_buffer *eb)
4967{
4968        kmem_cache_free(extent_buffer_cache, eb);
4969}
4970
4971int extent_buffer_under_io(const struct extent_buffer *eb)
4972{
4973        return (atomic_read(&eb->io_pages) ||
4974                test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
4975                test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4976}
4977
4978/*
4979 * Release all pages attached to the extent buffer.
4980 */
4981static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
4982{
4983        int i;
4984        int num_pages;
4985        int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
4986
4987        BUG_ON(extent_buffer_under_io(eb));
4988
4989        num_pages = num_extent_pages(eb);
4990        for (i = 0; i < num_pages; i++) {
4991                struct page *page = eb->pages[i];
4992
4993                if (!page)
4994                        continue;
4995                if (mapped)
4996                        spin_lock(&page->mapping->private_lock);
4997                /*
4998                 * We do this since we'll remove the pages after we've
4999                 * removed the eb from the radix tree, so we could race
5000                 * and have this page now attached to the new eb.  So
5001                 * only clear page_private if it's still connected to
5002                 * this eb.
5003                 */
5004                if (PagePrivate(page) &&
5005                    page->private == (unsigned long)eb) {
5006                        BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
5007                        BUG_ON(PageDirty(page));
5008                        BUG_ON(PageWriteback(page));
5009                        /*
5010                         * We need to make sure we haven't been attached
5011                         * to a new eb.
5012                         */
5013                        detach_page_private(page);
5014                }
5015
5016                if (mapped)
5017                        spin_unlock(&page->mapping->private_lock);
5018
5019                /* One for when we allocated the page */
5020                put_page(page);
5021        }
5022}
5023
5024/*
5025 * Helper for releasing the extent buffer.
5026 */
5027static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
5028{
5029        btrfs_release_extent_buffer_pages(eb);
5030        btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
5031        __free_extent_buffer(eb);
5032}
5033
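/*
 * Allocate and initialize a bare extent_buffer for the given range; the
 * backing pages are attached by the callers. The allocation itself cannot
 * fail because __GFP_NOFAIL is used.
 */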
5034static struct extent_buffer *
5035__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
5036                      unsigned long len)
5037{
5038        struct extent_buffer *eb = NULL;
5039
5040        eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
5041        eb->start = start;
5042        eb->len = len;
5043        eb->fs_info = fs_info;
5044        eb->bflags = 0;
5045        init_rwsem(&eb->lock);
5046
5047        btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
5048                             &fs_info->allocated_ebs);
5049
5050        spin_lock_init(&eb->refs_lock);
5051        atomic_set(&eb->refs, 1);
5052        atomic_set(&eb->io_pages, 0);
5053
5054        ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
5055
5056        return eb;
5057}
5058
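/*
 * Clone an extent buffer into a new, unmapped buffer backed by freshly
 * allocated pages, copying the source contents page by page.
 */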
5059struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
5060{
5061        int i;
5062        struct page *p;
5063        struct extent_buffer *new;
5064        int num_pages = num_extent_pages(src);
5065
5066        new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
5067        if (new == NULL)
5068                return NULL;
5069
5070        for (i = 0; i < num_pages; i++) {
5071                p = alloc_page(GFP_NOFS);
5072                if (!p) {
5073                        btrfs_release_extent_buffer(new);
5074                        return NULL;
5075                }
5076                attach_extent_buffer_page(new, p);
5077                WARN_ON(PageDirty(p));
5078                SetPageUptodate(p);
5079                new->pages[i] = p;
5080                copy_page(page_address(p), page_address(src->pages[i]));
5081        }
5082
5083        set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
5084        set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
5085
5086        return new;
5087}
5088
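/*
 * Allocate an extent buffer that is not backed by the btree inode's page
 * cache: its pages come straight from alloc_page(), it is marked uptodate
 * with zero items and flagged EXTENT_BUFFER_UNMAPPED.
 */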
5089struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
5090                                                  u64 start, unsigned long len)
5091{
5092        struct extent_buffer *eb;
5093        int num_pages;
5094        int i;
5095
5096        eb = __alloc_extent_buffer(fs_info, start, len);
5097        if (!eb)
5098                return NULL;
5099
5100        num_pages = num_extent_pages(eb);
5101        for (i = 0; i < num_pages; i++) {
5102                eb->pages[i] = alloc_page(GFP_NOFS);
5103                if (!eb->pages[i])
5104                        goto err;
5105        }
5106        set_extent_buffer_uptodate(eb);
5107        btrfs_set_header_nritems(eb, 0);
5108        set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
5109
5110        return eb;
5111err:
5112        for (; i > 0; i--)
5113                __free_page(eb->pages[i - 1]);
5114        __free_extent_buffer(eb);
5115        return NULL;
5116}
5117
5118struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
5119                                                u64 start)
5120{
5121        return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
5122}
5123
5124static void check_buffer_tree_ref(struct extent_buffer *eb)
5125{
5126        int refs;
5127        /*
5128         * The TREE_REF bit is first set when the extent_buffer is added
5129         * to the radix tree. It is also set again, if it was cleared, when a
5130         * new reference is created by find_extent_buffer.
5131         *
5132         * It is only cleared in two cases: freeing the last non-tree
5133         * reference to the extent_buffer when its STALE bit is set or
5134         * calling releasepage when the tree reference is the only reference.
5135         *
5136         * In both cases, care is taken to ensure that the extent_buffer's
5137         * pages are not under IO. However, releasepage can be called
5138         * concurrently with the creation of new references, which is prone
5139         * to races between the calls to check_buffer_tree_ref in those
5140         * codepaths and the clearing of TREE_REF in try_release_extent_buffer.
5141         *
5142         * The actual lifetime of the extent_buffer in the radix tree is
5143         * adequately protected by the refcount, but the TREE_REF bit and
5144         * its corresponding reference are not. To protect against this
5145         * class of races, we call check_buffer_tree_ref from the codepaths
5146         * which trigger io after they set eb->io_pages. Note that once io is
5147         * initiated, TREE_REF can no longer be cleared, so that is the
5148         * moment at which any such race is best fixed.
5149         */
5150        refs = atomic_read(&eb->refs);
5151        if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5152                return;
5153
5154        spin_lock(&eb->refs_lock);
5155        if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5156                atomic_inc(&eb->refs);
5157        spin_unlock(&eb->refs_lock);
5158}
5159
5160static void mark_extent_buffer_accessed(struct extent_buffer *eb,
5161                struct page *accessed)
5162{
5163        int num_pages, i;
5164
5165        check_buffer_tree_ref(eb);
5166
5167        num_pages = num_extent_pages(eb);
5168        for (i = 0; i < num_pages; i++) {
5169                struct page *p = eb->pages[i];
5170
5171                if (p != accessed)
5172                        mark_page_accessed(p);
5173        }
5174}
5175
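/*
 * Look up an extent buffer in fs_info->buffer_radix by its start offset and
 * return it with an extra reference taken, or NULL if it is not present or
 * already on its way to being freed.
 */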
5176struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
5177                                         u64 start)
5178{
5179        struct extent_buffer *eb;
5180
5181        rcu_read_lock();
5182        eb = radix_tree_lookup(&fs_info->buffer_radix,
5183                               start >> fs_info->sectorsize_bits);
5184        if (eb && atomic_inc_not_zero(&eb->refs)) {
5185                rcu_read_unlock();
5186                /*
5187                 * Lock our eb's refs_lock to avoid races with
5188                 * free_extent_buffer. When we get our eb it might be flagged
5189                 * with EXTENT_BUFFER_STALE and another task running
5190                 * free_extent_buffer might have seen that flag set,
5191                 * eb->refs == 2, that the buffer isn't under IO (dirty and
5192                 * writeback flags not set) and it's still in the tree (flag
5193                 * EXTENT_BUFFER_TREE_REF set), therefore being in the process
5194                 * of decrementing the extent buffer's reference count twice.
5195                 * So here we could race and increment the eb's reference count,
5196                 * clear its stale flag, mark it as dirty and drop our reference
5197                 * before the other task finishes executing free_extent_buffer,
5198                 * which would later result in an attempt to free an extent
5199                 * buffer that is dirty.
5200                 */
5201                if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
5202                        spin_lock(&eb->refs_lock);
5203                        spin_unlock(&eb->refs_lock);
5204                }
5205                mark_extent_buffer_accessed(eb, NULL);
5206                return eb;
5207        }
5208        rcu_read_unlock();
5209
5210        return NULL;
5211}
5212
5213#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5214struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
5215                                        u64 start)
5216{
5217        struct extent_buffer *eb, *exists = NULL;
5218        int ret;
5219
5220        eb = find_extent_buffer(fs_info, start);
5221        if (eb)
5222                return eb;
5223        eb = alloc_dummy_extent_buffer(fs_info, start);
5224        if (!eb)
5225                return ERR_PTR(-ENOMEM);
5226        eb->fs_info = fs_info;
5227again:
5228        ret = radix_tree_preload(GFP_NOFS);
5229        if (ret) {
5230                exists = ERR_PTR(ret);
5231                goto free_eb;
5232        }
5233        spin_lock(&fs_info->buffer_lock);
5234        ret = radix_tree_insert(&fs_info->buffer_radix,
5235                                start >> fs_info->sectorsize_bits, eb);
5236        spin_unlock(&fs_info->buffer_lock);
5237        radix_tree_preload_end();
5238        if (ret == -EEXIST) {
5239                exists = find_extent_buffer(fs_info, start);
5240                if (exists)
5241                        goto free_eb;
5242                else
5243                        goto again;
5244        }
5245        check_buffer_tree_ref(eb);
5246        set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
5247
5248        return eb;
5249free_eb:
5250        btrfs_release_extent_buffer(eb);
5251        return exists;
5252}
5253#endif
5254
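/*
 * Find or allocate the extent buffer for the tree block at @start. The pages
 * come from the btree inode's mapping, the buffer is inserted into
 * fs_info->buffer_radix and a tree reference is taken. Returns an ERR_PTR on
 * bad alignment or allocation failure.
 */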
5255struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
5256                                          u64 start, u64 owner_root, int level)
5257{
5258        unsigned long len = fs_info->nodesize;
5259        int num_pages;
5260        int i;
5261        unsigned long index = start >> PAGE_SHIFT;
5262        struct extent_buffer *eb;
5263        struct extent_buffer *exists = NULL;
5264        struct page *p;
5265        struct address_space *mapping = fs_info->btree_inode->i_mapping;
5266        int uptodate = 1;
5267        int ret;
5268
5269        if (!IS_ALIGNED(start, fs_info->sectorsize)) {
5270                btrfs_err(fs_info, "bad tree block start %llu", start);
5271                return ERR_PTR(-EINVAL);
5272        }
5273
5274        if (fs_info->sectorsize < PAGE_SIZE &&
5275            offset_in_page(start) + len > PAGE_SIZE) {
5276                btrfs_err(fs_info,
5277                "tree block crosses page boundary, start %llu nodesize %lu",
5278                          start, len);
5279                return ERR_PTR(-EINVAL);
5280        }
5281
5282        eb = find_extent_buffer(fs_info, start);
5283        if (eb)
5284                return eb;
5285
5286        eb = __alloc_extent_buffer(fs_info, start, len);
5287        if (!eb)
5288                return ERR_PTR(-ENOMEM);
5289        btrfs_set_buffer_lockdep_class(owner_root, eb, level);
5290
5291        num_pages = num_extent_pages(eb);
5292        for (i = 0; i < num_pages; i++, index++) {
5293                p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
5294                if (!p) {
5295                        exists = ERR_PTR(-ENOMEM);
5296                        goto free_eb;
5297                }
5298
5299                spin_lock(&mapping->private_lock);
5300                if (PagePrivate(p)) {
5301                        /*
5302                         * We could have already allocated an eb for this page
5303                         * and attached one, so let's see if we can get a ref on
5304                         * the existing eb. If we can, we know it's good and
5305                         * we can just return that one; otherwise we know we can
5306                         * just overwrite page->private.
5307                         */
5308                        exists = (struct extent_buffer *)p->private;
5309                        if (atomic_inc_not_zero(&exists->refs)) {
5310                                spin_unlock(&mapping->private_lock);
5311                                unlock_page(p);
5312                                put_page(p);
5313                                mark_extent_buffer_accessed(exists, p);
5314                                goto free_eb;
5315                        }
5316                        exists = NULL;
5317
5318                        WARN_ON(PageDirty(p));
5319                        detach_page_private(p);
5320                }
5321                attach_extent_buffer_page(eb, p);
5322                spin_unlock(&mapping->private_lock);
5323                WARN_ON(PageDirty(p));
5324                eb->pages[i] = p;
5325                if (!PageUptodate(p))
5326                        uptodate = 0;
5327
5328                /*
5329                 * We can't unlock the pages just yet since the extent buffer
5330                 * hasn't been properly inserted in the radix tree, this
5331                 * opens a race with btree_releasepage which can free a page
5332                 * while we are still filling in all pages for the buffer and
5333                 * we could crash.
5334                 */
5335        }
5336        if (uptodate)
5337                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5338again:
5339        ret = radix_tree_preload(GFP_NOFS);
5340        if (ret) {
5341                exists = ERR_PTR(ret);
5342                goto free_eb;
5343        }
5344
5345        spin_lock(&fs_info->buffer_lock);
5346        ret = radix_tree_insert(&fs_info->buffer_radix,
5347                                start >> fs_info->sectorsize_bits, eb);
5348        spin_unlock(&fs_info->buffer_lock);
5349        radix_tree_preload_end();
5350        if (ret == -EEXIST) {
5351                exists = find_extent_buffer(fs_info, start);
5352                if (exists)
5353                        goto free_eb;
5354                else
5355                        goto again;
5356        }
5357        /* add one reference for the tree */
5358        check_buffer_tree_ref(eb);
5359        set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
5360
5361        /*
5362         * Now it's safe to unlock the pages because any calls to
5363         * btree_releasepage will correctly detect that a page belongs to a
5364         * live buffer and won't free them prematurely.
5365         */
5366        for (i = 0; i < num_pages; i++)
5367                unlock_page(eb->pages[i]);
5368        return eb;
5369
5370free_eb:
5371        WARN_ON(!atomic_dec_and_test(&eb->refs));
5372        for (i = 0; i < num_pages; i++) {
5373                if (eb->pages[i])
5374                        unlock_page(eb->pages[i]);
5375        }
5376
5377        btrfs_release_extent_buffer(eb);
5378        return exists;
5379}
5380
5381static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
5382{
5383        struct extent_buffer *eb =
5384                        container_of(head, struct extent_buffer, rcu_head);
5385
5386        __free_extent_buffer(eb);
5387}
5388
5389static int release_extent_buffer(struct extent_buffer *eb)
5390        __releases(&eb->refs_lock)
5391{
5392        lockdep_assert_held(&eb->refs_lock);
5393
5394        WARN_ON(atomic_read(&eb->refs) == 0);
5395        if (atomic_dec_and_test(&eb->refs)) {
5396                if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
5397                        struct btrfs_fs_info *fs_info = eb->fs_info;
5398
5399                        spin_unlock(&eb->refs_lock);
5400
5401                        spin_lock(&fs_info->buffer_lock);
5402                        radix_tree_delete(&fs_info->buffer_radix,
5403                                          eb->start >> fs_info->sectorsize_bits);
5404                        spin_unlock(&fs_info->buffer_lock);
5405                } else {
5406                        spin_unlock(&eb->refs_lock);
5407                }
5408
5409                btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
5410                /* Should be safe to release our pages at this point */
5411                btrfs_release_extent_buffer_pages(eb);
5412#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5413                if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
5414                        __free_extent_buffer(eb);
5415                        return 1;
5416                }
5417#endif
5418                call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
5419                return 1;
5420        }
5421        spin_unlock(&eb->refs_lock);
5422
5423        return 0;
5424}
5425
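/*
 * Drop a reference on the extent buffer. While plenty of references remain
 * the count is decremented locklessly with cmpxchg; otherwise the final
 * drops happen under eb->refs_lock in release_extent_buffer(), which frees
 * the buffer once the count reaches zero.
 */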
5426void free_extent_buffer(struct extent_buffer *eb)
5427{
5428        int refs;
5429        int old;
5430        if (!eb)
5431                return;
5432
5433        while (1) {
5434                refs = atomic_read(&eb->refs);
5435                if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
5436                    || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
5437                        refs == 1))
5438                        break;
5439                old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
5440                if (old == refs)
5441                        return;
5442        }
5443
5444        spin_lock(&eb->refs_lock);
5445        if (atomic_read(&eb->refs) == 2 &&
5446            test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
5447            !extent_buffer_under_io(eb) &&
5448            test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5449                atomic_dec(&eb->refs);
5450
5451        /*
5452         * I know this is terrible, but it's temporary until we stop tracking
5453         * the uptodate bits and such for the extent buffers.
5454         */
5455        release_extent_buffer(eb);
5456}
5457
5458void free_extent_buffer_stale(struct extent_buffer *eb)
5459{
5460        if (!eb)
5461                return;
5462
5463        spin_lock(&eb->refs_lock);
5464        set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
5465
5466        if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
5467            test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5468                atomic_dec(&eb->refs);
5469        release_extent_buffer(eb);
5470}
5471
5472void clear_extent_buffer_dirty(const struct extent_buffer *eb)
5473{
5474        int i;
5475        int num_pages;
5476        struct page *page;
5477
5478        num_pages = num_extent_pages(eb);
5479
5480        for (i = 0; i < num_pages; i++) {
5481                page = eb->pages[i];
5482                if (!PageDirty(page))
5483                        continue;
5484
5485                lock_page(page);
5486                WARN_ON(!PagePrivate(page));
5487
5488                clear_page_dirty_for_io(page);
5489                xa_lock_irq(&page->mapping->i_pages);
5490                if (!PageDirty(page))
5491                        __xa_clear_mark(&page->mapping->i_pages,
5492                                        page_index(page), PAGECACHE_TAG_DIRTY);
5493                xa_unlock_irq(&page->mapping->i_pages);
5494                ClearPageError(page);
5495                unlock_page(page);
5496        }
5497        WARN_ON(atomic_read(&eb->refs) == 0);
5498}
5499
5500bool set_extent_buffer_dirty(struct extent_buffer *eb)
5501{
5502        int i;
5503        int num_pages;
5504        bool was_dirty;
5505
5506        check_buffer_tree_ref(eb);
5507
5508        was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
5509
5510        num_pages = num_extent_pages(eb);
5511        WARN_ON(atomic_read(&eb->refs) == 0);
5512        WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
5513
5514        if (!was_dirty)
5515                for (i = 0; i < num_pages; i++)
5516                        set_page_dirty(eb->pages[i]);
5517
5518#ifdef CONFIG_BTRFS_DEBUG
5519        for (i = 0; i < num_pages; i++)
5520                ASSERT(PageDirty(eb->pages[i]));
5521#endif
5522
5523        return was_dirty;
5524}
5525
5526void clear_extent_buffer_uptodate(struct extent_buffer *eb)
5527{
5528        int i;
5529        struct page *page;
5530        int num_pages;
5531
5532        clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5533        num_pages = num_extent_pages(eb);
5534        for (i = 0; i < num_pages; i++) {
5535                page = eb->pages[i];
5536                if (page)
5537                        ClearPageUptodate(page);
5538        }
5539}
5540
5541void set_extent_buffer_uptodate(struct extent_buffer *eb)
5542{
5543        int i;
5544        struct page *page;
5545        int num_pages;
5546
5547        set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5548        num_pages = num_extent_pages(eb);
5549        for (i = 0; i < num_pages; i++) {
5550                page = eb->pages[i];
5551                SetPageUptodate(page);
5552        }
5553}
5554
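/*
 * Read the pages backing an extent buffer from disk. All pages are locked
 * first so the uptodate checks are stable, read bios are only submitted for
 * pages that are not uptodate, and with WAIT_COMPLETE the function waits for
 * the reads and returns -EIO if any page failed to become uptodate.
 */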
5555int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
5556{
5557        int i;
5558        struct page *page;
5559        int err;
5560        int ret = 0;
5561        int locked_pages = 0;
5562        int all_uptodate = 1;
5563        int num_pages;
5564        unsigned long num_reads = 0;
5565        struct bio *bio = NULL;
5566        unsigned long bio_flags = 0;
5567
5568        if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
5569                return 0;
5570
5571        num_pages = num_extent_pages(eb);
5572        for (i = 0; i < num_pages; i++) {
5573                page = eb->pages[i];
5574                if (wait == WAIT_NONE) {
5575                        if (!trylock_page(page))
5576                                goto unlock_exit;
5577                } else {
5578                        lock_page(page);
5579                }
5580                locked_pages++;
5581        }
5582        /*
5583         * We need to lock all pages first to make sure that
5584         * the uptodate bit of our pages won't be affected by
5585         * clear_extent_buffer_uptodate().
5586         */
5587        for (i = 0; i < num_pages; i++) {
5588                page = eb->pages[i];
5589                if (!PageUptodate(page)) {
5590                        num_reads++;
5591                        all_uptodate = 0;
5592                }
5593        }
5594
5595        if (all_uptodate) {
5596                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5597                goto unlock_exit;
5598        }
5599
5600        clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
5601        eb->read_mirror = 0;
5602        atomic_set(&eb->io_pages, num_reads);
5603        /*
5604         * It is possible for releasepage to clear the TREE_REF bit before we
5605         * set io_pages. See check_buffer_tree_ref for a more detailed comment.
5606         */
5607        check_buffer_tree_ref(eb);
5608        for (i = 0; i < num_pages; i++) {
5609                page = eb->pages[i];
5610
5611                if (!PageUptodate(page)) {
5612                        if (ret) {
5613                                atomic_dec(&eb->io_pages);
5614                                unlock_page(page);
5615                                continue;
5616                        }
5617
5618                        ClearPageError(page);
5619                        err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
5620                                         page, page_offset(page), PAGE_SIZE, 0,
5621                                         &bio, end_bio_extent_readpage,
5622                                         mirror_num, 0, 0, false);
5623                        if (err) {
5624                                /*
5625                                 * We failed to submit the bio so it's the
5626                                 * caller's responsibility to perform cleanup
5627                                 * i.e. unlock the page / set the error bit.
5628                                 */
5629                                ret = err;
5630                                SetPageError(page);
5631                                unlock_page(page);
5632                                atomic_dec(&eb->io_pages);
5633                        }
5634                } else {
5635                        unlock_page(page);
5636                }
5637        }
5638
5639        if (bio) {
5640                err = submit_one_bio(bio, mirror_num, bio_flags);
5641                if (err)
5642                        return err;
5643        }
5644
5645        if (ret || wait != WAIT_COMPLETE)
5646                return ret;
5647
5648        for (i = 0; i < num_pages; i++) {
5649                page = eb->pages[i];
5650                wait_on_page_locked(page);
5651                if (!PageUptodate(page))
5652                        ret = -EIO;
5653        }
5654
5655        return ret;
5656
5657unlock_exit:
5658        while (locked_pages > 0) {
5659                locked_pages--;
5660                page = eb->pages[locked_pages];
5661                unlock_page(page);
5662        }
5663        return ret;
5664}
5665
5666static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
5667                            unsigned long len)
5668{
5669        btrfs_warn(eb->fs_info,
5670                "access to eb bytenr %llu len %lu out of range start %lu len %lu",
5671                eb->start, eb->len, start, len);
5672        WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
5673
5674        return true;
5675}
5676
5677/*
5678 * Check if the [start, start + len) range is valid before reading/writing
5679 * the eb.
5680 * NOTE: @start and @len are offsets inside the eb, not logical addresses.
5681 *
5682 * The caller must not touch dst/src memory if this function returns an error.
5683 */
5684static inline int check_eb_range(const struct extent_buffer *eb,
5685                                 unsigned long start, unsigned long len)
5686{
5687        unsigned long offset;
5688
5689        /* start, start + len should not go beyond eb->len nor overflow */
5690        if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
5691                return report_eb_range(eb, start, len);
5692
5693        return false;
5694}
5695
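/*
 * Copy @len bytes starting at offset @start inside @eb into the kernel buffer
 * @dstv, crossing page boundaries as needed.  An out of range request only
 * triggers a warning and leaves @dstv untouched.
 */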
5696void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
5697                        unsigned long start, unsigned long len)
5698{
5699        size_t cur;
5700        size_t offset;
5701        struct page *page;
5702        char *kaddr;
5703        char *dst = (char *)dstv;
5704        unsigned long i = get_eb_page_index(start);
5705
5706        if (check_eb_range(eb, start, len))
5707                return;
5708
5709        offset = get_eb_offset_in_page(eb, start);
5710
5711        while (len > 0) {
5712                page = eb->pages[i];
5713
5714                cur = min(len, (PAGE_SIZE - offset));
5715                kaddr = page_address(page);
5716                memcpy(dst, kaddr + offset, cur);
5717
5718                dst += cur;
5719                len -= cur;
5720                offset = 0;
5721                i++;
5722        }
5723}
5724
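/*
 * Like read_extent_buffer(), but the destination is a user buffer and the copy
 * must not take page faults (copy_to_user_nofault()), so it is usable in
 * contexts where faulting in user memory is not allowed.
 *
 * Returns 0 on success or -EFAULT if any part of the copy fails.
 */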
5725int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
5726                                       void __user *dstv,
5727                                       unsigned long start, unsigned long len)
5728{
5729        size_t cur;
5730        size_t offset;
5731        struct page *page;
5732        char *kaddr;
5733        char __user *dst = (char __user *)dstv;
5734        unsigned long i = get_eb_page_index(start);
5735        int ret = 0;
5736
5737        WARN_ON(start > eb->len);
5738        WARN_ON(start + len > eb->len);
5739
5740        offset = get_eb_offset_in_page(eb, start);
5741
5742        while (len > 0) {
5743                page = eb->pages[i];
5744
5745                cur = min(len, (PAGE_SIZE - offset));
5746                kaddr = page_address(page);
5747                if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
5748                        ret = -EFAULT;
5749                        break;
5750                }
5751
5752                dst += cur;
5753                len -= cur;
5754                offset = 0;
5755                i++;
5756        }
5757
5758        return ret;
5759}
5760
5761int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
5762                         unsigned long start, unsigned long len)
5763{
5764        size_t cur;
5765        size_t offset;
5766        struct page *page;
5767        char *kaddr;
5768        char *ptr = (char *)ptrv;
5769        unsigned long i = get_eb_page_index(start);
5770        int ret = 0;
5771
5772        if (check_eb_range(eb, start, len))
5773                return -EINVAL;
5774
5775        offset = get_eb_offset_in_page(eb, start);
5776
5777        while (len > 0) {
5778                page = eb->pages[i];
5779
5780                cur = min(len, (PAGE_SIZE - offset));
5781
5782                kaddr = page_address(page);
5783                ret = memcmp(ptr, kaddr + offset, cur);
5784                if (ret)
5785                        break;
5786
5787                ptr += cur;
5788                len -= cur;
5789                offset = 0;
5790                i++;
5791        }
5792        return ret;
5793}
5794
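/*
 * The fsid and the chunk tree UUID live in struct btrfs_header at the very
 * start of the buffer, so the two helpers below only ever touch the first
 * page (the header fits well within one page, even in the subpage case).
 */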
5795void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
5796                const void *srcv)
5797{
5798        char *kaddr;
5799
5800        WARN_ON(!PageUptodate(eb->pages[0]));
5801        kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
5802        memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
5803                        BTRFS_FSID_SIZE);
5804}
5805
5806void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
5807{
5808        char *kaddr;
5809
5810        WARN_ON(!PageUptodate(eb->pages[0]));
5811        kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
5812        memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
5813                        BTRFS_FSID_SIZE);
5814}
5815
5816void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
5817                         unsigned long start, unsigned long len)
5818{
5819        size_t cur;
5820        size_t offset;
5821        struct page *page;
5822        char *kaddr;
5823        char *src = (char *)srcv;
5824        unsigned long i = get_eb_page_index(start);
5825
5826        if (check_eb_range(eb, start, len))
5827                return;
5828
5829        offset = get_eb_offset_in_page(eb, start);
5830
5831        while (len > 0) {
5832                page = eb->pages[i];
5833                WARN_ON(!PageUptodate(page));
5834
5835                cur = min(len, PAGE_SIZE - offset);
5836                kaddr = page_address(page);
5837                memcpy(kaddr + offset, src, cur);
5838
5839                src += cur;
5840                len -= cur;
5841                offset = 0;
5842                i++;
5843        }
5844}
5845
5846void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
5847                unsigned long len)
5848{
5849        size_t cur;
5850        size_t offset;
5851        struct page *page;
5852        char *kaddr;
5853        unsigned long i = get_eb_page_index(start);
5854
5855        if (check_eb_range(eb, start, len))
5856                return;
5857
5858        offset = get_eb_offset_in_page(eb, start);
5859
5860        while (len > 0) {
5861                page = eb->pages[i];
5862                WARN_ON(!PageUptodate(page));
5863
5864                cur = min(len, PAGE_SIZE - offset);
5865                kaddr = page_address(page);
5866                memset(kaddr + offset, 0, cur);
5867
5868                len -= cur;
5869                offset = 0;
5870                i++;
5871        }
5872}
5873
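/*
 * Copy the full contents of @src into @dst; both buffers must have the same
 * length.  With sectorsize == PAGE_SIZE every backing page is copied whole,
 * while in the subpage case the whole buffer lives inside a single page, so a
 * single memcpy of src->len bytes is enough.
 */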
5874void copy_extent_buffer_full(const struct extent_buffer *dst,
5875                             const struct extent_buffer *src)
5876{
5877        int i;
5878        int num_pages;
5879
5880        ASSERT(dst->len == src->len);
5881
5882        if (dst->fs_info->sectorsize == PAGE_SIZE) {
5883                num_pages = num_extent_pages(dst);
5884                for (i = 0; i < num_pages; i++)
5885                        copy_page(page_address(dst->pages[i]),
5886                                  page_address(src->pages[i]));
5887        } else {
5888                size_t src_offset = get_eb_offset_in_page(src, 0);
5889                size_t dst_offset = get_eb_offset_in_page(dst, 0);
5890
5891                ASSERT(src->fs_info->sectorsize < PAGE_SIZE);
5892                memcpy(page_address(dst->pages[0]) + dst_offset,
5893                       page_address(src->pages[0]) + src_offset,
5894                       src->len);
5895        }
5896}
5897
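/*
 * Copy @len bytes from @src_offset in @src into @dst_offset in @dst.  Unlike
 * memcpy_extent_buffer(), source and destination are two different extent
 * buffers (expected to have the same length).
 */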
5898void copy_extent_buffer(const struct extent_buffer *dst,
5899                        const struct extent_buffer *src,
5900                        unsigned long dst_offset, unsigned long src_offset,
5901                        unsigned long len)
5902{
5903        u64 dst_len = dst->len;
5904        size_t cur;
5905        size_t offset;
5906        struct page *page;
5907        char *kaddr;
5908        unsigned long i = get_eb_page_index(dst_offset);
5909
5910        if (check_eb_range(dst, dst_offset, len) ||
5911            check_eb_range(src, src_offset, len))
5912                return;
5913
5914        WARN_ON(src->len != dst_len);
5915
5916        offset = get_eb_offset_in_page(dst, dst_offset);
5917
5918        while (len > 0) {
5919                page = dst->pages[i];
5920                WARN_ON(!PageUptodate(page));
5921
5922                cur = min(len, (unsigned long)(PAGE_SIZE - offset));
5923
5924                kaddr = page_address(page);
5925                read_extent_buffer(src, kaddr + offset, src_offset, cur);
5926
5927                src_offset += cur;
5928                len -= cur;
5929                offset = 0;
5930                i++;
5931        }
5932}
5933
5934/*
5935 * eb_bitmap_offset() - calculate the page and offset of the byte containing the
5936 * given bit number
5937 * @eb: the extent buffer
5938 * @start: offset of the bitmap item in the extent buffer
5939 * @nr: bit number
5940 * @page_index: return index of the page in the extent buffer that contains the
5941 * given bit number
5942 * @page_offset: return offset into the page given by page_index
5943 *
5944 * This helper hides the ugliness of finding the byte in an extent buffer which
5945 * contains a given bit.
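 *
 * For example (hypothetical numbers): with 4K pages, an eb whose start is page
 * aligned, a bitmap item at @start = 100 and @nr = 70, the byte holding the
 * bit is byte 70 / 8 = 8 of the bitmap, so offset = 100 + 0 + 8 = 108, which
 * gives *page_index = 0 and *page_offset = 108; the bit itself is then bit
 * 70 % 8 = 6 of that byte.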
5946 */
5947static inline void eb_bitmap_offset(const struct extent_buffer *eb,
5948                                    unsigned long start, unsigned long nr,
5949                                    unsigned long *page_index,
5950                                    size_t *page_offset)
5951{
5952        size_t byte_offset = BIT_BYTE(nr);
5953        size_t offset;
5954
5955        /*
5956         * The byte we want is the offset of the extent buffer + the offset of
5957         * the bitmap item in the extent buffer + the offset of the byte in the
5958         * bitmap item.
5959         */
5960        offset = start + offset_in_page(eb->start) + byte_offset;
5961
5962        *page_index = offset >> PAGE_SHIFT;
5963        *page_offset = offset_in_page(offset);
5964}
5965
5966/**
5967 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
5968 * @eb: the extent buffer
5969 * @start: offset of the bitmap item in the extent buffer
5970 * @nr: bit number to test
5971 */
5972int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
5973                           unsigned long nr)
5974{
5975        u8 *kaddr;
5976        struct page *page;
5977        unsigned long i;
5978        size_t offset;
5979
5980        eb_bitmap_offset(eb, start, nr, &i, &offset);
5981        page = eb->pages[i];
5982        WARN_ON(!PageUptodate(page));
5983        kaddr = page_address(page);
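        /* Shift the wanted bit down to position 0 and mask everything else off. */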
5984        return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
5985}
5986
5987/**
5988 * extent_buffer_bitmap_set - set an area of a bitmap
5989 * @eb: the extent buffer
5990 * @start: offset of the bitmap item in the extent buffer
5991 * @pos: bit number of the first bit
5992 * @len: number of bits to set
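 *
 * The first and last, possibly partial, bytes are handled with
 * BITMAP_FIRST_BYTE_MASK()/BITMAP_LAST_BYTE_MASK(), whole bytes in between are
 * set directly, moving on to the next page whenever the byte offset crosses a
 * page boundary.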
5993 */
5994void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
5995                              unsigned long pos, unsigned long len)
5996{
5997        u8 *kaddr;
5998        struct page *page;
5999        unsigned long i;
6000        size_t offset;
6001        const unsigned int size = pos + len;
6002        int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
6003        u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
6004
6005        eb_bitmap_offset(eb, start, pos, &i, &offset);
6006        page = eb->pages[i];
6007        WARN_ON(!PageUptodate(page));
6008        kaddr = page_address(page);
6009
6010        while (len >= bits_to_set) {
6011                kaddr[offset] |= mask_to_set;
6012                len -= bits_to_set;
6013                bits_to_set = BITS_PER_BYTE;
6014                mask_to_set = ~0;
6015                if (++offset >= PAGE_SIZE && len > 0) {
6016                        offset = 0;
6017                        page = eb->pages[++i];
6018                        WARN_ON(!PageUptodate(page));
6019                        kaddr = page_address(page);
6020                }
6021        }
6022        if (len) {
6023                mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
6024                kaddr[offset] |= mask_to_set;
6025        }
6026}
6027
6029/**
6030 * extent_buffer_bitmap_clear - clear an area of a bitmap
6031 * @eb: the extent buffer
6032 * @start: offset of the bitmap item in the extent buffer
6033 * @pos: bit number of the first bit
6034 * @len: number of bits to clear
6035 */
6036void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
6037                                unsigned long start, unsigned long pos,
6038                                unsigned long len)
6039{
6040        u8 *kaddr;
6041        struct page *page;
6042        unsigned long i;
6043        size_t offset;
6044        const unsigned int size = pos + len;
6045        int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
6046        u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
6047
6048        eb_bitmap_offset(eb, start, pos, &i, &offset);
6049        page = eb->pages[i];
6050        WARN_ON(!PageUptodate(page));
6051        kaddr = page_address(page);
6052
6053        while (len >= bits_to_clear) {
6054                kaddr[offset] &= ~mask_to_clear;
6055                len -= bits_to_clear;
6056                bits_to_clear = BITS_PER_BYTE;
6057                mask_to_clear = ~0;
6058                if (++offset >= PAGE_SIZE && len > 0) {
6059                        offset = 0;
6060                        page = eb->pages[++i];
6061                        WARN_ON(!PageUptodate(page));
6062                        kaddr = page_address(page);
6063                }
6064        }
6065        if (len) {
6066                mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
6067                kaddr[offset] &= ~mask_to_clear;
6068        }
6069}
6070
6071static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
6072{
6073        unsigned long distance = (src > dst) ? src - dst : dst - src;
6074        return distance < len;
6075}
6076
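/*
 * Copy @len bytes from @src_off in @src_page to @dst_off in @dst_page.  When
 * source and destination are the same page and the two ranges overlap,
 * memmove() is used so the copy stays correct; otherwise a plain memcpy() is
 * enough.
 */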
6077static void copy_pages(struct page *dst_page, struct page *src_page,
6078                       unsigned long dst_off, unsigned long src_off,
6079                       unsigned long len)
6080{
6081        char *dst_kaddr = page_address(dst_page);
6082        char *src_kaddr;
6083        int must_memmove = 0;
6084
6085        if (dst_page != src_page) {
6086                src_kaddr = page_address(src_page);
6087        } else {
6088                src_kaddr = dst_kaddr;
6089                if (areas_overlap(src_off, dst_off, len))
6090                        must_memmove = 1;
6091        }
6092
6093        if (must_memmove)
6094                memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
6095        else
6096                memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
6097}
6098
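/*
 * Copy @len bytes from @src_offset to @dst_offset within the same extent
 * buffer @dst.  Overlaps inside a single page are handled by copy_pages()
 * falling back to memmove(); for a destination that overlaps and sits above
 * the source, use memmove_extent_buffer() instead.
 */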
6099void memcpy_extent_buffer(const struct extent_buffer *dst,
6100                          unsigned long dst_offset, unsigned long src_offset,
6101                          unsigned long len)
6102{
6103        size_t cur;
6104        size_t dst_off_in_page;
6105        size_t src_off_in_page;
6106        unsigned long dst_i;
6107        unsigned long src_i;
6108
6109        if (check_eb_range(dst, dst_offset, len) ||
6110            check_eb_range(dst, src_offset, len))
6111                return;
6112
6113        while (len > 0) {
6114                dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
6115                src_off_in_page = get_eb_offset_in_page(dst, src_offset);
6116
6117                dst_i = get_eb_page_index(dst_offset);
6118                src_i = get_eb_page_index(src_offset);
6119
6120                cur = min(len, (unsigned long)(PAGE_SIZE -
6121                                               src_off_in_page));
6122                cur = min_t(unsigned long, cur,
6123                        (unsigned long)(PAGE_SIZE - dst_off_in_page));
6124
6125                copy_pages(dst->pages[dst_i], dst->pages[src_i],
6126                           dst_off_in_page, src_off_in_page, cur);
6127
6128                src_offset += cur;
6129                dst_offset += cur;
6130                len -= cur;
6131        }
6132}
6133
6134void memmove_extent_buffer(const struct extent_buffer *dst,
6135                           unsigned long dst_offset, unsigned long src_offset,
6136                           unsigned long len)
6137{
6138        size_t cur;
6139        size_t dst_off_in_page;
6140        size_t src_off_in_page;
6141        unsigned long dst_end = dst_offset + len - 1;
6142        unsigned long src_end = src_offset + len - 1;
6143        unsigned long dst_i;
6144        unsigned long src_i;
6145
6146        if (check_eb_range(dst, dst_offset, len) ||
6147            check_eb_range(dst, src_offset, len))
6148                return;
6149        if (dst_offset < src_offset) {
6150                memcpy_extent_buffer(dst, dst_offset, src_offset, len);
6151                return;
6152        }
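        /*
         * The destination starts above the source and the ranges may overlap,
         * so copy backwards, chunk by chunk, starting from the last byte of
         * both ranges.
         */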
6153        while (len > 0) {
6154                dst_i = get_eb_page_index(dst_end);
6155                src_i = get_eb_page_index(src_end);
6156
6157                dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
6158                src_off_in_page = get_eb_offset_in_page(dst, src_end);
6159
6160                cur = min_t(unsigned long, len, src_off_in_page + 1);
6161                cur = min(cur, dst_off_in_page + 1);
6162                copy_pages(dst->pages[dst_i], dst->pages[src_i],
6163                           dst_off_in_page - cur + 1,
6164                           src_off_in_page - cur + 1, cur);
6165
6166                dst_end -= cur;
6167                src_end -= cur;
6168                len -= cur;
6169        }
6170}
6171
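/*
 * Attempt to release the extent buffer attached to @page so that the page
 * itself can be freed.  This only succeeds if nobody else holds a reference,
 * no I/O is in flight and the TREE_REF bit can be dropped, in which case
 * release_extent_buffer() does the actual freeing.
 *
 * Returns 1 if the page can be released, 0 otherwise.
 */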
6172int try_release_extent_buffer(struct page *page)
6173{
6174        struct extent_buffer *eb;
6175
6176        /*
6177         * We need to make sure nobody is attaching this page to an eb right
6178         * now.
6179         */
6180        spin_lock(&page->mapping->private_lock);
6181        if (!PagePrivate(page)) {
6182                spin_unlock(&page->mapping->private_lock);
6183                return 1;
6184        }
6185
6186        eb = (struct extent_buffer *)page->private;
6187        BUG_ON(!eb);
6188
6189        /*
6190         * This is a little awful but should be OK: we need to make sure that
6191         * the eb doesn't disappear out from under us while we're looking at
6192         * this page.
6193         */
6194        spin_lock(&eb->refs_lock);
6195        if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
6196                spin_unlock(&eb->refs_lock);
6197                spin_unlock(&page->mapping->private_lock);
6198                return 0;
6199        }
6200        spin_unlock(&page->mapping->private_lock);
6201
6202        /*
6203         * If tree ref isn't set then we know the ref on this eb is a real ref,
6204         * so just return; this page will likely be freed soon anyway.
6205         */
6206        if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
6207                spin_unlock(&eb->refs_lock);
6208                return 0;
6209        }
6210
6211        return release_extent_buffer(eb);
6212}
6213
6214/*
6215 * btrfs_readahead_tree_block - attempt to readahead a child block
6216 * @fs_info:    the fs_info
6217 * @bytenr:     bytenr to read
6218 * @owner_root: objectid of the root that owns this eb
6219 * @gen:        generation for the uptodate check, can be 0
6220 * @level:      level for the eb
6221 *
6222 * Attempt to readahead a tree block at @bytenr.  If @gen is 0 then we do a
6223 * normal uptodate check of the eb, without checking the generation.  If we have
6224 * to read the block we will not block on anything.
6225 */
6226void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
6227                                u64 bytenr, u64 owner_root, u64 gen, int level)
6228{
6229        struct extent_buffer *eb;
6230        int ret;
6231
6232        eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
6233        if (IS_ERR(eb))
6234                return;
6235
6236        if (btrfs_buffer_uptodate(eb, gen, 1)) {
6237                free_extent_buffer(eb);
6238                return;
6239        }
6240
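        /*
         * WAIT_NONE: just get the read going without blocking on anything.
         * If even the submission fails, drop the buffer as stale instead of
         * keeping it cached.
         */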
6241        ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
6242        if (ret < 0)
6243                free_extent_buffer_stale(eb);
6244        else
6245                free_extent_buffer(eb);
6246}
6247
6248/*
6249 * btrfs_readahead_node_child - readahead a node's child block
6250 * @node:       parent node we're reading from
6251 * @slot:       slot in the parent node for the child we want to read
6252 *
6253 * A helper for btrfs_readahead_tree_block: we simply read the block that the
6254 * given slot in the provided node points to.
6255 */
6256void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
6257{
6258        btrfs_readahead_tree_block(node->fs_info,
6259                                   btrfs_node_blockptr(node, slot),
6260                                   btrfs_header_owner(node),
6261                                   btrfs_node_ptr_generation(node, slot),
6262                                   btrfs_header_level(node) - 1);
6263}
6264