linux/fs/btrfs/extent_io.c
   1// SPDX-License-Identifier: GPL-2.0
   2
   3#include <linux/bitops.h>
   4#include <linux/slab.h>
   5#include <linux/bio.h>
   6#include <linux/mm.h>
   7#include <linux/pagemap.h>
   8#include <linux/page-flags.h>
   9#include <linux/spinlock.h>
  10#include <linux/blkdev.h>
  11#include <linux/swap.h>
  12#include <linux/writeback.h>
  13#include <linux/pagevec.h>
  14#include <linux/prefetch.h>
  15#include <linux/cleancache.h>
  16#include "extent_io.h"
  17#include "extent_map.h"
  18#include "ctree.h"
  19#include "btrfs_inode.h"
  20#include "volumes.h"
  21#include "check-integrity.h"
  22#include "locking.h"
  23#include "rcu-string.h"
  24#include "backref.h"
  25#include "disk-io.h"
  26
  27static struct kmem_cache *extent_state_cache;
  28static struct kmem_cache *extent_buffer_cache;
  29static struct bio_set btrfs_bioset;
  30
  31static inline bool extent_state_in_tree(const struct extent_state *state)
  32{
  33        return !RB_EMPTY_NODE(&state->rb_node);
  34}
  35
  36#ifdef CONFIG_BTRFS_DEBUG
  37static LIST_HEAD(buffers);
  38static LIST_HEAD(states);
  39
  40static DEFINE_SPINLOCK(leak_lock);
  41
  42static inline
  43void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
  44{
  45        unsigned long flags;
  46
  47        spin_lock_irqsave(&leak_lock, flags);
  48        list_add(new, head);
  49        spin_unlock_irqrestore(&leak_lock, flags);
  50}
  51
  52static inline
  53void btrfs_leak_debug_del(struct list_head *entry)
  54{
  55        unsigned long flags;
  56
  57        spin_lock_irqsave(&leak_lock, flags);
  58        list_del(entry);
  59        spin_unlock_irqrestore(&leak_lock, flags);
  60}
  61
  62static inline
  63void btrfs_leak_debug_check(void)
  64{
  65        struct extent_state *state;
  66        struct extent_buffer *eb;
  67
  68        while (!list_empty(&states)) {
  69                state = list_entry(states.next, struct extent_state, leak_list);
  70                pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
  71                       state->start, state->end, state->state,
  72                       extent_state_in_tree(state),
  73                       refcount_read(&state->refs));
  74                list_del(&state->leak_list);
  75                kmem_cache_free(extent_state_cache, state);
  76        }
  77
  78        while (!list_empty(&buffers)) {
  79                eb = list_entry(buffers.next, struct extent_buffer, leak_list);
  80                pr_err("BTRFS: buffer leak start %llu len %lu refs %d bflags %lu\n",
  81                       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags);
  82                list_del(&eb->leak_list);
  83                kmem_cache_free(extent_buffer_cache, eb);
  84        }
  85}
  86
  87#define btrfs_debug_check_extent_io_range(tree, start, end)             \
  88        __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
  89static inline void __btrfs_debug_check_extent_io_range(const char *caller,
  90                struct extent_io_tree *tree, u64 start, u64 end)
  91{
  92        struct inode *inode = tree->private_data;
  93        u64 isize;
  94
  95        if (!inode || !is_data_inode(inode))
  96                return;
  97
  98        isize = i_size_read(inode);
  99        if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
 100                btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
 101                    "%s: ino %llu isize %llu odd range [%llu,%llu]",
 102                        caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
 103        }
 104}
 105#else
 106#define btrfs_leak_debug_add(new, head) do {} while (0)
 107#define btrfs_leak_debug_del(entry)     do {} while (0)
 108#define btrfs_leak_debug_check()        do {} while (0)
 109#define btrfs_debug_check_extent_io_range(c, s, e)      do {} while (0)
 110#endif
 111
 112struct tree_entry {
 113        u64 start;
 114        u64 end;
 115        struct rb_node rb_node;
 116};
 117
 118struct extent_page_data {
 119        struct bio *bio;
 120        struct extent_io_tree *tree;
 121        /* tells writepage not to lock the state bits for this range
 122         * it still does the unlocking
 123         */
 124        unsigned int extent_locked:1;
 125
 126        /* tells the submit_bio code to use REQ_SYNC */
 127        unsigned int sync_io:1;
 128};
 129
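/*
 * Record in @changeset the bytes and the range covered by @state when the
 * requested @bits actually change, i.e. they are newly set (@set != 0) or
 * newly cleared (@set == 0).  Ranges already in the requested condition are
 * skipped.  The ulist insertion uses GFP_ATOMIC because callers hold the
 * tree spinlock.
 */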
 130static int add_extent_changeset(struct extent_state *state, unsigned bits,
 131                                 struct extent_changeset *changeset,
 132                                 int set)
 133{
 134        int ret;
 135
 136        if (!changeset)
 137                return 0;
 138        if (set && (state->state & bits) == bits)
 139                return 0;
 140        if (!set && (state->state & bits) == 0)
 141                return 0;
 142        changeset->bytes_changed += state->end - state->start + 1;
 143        ret = ulist_add(&changeset->range_changed, state->start, state->end,
 144                        GFP_ATOMIC);
 145        return ret;
 146}
 147
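/*
 * Hand a fully built bio to the lower layer.  Trees with registered ops route
 * the bio through their submit_bio_hook (the fs specific submission path for
 * data), everything else is submitted directly via btrfsic_submit_bio().
 * Returns 0 or a negative errno translated from the block layer status.
 */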
 148static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
 149                                       unsigned long bio_flags)
 150{
 151        blk_status_t ret = 0;
 152        struct extent_io_tree *tree = bio->bi_private;
 153
 154        bio->bi_private = NULL;
 155
 156        if (tree->ops)
 157                ret = tree->ops->submit_bio_hook(tree->private_data, bio,
 158                                                 mirror_num, bio_flags);
 159        else
 160                btrfsic_submit_bio(bio);
 161
 162        return blk_status_to_errno(ret);
 163}
 164
 165/* Cleanup unsubmitted bios */
 166static void end_write_bio(struct extent_page_data *epd, int ret)
 167{
 168        if (epd->bio) {
 169                epd->bio->bi_status = errno_to_blk_status(ret);
 170                bio_endio(epd->bio);
 171                epd->bio = NULL;
 172        }
 173}
 174
 175/*
 176 * Submit bio from extent page data via submit_one_bio
 177 *
 178 * Return 0 if everything is OK.
 179 * Return <0 for error.
 180 */
 181static int __must_check flush_write_bio(struct extent_page_data *epd)
 182{
 183        int ret = 0;
 184
 185        if (epd->bio) {
 186                ret = submit_one_bio(epd->bio, 0, 0);
 187                /*
 188                 * Clean up of epd->bio is handled by its endio function.
 189                 * And endio is either triggered by successful bio execution
 190                 * or the error handler of submit bio hook.
 191                 * So at this point, no matter what happened, we don't need
 192                 * to clean up epd->bio.
 193                 */
 194                epd->bio = NULL;
 195        }
 196        return ret;
 197}
 198
 199int __init extent_io_init(void)
 200{
 201        extent_state_cache = kmem_cache_create("btrfs_extent_state",
 202                        sizeof(struct extent_state), 0,
 203                        SLAB_MEM_SPREAD, NULL);
 204        if (!extent_state_cache)
 205                return -ENOMEM;
 206
 207        extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
 208                        sizeof(struct extent_buffer), 0,
 209                        SLAB_MEM_SPREAD, NULL);
 210        if (!extent_buffer_cache)
 211                goto free_state_cache;
 212
 213        if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
 214                        offsetof(struct btrfs_io_bio, bio),
 215                        BIOSET_NEED_BVECS))
 216                goto free_buffer_cache;
 217
 218        if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
 219                goto free_bioset;
 220
 221        return 0;
 222
 223free_bioset:
 224        bioset_exit(&btrfs_bioset);
 225
 226free_buffer_cache:
 227        kmem_cache_destroy(extent_buffer_cache);
 228        extent_buffer_cache = NULL;
 229
 230free_state_cache:
 231        kmem_cache_destroy(extent_state_cache);
 232        extent_state_cache = NULL;
 233        return -ENOMEM;
 234}
 235
 236void __cold extent_io_exit(void)
 237{
 238        btrfs_leak_debug_check();
 239
 240        /*
 241         * Make sure all delayed rcu free are flushed before we
 242         * destroy caches.
 243         */
 244        rcu_barrier();
 245        kmem_cache_destroy(extent_state_cache);
 246        kmem_cache_destroy(extent_buffer_cache);
 247        bioset_exit(&btrfs_bioset);
 248}
 249
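/*
 * Initialize an extent io tree: empty rbtree, no ops, zeroed dirty byte
 * counter.  @owner identifies which of the fs's io trees this is and
 * @private_data is typically the owning inode for data trees (it is what
 * is_data_inode() is checked against throughout this file).
 */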
 250void extent_io_tree_init(struct btrfs_fs_info *fs_info,
 251                         struct extent_io_tree *tree, unsigned int owner,
 252                         void *private_data)
 253{
 254        tree->fs_info = fs_info;
 255        tree->state = RB_ROOT;
 256        tree->ops = NULL;
 257        tree->dirty_bytes = 0;
 258        spin_lock_init(&tree->lock);
 259        tree->private_data = private_data;
 260        tree->owner = owner;
 261}
 262
 263void extent_io_tree_release(struct extent_io_tree *tree)
 264{
 265        spin_lock(&tree->lock);
 266        /*
 267         * Do a single barrier for the waitqueue_active check here, the state
 268         * of the waitqueue should not change once extent_io_tree_release is
 269         * called.
 270         */
 271        smp_mb();
 272        while (!RB_EMPTY_ROOT(&tree->state)) {
 273                struct rb_node *node;
 274                struct extent_state *state;
 275
 276                node = rb_first(&tree->state);
 277                state = rb_entry(node, struct extent_state, rb_node);
 278                rb_erase(&state->rb_node, &tree->state);
 279                RB_CLEAR_NODE(&state->rb_node);
 280                /*
 281                 * btree io trees aren't supposed to have tasks waiting for
 282                 * changes in the flags of extent states ever.
 283                 */
 284                ASSERT(!waitqueue_active(&state->wq));
 285                free_extent_state(state);
 286
 287                cond_resched_lock(&tree->lock);
 288        }
 289        spin_unlock(&tree->lock);
 290}
 291
 292static struct extent_state *alloc_extent_state(gfp_t mask)
 293{
 294        struct extent_state *state;
 295
 296        /*
  297         * The given mask might not be appropriate for the slab allocator,
 298         * drop the unsupported bits
 299         */
 300        mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
 301        state = kmem_cache_alloc(extent_state_cache, mask);
 302        if (!state)
 303                return state;
 304        state->state = 0;
 305        state->failrec = NULL;
 306        RB_CLEAR_NODE(&state->rb_node);
 307        btrfs_leak_debug_add(&state->leak_list, &states);
 308        refcount_set(&state->refs, 1);
 309        init_waitqueue_head(&state->wq);
 310        trace_alloc_extent_state(state, mask, _RET_IP_);
 311        return state;
 312}
 313
 314void free_extent_state(struct extent_state *state)
 315{
 316        if (!state)
 317                return;
 318        if (refcount_dec_and_test(&state->refs)) {
 319                WARN_ON(extent_state_in_tree(state));
 320                btrfs_leak_debug_del(&state->leak_list);
 321                trace_free_extent_state(state, _RET_IP_);
 322                kmem_cache_free(extent_state_cache, state);
 323        }
 324}
 325
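/*
 * Link @node into @root keyed by @offset (callers pass the end of the range
 * being inserted).  If the caller already searched, the precomputed slot and
 * parent are taken from @p_in/@parent_in; otherwise the walk starts at
 * @search_start, or at the root when no hint is given.  Returns the existing
 * node whose range contains @offset, in which case nothing is inserted, or
 * NULL on success.
 */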
 326static struct rb_node *tree_insert(struct rb_root *root,
 327                                   struct rb_node *search_start,
 328                                   u64 offset,
 329                                   struct rb_node *node,
 330                                   struct rb_node ***p_in,
 331                                   struct rb_node **parent_in)
 332{
 333        struct rb_node **p;
 334        struct rb_node *parent = NULL;
 335        struct tree_entry *entry;
 336
 337        if (p_in && parent_in) {
 338                p = *p_in;
 339                parent = *parent_in;
 340                goto do_insert;
 341        }
 342
 343        p = search_start ? &search_start : &root->rb_node;
 344        while (*p) {
 345                parent = *p;
 346                entry = rb_entry(parent, struct tree_entry, rb_node);
 347
 348                if (offset < entry->start)
 349                        p = &(*p)->rb_left;
 350                else if (offset > entry->end)
 351                        p = &(*p)->rb_right;
 352                else
 353                        return parent;
 354        }
 355
 356do_insert:
 357        rb_link_node(node, parent, p);
 358        rb_insert_color(node, root);
 359        return NULL;
 360}
 361
 362/**
  363 * __etree_search - search @tree for an entry that contains @offset. Such
 364 * entry would have entry->start <= offset && entry->end >= offset.
 365 *
 366 * @tree - the tree to search
 367 * @offset - offset that should fall within an entry in @tree
 368 * @next_ret - pointer to the first entry whose range ends after @offset
  369 * @prev_ret - pointer to the first entry whose range begins before @offset
 370 * @p_ret - pointer where new node should be anchored (used when inserting an
 371 *          entry in the tree)
  372 * @parent_ret - points to the entry which would have been the parent of
  373 *               the entry containing @offset
 374 *
 375 * This function returns a pointer to the entry that contains @offset byte
 376 * address. If no such entry exists, then NULL is returned and the other
 377 * pointer arguments to the function are filled, otherwise the found entry is
 378 * returned and other pointers are left untouched.
 379 */
 380static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 381                                      struct rb_node **next_ret,
 382                                      struct rb_node **prev_ret,
 383                                      struct rb_node ***p_ret,
 384                                      struct rb_node **parent_ret)
 385{
 386        struct rb_root *root = &tree->state;
 387        struct rb_node **n = &root->rb_node;
 388        struct rb_node *prev = NULL;
 389        struct rb_node *orig_prev = NULL;
 390        struct tree_entry *entry;
 391        struct tree_entry *prev_entry = NULL;
 392
 393        while (*n) {
 394                prev = *n;
 395                entry = rb_entry(prev, struct tree_entry, rb_node);
 396                prev_entry = entry;
 397
 398                if (offset < entry->start)
 399                        n = &(*n)->rb_left;
 400                else if (offset > entry->end)
 401                        n = &(*n)->rb_right;
 402                else
 403                        return *n;
 404        }
 405
 406        if (p_ret)
 407                *p_ret = n;
 408        if (parent_ret)
 409                *parent_ret = prev;
 410
 411        if (next_ret) {
 412                orig_prev = prev;
 413                while (prev && offset > prev_entry->end) {
 414                        prev = rb_next(prev);
 415                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 416                }
 417                *next_ret = prev;
 418                prev = orig_prev;
 419        }
 420
 421        if (prev_ret) {
 422                prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 423                while (prev && offset < prev_entry->start) {
 424                        prev = rb_prev(prev);
 425                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 426                }
 427                *prev_ret = prev;
 428        }
 429        return NULL;
 430}
 431
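/*
 * Search @tree for the entry containing @offset.  If there is none, return
 * the next entry after @offset instead (NULL if @offset is past the last
 * entry) and fill @p_ret/@parent_ret so the caller can insert a new state
 * without repeating the search.
 */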
 432static inline struct rb_node *
 433tree_search_for_insert(struct extent_io_tree *tree,
 434                       u64 offset,
 435                       struct rb_node ***p_ret,
 436                       struct rb_node **parent_ret)
 437{
  438        struct rb_node *next = NULL;
 439        struct rb_node *ret;
 440
 441        ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret);
 442        if (!ret)
 443                return next;
 444        return ret;
 445}
 446
 447static inline struct rb_node *tree_search(struct extent_io_tree *tree,
 448                                          u64 offset)
 449{
 450        return tree_search_for_insert(tree, offset, NULL, NULL);
 451}
 452
 453/*
 454 * utility function to look for merge candidates inside a given range.
 455 * Any extents with matching state are merged together into a single
  456 * extent in the tree.  Extents with EXTENT_LOCKED or EXTENT_BOUNDARY set
 457 * are not merged because the end_io handlers need to be able to do
 458 * operations on them without sleeping (or doing allocations/splits).
 459 *
 460 * This should be called with the tree lock held.
 461 */
 462static void merge_state(struct extent_io_tree *tree,
 463                        struct extent_state *state)
 464{
 465        struct extent_state *other;
 466        struct rb_node *other_node;
 467
 468        if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
 469                return;
 470
 471        other_node = rb_prev(&state->rb_node);
 472        if (other_node) {
 473                other = rb_entry(other_node, struct extent_state, rb_node);
 474                if (other->end == state->start - 1 &&
 475                    other->state == state->state) {
 476                        if (tree->private_data &&
 477                            is_data_inode(tree->private_data))
 478                                btrfs_merge_delalloc_extent(tree->private_data,
 479                                                            state, other);
 480                        state->start = other->start;
 481                        rb_erase(&other->rb_node, &tree->state);
 482                        RB_CLEAR_NODE(&other->rb_node);
 483                        free_extent_state(other);
 484                }
 485        }
 486        other_node = rb_next(&state->rb_node);
 487        if (other_node) {
 488                other = rb_entry(other_node, struct extent_state, rb_node);
 489                if (other->start == state->end + 1 &&
 490                    other->state == state->state) {
 491                        if (tree->private_data &&
 492                            is_data_inode(tree->private_data))
 493                                btrfs_merge_delalloc_extent(tree->private_data,
 494                                                            state, other);
 495                        state->end = other->end;
 496                        rb_erase(&other->rb_node, &tree->state);
 497                        RB_CLEAR_NODE(&other->rb_node);
 498                        free_extent_state(other);
 499                }
 500        }
 501}
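
/*
 * Example (illustrative): if EXTENT_DELALLOC is set on [0, 4095] while the
 * adjacent state [4096, 8191] already carries exactly the same bits, the two
 * are collapsed into a single [0, 8191] state here, provided neither of them
 * has EXTENT_LOCKED or EXTENT_BOUNDARY set.
 */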
 502
 503static void set_state_bits(struct extent_io_tree *tree,
 504                           struct extent_state *state, unsigned *bits,
 505                           struct extent_changeset *changeset);
 506
 507/*
 508 * insert an extent_state struct into the tree.  'bits' are set on the
 509 * struct before it is inserted.
 510 *
 511 * This may return -EEXIST if the extent is already there, in which case the
 512 * state struct is freed.
 513 *
 514 * The tree lock is not taken internally.  This is a utility function and
 515 * probably isn't what you want to call (see set/clear_extent_bit).
 516 */
 517static int insert_state(struct extent_io_tree *tree,
 518                        struct extent_state *state, u64 start, u64 end,
 519                        struct rb_node ***p,
 520                        struct rb_node **parent,
 521                        unsigned *bits, struct extent_changeset *changeset)
 522{
 523        struct rb_node *node;
 524
 525        if (end < start) {
 526                btrfs_err(tree->fs_info,
 527                        "insert state: end < start %llu %llu", end, start);
 528                WARN_ON(1);
 529        }
 530        state->start = start;
 531        state->end = end;
 532
 533        set_state_bits(tree, state, bits, changeset);
 534
 535        node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
 536        if (node) {
 537                struct extent_state *found;
 538                found = rb_entry(node, struct extent_state, rb_node);
 539                btrfs_err(tree->fs_info,
 540                       "found node %llu %llu on insert of %llu %llu",
 541                       found->start, found->end, start, end);
 542                return -EEXIST;
 543        }
 544        merge_state(tree, state);
 545        return 0;
 546}
 547
 548/*
 549 * split a given extent state struct in two, inserting the preallocated
 550 * struct 'prealloc' as the newly created second half.  'split' indicates an
 551 * offset inside 'orig' where it should be split.
 552 *
 553 * Before calling,
 554 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 555 * are two extent state structs in the tree:
 556 * prealloc: [orig->start, split - 1]
 557 * orig: [ split, orig->end ]
 558 *
 559 * The tree locks are not taken by this function. They need to be held
 560 * by the caller.
 561 */
 562static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 563                       struct extent_state *prealloc, u64 split)
 564{
 565        struct rb_node *node;
 566
 567        if (tree->private_data && is_data_inode(tree->private_data))
 568                btrfs_split_delalloc_extent(tree->private_data, orig, split);
 569
 570        prealloc->start = orig->start;
 571        prealloc->end = split - 1;
 572        prealloc->state = orig->state;
 573        orig->start = split;
 574
 575        node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
 576                           &prealloc->rb_node, NULL, NULL);
 577        if (node) {
 578                free_extent_state(prealloc);
 579                return -EEXIST;
 580        }
 581        return 0;
 582}
 583
 584static struct extent_state *next_state(struct extent_state *state)
 585{
 586        struct rb_node *next = rb_next(&state->rb_node);
 587        if (next)
 588                return rb_entry(next, struct extent_state, rb_node);
 589        else
 590                return NULL;
 591}
 592
 593/*
 594 * utility function to clear some bits in an extent state struct.
 595 * it will optionally wake up anyone waiting on this state (wake == 1).
 596 *
 597 * If no bits are set on the state struct after clearing things, the
 598 * struct is freed and removed from the tree
 599 */
 600static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
 601                                            struct extent_state *state,
 602                                            unsigned *bits, int wake,
 603                                            struct extent_changeset *changeset)
 604{
 605        struct extent_state *next;
 606        unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
 607        int ret;
 608
 609        if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
 610                u64 range = state->end - state->start + 1;
 611                WARN_ON(range > tree->dirty_bytes);
 612                tree->dirty_bytes -= range;
 613        }
 614
 615        if (tree->private_data && is_data_inode(tree->private_data))
 616                btrfs_clear_delalloc_extent(tree->private_data, state, bits);
 617
 618        ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
 619        BUG_ON(ret < 0);
 620        state->state &= ~bits_to_clear;
 621        if (wake)
 622                wake_up(&state->wq);
 623        if (state->state == 0) {
 624                next = next_state(state);
 625                if (extent_state_in_tree(state)) {
 626                        rb_erase(&state->rb_node, &tree->state);
 627                        RB_CLEAR_NODE(&state->rb_node);
 628                        free_extent_state(state);
 629                } else {
 630                        WARN_ON(1);
 631                }
 632        } else {
 633                merge_state(tree, state);
 634                next = next_state(state);
 635        }
 636        return next;
 637}
 638
 639static struct extent_state *
 640alloc_extent_state_atomic(struct extent_state *prealloc)
 641{
 642        if (!prealloc)
 643                prealloc = alloc_extent_state(GFP_ATOMIC);
 644
 645        return prealloc;
 646}
 647
 648static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
 649{
 650        struct inode *inode = tree->private_data;
 651
 652        btrfs_panic(btrfs_sb(inode->i_sb), err,
 653        "locking error: extent tree was modified by another thread while locked");
 654}
 655
 656/*
 657 * clear some bits on a range in the tree.  This may require splitting
 658 * or inserting elements in the tree, so the gfp mask is used to
 659 * indicate which allocations or sleeping are allowed.
 660 *
 661 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 662 * the given range from the tree regardless of state (ie for truncate).
 663 *
 664 * the range [start, end] is inclusive.
 665 *
 666 * This takes the tree lock, and returns 0 on success and < 0 on error.
 667 */
 668int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 669                              unsigned bits, int wake, int delete,
 670                              struct extent_state **cached_state,
 671                              gfp_t mask, struct extent_changeset *changeset)
 672{
 673        struct extent_state *state;
 674        struct extent_state *cached;
 675        struct extent_state *prealloc = NULL;
 676        struct rb_node *node;
 677        u64 last_end;
 678        int err;
 679        int clear = 0;
 680
 681        btrfs_debug_check_extent_io_range(tree, start, end);
 682        trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
 683
 684        if (bits & EXTENT_DELALLOC)
 685                bits |= EXTENT_NORESERVE;
 686
 687        if (delete)
 688                bits |= ~EXTENT_CTLBITS;
 689
 690        if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
 691                clear = 1;
 692again:
 693        if (!prealloc && gfpflags_allow_blocking(mask)) {
 694                /*
 695                 * Don't care for allocation failure here because we might end
 696                 * up not needing the pre-allocated extent state at all, which
 697                 * is the case if we only have in the tree extent states that
  698         * cover our input range and don't cover any other range.
 699                 * If we end up needing a new extent state we allocate it later.
 700                 */
 701                prealloc = alloc_extent_state(mask);
 702        }
 703
 704        spin_lock(&tree->lock);
 705        if (cached_state) {
 706                cached = *cached_state;
 707
 708                if (clear) {
 709                        *cached_state = NULL;
 710                        cached_state = NULL;
 711                }
 712
 713                if (cached && extent_state_in_tree(cached) &&
 714                    cached->start <= start && cached->end > start) {
 715                        if (clear)
 716                                refcount_dec(&cached->refs);
 717                        state = cached;
 718                        goto hit_next;
 719                }
 720                if (clear)
 721                        free_extent_state(cached);
 722        }
 723        /*
 724         * this search will find the extents that end after
 725         * our range starts
 726         */
 727        node = tree_search(tree, start);
 728        if (!node)
 729                goto out;
 730        state = rb_entry(node, struct extent_state, rb_node);
 731hit_next:
 732        if (state->start > end)
 733                goto out;
 734        WARN_ON(state->end < start);
 735        last_end = state->end;
 736
 737        /* the state doesn't have the wanted bits, go ahead */
 738        if (!(state->state & bits)) {
 739                state = next_state(state);
 740                goto next;
 741        }
 742
 743        /*
 744         *     | ---- desired range ---- |
 745         *  | state | or
 746         *  | ------------- state -------------- |
 747         *
 748         * We need to split the extent we found, and may flip
 749         * bits on second half.
 750         *
 751         * If the extent we found extends past our range, we
 752         * just split and search again.  It'll get split again
 753         * the next time though.
 754         *
 755         * If the extent we found is inside our range, we clear
 756         * the desired bit on it.
 757         */
 758
 759        if (state->start < start) {
 760                prealloc = alloc_extent_state_atomic(prealloc);
 761                BUG_ON(!prealloc);
 762                err = split_state(tree, state, prealloc, start);
 763                if (err)
 764                        extent_io_tree_panic(tree, err);
 765
 766                prealloc = NULL;
 767                if (err)
 768                        goto out;
 769                if (state->end <= end) {
 770                        state = clear_state_bit(tree, state, &bits, wake,
 771                                                changeset);
 772                        goto next;
 773                }
 774                goto search_again;
 775        }
 776        /*
 777         * | ---- desired range ---- |
 778         *                        | state |
 779         * We need to split the extent, and clear the bit
 780         * on the first half
 781         */
 782        if (state->start <= end && state->end > end) {
 783                prealloc = alloc_extent_state_atomic(prealloc);
 784                BUG_ON(!prealloc);
 785                err = split_state(tree, state, prealloc, end + 1);
 786                if (err)
 787                        extent_io_tree_panic(tree, err);
 788
 789                if (wake)
 790                        wake_up(&state->wq);
 791
 792                clear_state_bit(tree, prealloc, &bits, wake, changeset);
 793
 794                prealloc = NULL;
 795                goto out;
 796        }
 797
 798        state = clear_state_bit(tree, state, &bits, wake, changeset);
 799next:
 800        if (last_end == (u64)-1)
 801                goto out;
 802        start = last_end + 1;
 803        if (start <= end && state && !need_resched())
 804                goto hit_next;
 805
 806search_again:
 807        if (start > end)
 808                goto out;
 809        spin_unlock(&tree->lock);
 810        if (gfpflags_allow_blocking(mask))
 811                cond_resched();
 812        goto again;
 813
 814out:
 815        spin_unlock(&tree->lock);
 816        if (prealloc)
 817                free_extent_state(prealloc);
 818
 819        return 0;
 820
 821}
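
/*
 * Example (illustrative): if the tree holds a single [0, 16383] state with
 * EXTENT_DELALLOC set, then clearing EXTENT_DELALLOC on [4096, 8191] splits
 * it into [0, 4095], [4096, 8191] and [8192, 16383], clears the bit on the
 * middle piece and, since no bits remain there, removes and frees that state,
 * leaving the two outer pieces untouched.
 */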
 822
 823static void wait_on_state(struct extent_io_tree *tree,
 824                          struct extent_state *state)
 825                __releases(tree->lock)
 826                __acquires(tree->lock)
 827{
 828        DEFINE_WAIT(wait);
 829        prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
 830        spin_unlock(&tree->lock);
 831        schedule();
 832        spin_lock(&tree->lock);
 833        finish_wait(&state->wq, &wait);
 834}
 835
 836/*
 837 * waits for one or more bits to clear on a range in the state tree.
 838 * The range [start, end] is inclusive.
 839 * The tree lock is taken by this function
 840 */
 841static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 842                            unsigned long bits)
 843{
 844        struct extent_state *state;
 845        struct rb_node *node;
 846
 847        btrfs_debug_check_extent_io_range(tree, start, end);
 848
 849        spin_lock(&tree->lock);
 850again:
 851        while (1) {
 852                /*
 853                 * this search will find all the extents that end after
 854                 * our range starts
 855                 */
 856                node = tree_search(tree, start);
 857process_node:
 858                if (!node)
 859                        break;
 860
 861                state = rb_entry(node, struct extent_state, rb_node);
 862
 863                if (state->start > end)
 864                        goto out;
 865
 866                if (state->state & bits) {
 867                        start = state->start;
 868                        refcount_inc(&state->refs);
 869                        wait_on_state(tree, state);
 870                        free_extent_state(state);
 871                        goto again;
 872                }
 873                start = state->end + 1;
 874
 875                if (start > end)
 876                        break;
 877
 878                if (!cond_resched_lock(&tree->lock)) {
 879                        node = rb_next(node);
 880                        goto process_node;
 881                }
 882        }
 883out:
 884        spin_unlock(&tree->lock);
 885}
 886
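/*
 * Set bits on a single state: the delalloc hook runs for data inodes, dirty
 * byte accounting is updated when EXTENT_DIRTY is newly set, the change is
 * recorded in the optional @changeset and finally the bits (minus the control
 * bits) are applied.  Counterpart of clear_state_bit().
 */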
 887static void set_state_bits(struct extent_io_tree *tree,
 888                           struct extent_state *state,
 889                           unsigned *bits, struct extent_changeset *changeset)
 890{
 891        unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
 892        int ret;
 893
 894        if (tree->private_data && is_data_inode(tree->private_data))
 895                btrfs_set_delalloc_extent(tree->private_data, state, bits);
 896
 897        if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
 898                u64 range = state->end - state->start + 1;
 899                tree->dirty_bytes += range;
 900        }
 901        ret = add_extent_changeset(state, bits_to_set, changeset, 1);
 902        BUG_ON(ret < 0);
 903        state->state |= bits_to_set;
 904}
 905
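/*
 * Stash @state in *@cached_ptr, taking an extra reference, if the caller
 * asked for caching, nothing is cached yet and the state carries at least one
 * of @flags (or @flags is 0).  The reference is dropped later through
 * free_extent_state(), typically when the cached state is consumed or
 * released by the caller.
 */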
 906static void cache_state_if_flags(struct extent_state *state,
 907                                 struct extent_state **cached_ptr,
 908                                 unsigned flags)
 909{
 910        if (cached_ptr && !(*cached_ptr)) {
 911                if (!flags || (state->state & flags)) {
 912                        *cached_ptr = state;
 913                        refcount_inc(&state->refs);
 914                }
 915        }
 916}
 917
 918static void cache_state(struct extent_state *state,
 919                        struct extent_state **cached_ptr)
 920{
 921        return cache_state_if_flags(state, cached_ptr,
 922                                    EXTENT_LOCKED | EXTENT_BOUNDARY);
 923}
 924
 925/*
 926 * set some bits on a range in the tree.  This may require allocations or
 927 * sleeping, so the gfp mask is used to indicate what is allowed.
 928 *
 929 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 930 * part of the range already has the desired bits set.  The start of the
 931 * existing range is returned in failed_start in this case.
 932 *
  933 * [start, end] is inclusive.  This takes the tree lock.
 934 */
 935
 936static int __must_check
 937__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 938                 unsigned bits, unsigned exclusive_bits,
 939                 u64 *failed_start, struct extent_state **cached_state,
 940                 gfp_t mask, struct extent_changeset *changeset)
 941{
 942        struct extent_state *state;
 943        struct extent_state *prealloc = NULL;
 944        struct rb_node *node;
 945        struct rb_node **p;
 946        struct rb_node *parent;
 947        int err = 0;
 948        u64 last_start;
 949        u64 last_end;
 950
 951        btrfs_debug_check_extent_io_range(tree, start, end);
 952        trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
 953
 954again:
 955        if (!prealloc && gfpflags_allow_blocking(mask)) {
 956                /*
 957                 * Don't care for allocation failure here because we might end
 958                 * up not needing the pre-allocated extent state at all, which
 959                 * is the case if we only have in the tree extent states that
  960         * cover our input range and don't cover any other range.
 961                 * If we end up needing a new extent state we allocate it later.
 962                 */
 963                prealloc = alloc_extent_state(mask);
 964        }
 965
 966        spin_lock(&tree->lock);
 967        if (cached_state && *cached_state) {
 968                state = *cached_state;
 969                if (state->start <= start && state->end > start &&
 970                    extent_state_in_tree(state)) {
 971                        node = &state->rb_node;
 972                        goto hit_next;
 973                }
 974        }
 975        /*
 976         * this search will find all the extents that end after
 977         * our range starts.
 978         */
 979        node = tree_search_for_insert(tree, start, &p, &parent);
 980        if (!node) {
 981                prealloc = alloc_extent_state_atomic(prealloc);
 982                BUG_ON(!prealloc);
 983                err = insert_state(tree, prealloc, start, end,
 984                                   &p, &parent, &bits, changeset);
 985                if (err)
 986                        extent_io_tree_panic(tree, err);
 987
 988                cache_state(prealloc, cached_state);
 989                prealloc = NULL;
 990                goto out;
 991        }
 992        state = rb_entry(node, struct extent_state, rb_node);
 993hit_next:
 994        last_start = state->start;
 995        last_end = state->end;
 996
 997        /*
 998         * | ---- desired range ---- |
 999         * | state |
1000         *
1001         * Just lock what we found and keep going
1002         */
1003        if (state->start == start && state->end <= end) {
1004                if (state->state & exclusive_bits) {
1005                        *failed_start = state->start;
1006                        err = -EEXIST;
1007                        goto out;
1008                }
1009
1010                set_state_bits(tree, state, &bits, changeset);
1011                cache_state(state, cached_state);
1012                merge_state(tree, state);
1013                if (last_end == (u64)-1)
1014                        goto out;
1015                start = last_end + 1;
1016                state = next_state(state);
1017                if (start < end && state && state->start == start &&
1018                    !need_resched())
1019                        goto hit_next;
1020                goto search_again;
1021        }
1022
1023        /*
1024         *     | ---- desired range ---- |
1025         * | state |
1026         *   or
1027         * | ------------- state -------------- |
1028         *
1029         * We need to split the extent we found, and may flip bits on
1030         * second half.
1031         *
1032         * If the extent we found extends past our
1033         * range, we just split and search again.  It'll get split
1034         * again the next time though.
1035         *
1036         * If the extent we found is inside our range, we set the
1037         * desired bit on it.
1038         */
1039        if (state->start < start) {
1040                if (state->state & exclusive_bits) {
1041                        *failed_start = start;
1042                        err = -EEXIST;
1043                        goto out;
1044                }
1045
1046                prealloc = alloc_extent_state_atomic(prealloc);
1047                BUG_ON(!prealloc);
1048                err = split_state(tree, state, prealloc, start);
1049                if (err)
1050                        extent_io_tree_panic(tree, err);
1051
1052                prealloc = NULL;
1053                if (err)
1054                        goto out;
1055                if (state->end <= end) {
1056                        set_state_bits(tree, state, &bits, changeset);
1057                        cache_state(state, cached_state);
1058                        merge_state(tree, state);
1059                        if (last_end == (u64)-1)
1060                                goto out;
1061                        start = last_end + 1;
1062                        state = next_state(state);
1063                        if (start < end && state && state->start == start &&
1064                            !need_resched())
1065                                goto hit_next;
1066                }
1067                goto search_again;
1068        }
1069        /*
1070         * | ---- desired range ---- |
1071         *     | state | or               | state |
1072         *
1073         * There's a hole, we need to insert something in it and
1074         * ignore the extent we found.
1075         */
1076        if (state->start > start) {
1077                u64 this_end;
1078                if (end < last_start)
1079                        this_end = end;
1080                else
1081                        this_end = last_start - 1;
1082
1083                prealloc = alloc_extent_state_atomic(prealloc);
1084                BUG_ON(!prealloc);
1085
1086                /*
 1087                 * Avoid freeing 'prealloc' if it can be merged with
1088                 * the later extent.
1089                 */
1090                err = insert_state(tree, prealloc, start, this_end,
1091                                   NULL, NULL, &bits, changeset);
1092                if (err)
1093                        extent_io_tree_panic(tree, err);
1094
1095                cache_state(prealloc, cached_state);
1096                prealloc = NULL;
1097                start = this_end + 1;
1098                goto search_again;
1099        }
1100        /*
1101         * | ---- desired range ---- |
1102         *                        | state |
1103         * We need to split the extent, and set the bit
1104         * on the first half
1105         */
1106        if (state->start <= end && state->end > end) {
1107                if (state->state & exclusive_bits) {
1108                        *failed_start = start;
1109                        err = -EEXIST;
1110                        goto out;
1111                }
1112
1113                prealloc = alloc_extent_state_atomic(prealloc);
1114                BUG_ON(!prealloc);
1115                err = split_state(tree, state, prealloc, end + 1);
1116                if (err)
1117                        extent_io_tree_panic(tree, err);
1118
1119                set_state_bits(tree, prealloc, &bits, changeset);
1120                cache_state(prealloc, cached_state);
1121                merge_state(tree, prealloc);
1122                prealloc = NULL;
1123                goto out;
1124        }
1125
1126search_again:
1127        if (start > end)
1128                goto out;
1129        spin_unlock(&tree->lock);
1130        if (gfpflags_allow_blocking(mask))
1131                cond_resched();
1132        goto again;
1133
1134out:
1135        spin_unlock(&tree->lock);
1136        if (prealloc)
1137                free_extent_state(prealloc);
1138
1139        return err;
1140
1141}
1142
1143int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 1144                   unsigned bits, u64 *failed_start,
1145                   struct extent_state **cached_state, gfp_t mask)
1146{
1147        return __set_extent_bit(tree, start, end, bits, 0, failed_start,
1148                                cached_state, mask, NULL);
1149}
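
/*
 * Usage sketch (illustrative, the range variables are hypothetical): mark a
 * byte range dirty with no exclusive bits and no cached state:
 *
 *        set_extent_bit(tree, start, start + len - 1, EXTENT_DIRTY,
 *                       NULL, NULL, GFP_NOFS);
 *
 * Passing a non-NULL cached_state pointer instead lets a follow-up operation
 * on the same range skip the rbtree search.
 */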
1150
1151
1152/**
1153 * convert_extent_bit - convert all bits in a given range from one bit to
1154 *                      another
1155 * @tree:       the io tree to search
1156 * @start:      the start offset in bytes
1157 * @end:        the end offset in bytes (inclusive)
1158 * @bits:       the bits to set in this range
1159 * @clear_bits: the bits to clear in this range
1160 * @cached_state:       state that we're going to cache
1161 *
1162 * This will go through and set bits for the given range.  If any states exist
1163 * already in this range they are set with the given bit and cleared of the
1164 * clear_bits.  This is only meant to be used by things that are mergeable, ie
1165 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 1166 * boundary bits like EXTENT_LOCKED.
1167 *
1168 * All allocations are done with GFP_NOFS.
1169 */
1170int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1171                       unsigned bits, unsigned clear_bits,
1172                       struct extent_state **cached_state)
1173{
1174        struct extent_state *state;
1175        struct extent_state *prealloc = NULL;
1176        struct rb_node *node;
1177        struct rb_node **p;
1178        struct rb_node *parent;
1179        int err = 0;
1180        u64 last_start;
1181        u64 last_end;
1182        bool first_iteration = true;
1183
1184        btrfs_debug_check_extent_io_range(tree, start, end);
1185        trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
1186                                       clear_bits);
1187
1188again:
1189        if (!prealloc) {
1190                /*
1191                 * Best effort, don't worry if extent state allocation fails
1192                 * here for the first iteration. We might have a cached state
1193                 * that matches exactly the target range, in which case no
1194                 * extent state allocations are needed. We'll only know this
1195                 * after locking the tree.
1196                 */
1197                prealloc = alloc_extent_state(GFP_NOFS);
1198                if (!prealloc && !first_iteration)
1199                        return -ENOMEM;
1200        }
1201
1202        spin_lock(&tree->lock);
1203        if (cached_state && *cached_state) {
1204                state = *cached_state;
1205                if (state->start <= start && state->end > start &&
1206                    extent_state_in_tree(state)) {
1207                        node = &state->rb_node;
1208                        goto hit_next;
1209                }
1210        }
1211
1212        /*
1213         * this search will find all the extents that end after
1214         * our range starts.
1215         */
1216        node = tree_search_for_insert(tree, start, &p, &parent);
1217        if (!node) {
1218                prealloc = alloc_extent_state_atomic(prealloc);
1219                if (!prealloc) {
1220                        err = -ENOMEM;
1221                        goto out;
1222                }
1223                err = insert_state(tree, prealloc, start, end,
1224                                   &p, &parent, &bits, NULL);
1225                if (err)
1226                        extent_io_tree_panic(tree, err);
1227                cache_state(prealloc, cached_state);
1228                prealloc = NULL;
1229                goto out;
1230        }
1231        state = rb_entry(node, struct extent_state, rb_node);
1232hit_next:
1233        last_start = state->start;
1234        last_end = state->end;
1235
1236        /*
1237         * | ---- desired range ---- |
1238         * | state |
1239         *
1240         * Just lock what we found and keep going
1241         */
1242        if (state->start == start && state->end <= end) {
1243                set_state_bits(tree, state, &bits, NULL);
1244                cache_state(state, cached_state);
1245                state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
1246                if (last_end == (u64)-1)
1247                        goto out;
1248                start = last_end + 1;
1249                if (start < end && state && state->start == start &&
1250                    !need_resched())
1251                        goto hit_next;
1252                goto search_again;
1253        }
1254
1255        /*
1256         *     | ---- desired range ---- |
1257         * | state |
1258         *   or
1259         * | ------------- state -------------- |
1260         *
1261         * We need to split the extent we found, and may flip bits on
1262         * second half.
1263         *
1264         * If the extent we found extends past our
1265         * range, we just split and search again.  It'll get split
1266         * again the next time though.
1267         *
1268         * If the extent we found is inside our range, we set the
1269         * desired bit on it.
1270         */
1271        if (state->start < start) {
1272                prealloc = alloc_extent_state_atomic(prealloc);
1273                if (!prealloc) {
1274                        err = -ENOMEM;
1275                        goto out;
1276                }
1277                err = split_state(tree, state, prealloc, start);
1278                if (err)
1279                        extent_io_tree_panic(tree, err);
1280                prealloc = NULL;
1281                if (err)
1282                        goto out;
1283                if (state->end <= end) {
1284                        set_state_bits(tree, state, &bits, NULL);
1285                        cache_state(state, cached_state);
1286                        state = clear_state_bit(tree, state, &clear_bits, 0,
1287                                                NULL);
1288                        if (last_end == (u64)-1)
1289                                goto out;
1290                        start = last_end + 1;
1291                        if (start < end && state && state->start == start &&
1292                            !need_resched())
1293                                goto hit_next;
1294                }
1295                goto search_again;
1296        }
1297        /*
1298         * | ---- desired range ---- |
1299         *     | state | or               | state |
1300         *
1301         * There's a hole, we need to insert something in it and
1302         * ignore the extent we found.
1303         */
1304        if (state->start > start) {
1305                u64 this_end;
1306                if (end < last_start)
1307                        this_end = end;
1308                else
1309                        this_end = last_start - 1;
1310
1311                prealloc = alloc_extent_state_atomic(prealloc);
1312                if (!prealloc) {
1313                        err = -ENOMEM;
1314                        goto out;
1315                }
1316
1317                /*
 1318                 * Avoid freeing 'prealloc' if it can be merged with
1319                 * the later extent.
1320                 */
1321                err = insert_state(tree, prealloc, start, this_end,
1322                                   NULL, NULL, &bits, NULL);
1323                if (err)
1324                        extent_io_tree_panic(tree, err);
1325                cache_state(prealloc, cached_state);
1326                prealloc = NULL;
1327                start = this_end + 1;
1328                goto search_again;
1329        }
1330        /*
1331         * | ---- desired range ---- |
1332         *                        | state |
1333         * We need to split the extent, and set the bit
1334         * on the first half
1335         */
1336        if (state->start <= end && state->end > end) {
1337                prealloc = alloc_extent_state_atomic(prealloc);
1338                if (!prealloc) {
1339                        err = -ENOMEM;
1340                        goto out;
1341                }
1342
1343                err = split_state(tree, state, prealloc, end + 1);
1344                if (err)
1345                        extent_io_tree_panic(tree, err);
1346
1347                set_state_bits(tree, prealloc, &bits, NULL);
1348                cache_state(prealloc, cached_state);
1349                clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
1350                prealloc = NULL;
1351                goto out;
1352        }
1353
1354search_again:
1355        if (start > end)
1356                goto out;
1357        spin_unlock(&tree->lock);
1358        cond_resched();
1359        first_iteration = false;
1360        goto again;
1361
1362out:
1363        spin_unlock(&tree->lock);
1364        if (prealloc)
1365                free_extent_state(prealloc);
1366
1367        return err;
1368}
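
/*
 * Usage sketch (illustrative): switch a range from delalloc to dirty in one
 * pass, as described above, reusing a cached state from an earlier lookup:
 *
 *        convert_extent_bit(tree, start, end, EXTENT_DIRTY, EXTENT_DELALLOC,
 *                           &cached_state);
 */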
1369
1370/* wrappers around set/clear extent bit */
1371int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1372                           unsigned bits, struct extent_changeset *changeset)
1373{
1374        /*
1375         * We don't support EXTENT_LOCKED yet, as current changeset will
1376         * record any bits changed, so for EXTENT_LOCKED case, it will
1377         * either fail with -EEXIST or changeset will record the whole
1378         * range.
1379         */
1380        BUG_ON(bits & EXTENT_LOCKED);
1381
1382        return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
1383                                changeset);
1384}
1385
1386int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
1387                           unsigned bits)
1388{
1389        return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
1390                                GFP_NOWAIT, NULL);
1391}
1392
1393int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1394                     unsigned bits, int wake, int delete,
1395                     struct extent_state **cached)
1396{
1397        return __clear_extent_bit(tree, start, end, bits, wake, delete,
1398                                  cached, GFP_NOFS, NULL);
1399}
1400
1401int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1402                unsigned bits, struct extent_changeset *changeset)
1403{
1404        /*
1405         * Don't support EXTENT_LOCKED case, same reason as
1406         * set_record_extent_bits().
1407         */
1408        BUG_ON(bits & EXTENT_LOCKED);
1409
1410        return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
1411                                  changeset);
1412}
1413
1414/*
 1415 * Either insert or lock the state struct between start and end.  Waits
 1416 * until any conflicting EXTENT_LOCKED range has been released.
1417 */
1418int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1419                     struct extent_state **cached_state)
1420{
1421        int err;
1422        u64 failed_start;
1423
1424        while (1) {
1425                err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
1426                                       EXTENT_LOCKED, &failed_start,
1427                                       cached_state, GFP_NOFS, NULL);
1428                if (err == -EEXIST) {
1429                        wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1430                        start = failed_start;
1431                } else
1432                        break;
1433                WARN_ON(start > end);
1434        }
1435        return err;
1436}
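
/*
 * Usage sketch (illustrative, assuming the unlock_extent_cached() helper from
 * the header): take the extent lock around some work on [start, end]:
 *
 *        struct extent_state *cached = NULL;
 *
 *        lock_extent_bits(tree, start, end, &cached);
 *        ... do the locked work ...
 *        unlock_extent_cached(tree, start, end, &cached);
 */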
1437
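/*
 * Try to lock [start, end]; unlike lock_extent_bits() this does not wait.
 * Returns 1 if the whole range was locked, 0 if some part was already locked,
 * in which case whatever was locked before hitting the conflict is unlocked
 * again.
 */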
1438int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1439{
1440        int err;
1441        u64 failed_start;
1442
1443        err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1444                               &failed_start, NULL, GFP_NOFS, NULL);
1445        if (err == -EEXIST) {
1446                if (failed_start > start)
1447                        clear_extent_bit(tree, start, failed_start - 1,
1448                                         EXTENT_LOCKED, 1, 0, NULL);
1449                return 0;
1450        }
1451        return 1;
1452}
1453
1454void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
1455{
1456        unsigned long index = start >> PAGE_SHIFT;
1457        unsigned long end_index = end >> PAGE_SHIFT;
1458        struct page *page;
1459
1460        while (index <= end_index) {
1461                page = find_get_page(inode->i_mapping, index);
1462                BUG_ON(!page); /* Pages should be in the extent_io_tree */
1463                clear_page_dirty_for_io(page);
1464                put_page(page);
1465                index++;
1466        }
1467}
1468
1469void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1470{
1471        unsigned long index = start >> PAGE_SHIFT;
1472        unsigned long end_index = end >> PAGE_SHIFT;
1473        struct page *page;
1474
1475        while (index <= end_index) {
1476                page = find_get_page(inode->i_mapping, index);
1477                BUG_ON(!page); /* Pages should be in the extent_io_tree */
1478                __set_page_dirty_nobuffers(page);
1479                account_page_redirty(page);
1480                put_page(page);
1481                index++;
1482        }
1483}
1484
1485/* find the first state struct with 'bits' set after 'start', and
1486 * return it.  tree->lock must be held.  NULL will be returned if
1487 * nothing was found after 'start'.
1488 */
1489static struct extent_state *
1490find_first_extent_bit_state(struct extent_io_tree *tree,
1491                            u64 start, unsigned bits)
1492{
1493        struct rb_node *node;
1494        struct extent_state *state;
1495
1496        /*
1497         * this search will find all the extents that end after
1498         * our range starts.
1499         */
1500        node = tree_search(tree, start);
1501        if (!node)
1502                goto out;
1503
1504        while (1) {
1505                state = rb_entry(node, struct extent_state, rb_node);
1506                if (state->end >= start && (state->state & bits))
1507                        return state;
1508
1509                node = rb_next(node);
1510                if (!node)
1511                        break;
1512        }
1513out:
1514        return NULL;
1515}
1516
1517/*
1518 * find the first offset in the io tree with 'bits' set. zero is
1519 * returned if we find something, and *start_ret and *end_ret are
1520 * set to reflect the state struct that was found.
1521 *
1522 * If nothing was found, 1 is returned.
1523 */
1524int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1525                          u64 *start_ret, u64 *end_ret, unsigned bits,
1526                          struct extent_state **cached_state)
1527{
1528        struct extent_state *state;
1529        int ret = 1;
1530
1531        spin_lock(&tree->lock);
1532        if (cached_state && *cached_state) {
1533                state = *cached_state;
1534                if (state->end == start - 1 && extent_state_in_tree(state)) {
1535                        while ((state = next_state(state)) != NULL) {
1536                                if (state->state & bits)
1537                                        goto got_it;
1538                        }
1539                        free_extent_state(*cached_state);
1540                        *cached_state = NULL;
1541                        goto out;
1542                }
1543                free_extent_state(*cached_state);
1544                *cached_state = NULL;
1545        }
1546
1547        state = find_first_extent_bit_state(tree, start, bits);
1548got_it:
1549        if (state) {
1550                cache_state_if_flags(state, cached_state, 0);
1551                *start_ret = state->start;
1552                *end_ret = state->end;
1553                ret = 0;
1554        }
1555out:
1556        spin_unlock(&tree->lock);
1557        return ret;
1558}
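
/*
 * A minimal sketch, assuming a caller that walks every EXTENT_DIRTY range in
 * the tree (0 is returned while ranges are found, 1 once nothing is left):
 *
 *	u64 cur = 0, found_start, found_end;
 *
 *	while (!find_first_extent_bit(tree, cur, &found_start, &found_end,
 *				      EXTENT_DIRTY, NULL)) {
 *		... process [found_start, found_end] ...
 *		cur = found_end + 1;
 *	}
 */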
1559
1560/**
1561 * find_first_clear_extent_bit - find the first range that has @bits not set.
1562 * This range could start before @start.
1563 *
1564 * @tree - the tree to search
1565 * @start - the offset at/after which the found extent should start
1566 * @start_ret - records the beginning of the range
1567 * @end_ret - records the end of the range (inclusive)
1568 * @bits - the set of bits which must be unset
1569 *
1570 * Since an unallocated range is also considered one which doesn't have the
1571 * bits set, it's possible that @end_ret contains -1. This happens when the
1572 * range spans (last_range_end, end of device]. In that case it's up to the
1573 * caller to trim @end_ret to the appropriate size.
1574 */
1575void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
1576                                 u64 *start_ret, u64 *end_ret, unsigned bits)
1577{
1578        struct extent_state *state;
1579        struct rb_node *node, *prev = NULL, *next;
1580
1581        spin_lock(&tree->lock);
1582
1583        /* Find first extent with bits cleared */
1584        while (1) {
1585                node = __etree_search(tree, start, &next, &prev, NULL, NULL);
1586                if (!node) {
1587                        node = next;
1588                        if (!node) {
1589                                /*
1590                                 * We are past the last allocated chunk,
1591                                 * set start at the end of the last extent. The
1592                                 * device alloc tree should never be empty so
1593                                 * prev is always set.
1594                                 */
1595                                ASSERT(prev);
1596                                state = rb_entry(prev, struct extent_state, rb_node);
1597                                *start_ret = state->end + 1;
1598                                *end_ret = -1;
1599                                goto out;
1600                        }
1601                }
1602                /*
1603                 * At this point 'node' either contains 'start' or start is
1604                 * before 'node'
1605                 */
1606                state = rb_entry(node, struct extent_state, rb_node);
1607
1608                if (in_range(start, state->start, state->end - state->start + 1)) {
1609                        if (state->state & bits) {
1610                                /*
1611                                 * |--range with bits sets--|
1612                                 *    |
1613                                 *    start
1614                                 */
1615                                start = state->end + 1;
1616                        } else {
1617                                /*
1618                                 * 'start' falls within a range that doesn't
1619                                 * have the bits set, so take its start as
1620                                 * the beginning of the desired range
1621                                 *
1622                                 * |--range with bits cleared----|
1623                                 *      |
1624                                 *      start
1625                                 */
1626                                *start_ret = state->start;
1627                                break;
1628                        }
1629                } else {
1630                        /*
1631                         * |---prev range---|---hole/unset---|---node range---|
1632                         *                          |
1633                         *                        start
1634                         *
1635                         *                        or
1636                         *
1637                         * |---hole/unset--||--first node--|
1638                         * 0   |
1639                         *    start
1640                         */
1641                        if (prev) {
1642                                state = rb_entry(prev, struct extent_state,
1643                                                 rb_node);
1644                                *start_ret = state->end + 1;
1645                        } else {
1646                                *start_ret = 0;
1647                        }
1648                        break;
1649                }
1650        }
1651
1652        /*
1653         * Find the longest stretch from start until an entry which has the
1654         * bits set
1655         */
1656        while (1) {
1657                state = rb_entry(node, struct extent_state, rb_node);
1658                if (state->end >= start && !(state->state & bits)) {
1659                        *end_ret = state->end;
1660                } else {
1661                        *end_ret = state->start - 1;
1662                        break;
1663                }
1664
1665                node = rb_next(node);
1666                if (!node)
1667                        break;
1668        }
1669out:
1670        spin_unlock(&tree->lock);
1671}
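
/*
 * A minimal sketch, assuming a caller looking for a hole in a device
 * allocation tree; 'dev_size' is a hypothetical upper bound used to trim the
 * open-ended result described above:
 *
 *	u64 hole_start, hole_end;
 *
 *	find_first_clear_extent_bit(tree, start, &hole_start, &hole_end, bits);
 *	if (hole_end == (u64)-1)
 *		hole_end = dev_size - 1;
 */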
1672
1673/*
1674 * find a contiguous range of bytes in the file marked as delalloc, not
1675 * more than 'max_bytes'.  start and end are used to return the range.
1676 *
1677 * true is returned if we find something, false if nothing was in the tree.
1678 */
1679static noinline bool find_delalloc_range(struct extent_io_tree *tree,
1680                                        u64 *start, u64 *end, u64 max_bytes,
1681                                        struct extent_state **cached_state)
1682{
1683        struct rb_node *node;
1684        struct extent_state *state;
1685        u64 cur_start = *start;
1686        bool found = false;
1687        u64 total_bytes = 0;
1688
1689        spin_lock(&tree->lock);
1690
1691        /*
1692         * this search will find all the extents that end after
1693         * our range starts.
1694         */
1695        node = tree_search(tree, cur_start);
1696        if (!node) {
1697                *end = (u64)-1;
1698                goto out;
1699        }
1700
1701        while (1) {
1702                state = rb_entry(node, struct extent_state, rb_node);
1703                if (found && (state->start != cur_start ||
1704                              (state->state & EXTENT_BOUNDARY))) {
1705                        goto out;
1706                }
1707                if (!(state->state & EXTENT_DELALLOC)) {
1708                        if (!found)
1709                                *end = state->end;
1710                        goto out;
1711                }
1712                if (!found) {
1713                        *start = state->start;
1714                        *cached_state = state;
1715                        refcount_inc(&state->refs);
1716                }
1717                found = true;
1718                *end = state->end;
1719                cur_start = state->end + 1;
1720                node = rb_next(node);
1721                total_bytes += state->end - state->start + 1;
1722                if (total_bytes >= max_bytes)
1723                        break;
1724                if (!node)
1725                        break;
1726        }
1727out:
1728        spin_unlock(&tree->lock);
1729        return found;
1730}
1731
1732static int __process_pages_contig(struct address_space *mapping,
1733                                  struct page *locked_page,
1734                                  pgoff_t start_index, pgoff_t end_index,
1735                                  unsigned long page_ops, pgoff_t *index_ret);
1736
1737static noinline void __unlock_for_delalloc(struct inode *inode,
1738                                           struct page *locked_page,
1739                                           u64 start, u64 end)
1740{
1741        unsigned long index = start >> PAGE_SHIFT;
1742        unsigned long end_index = end >> PAGE_SHIFT;
1743
1744        ASSERT(locked_page);
1745        if (index == locked_page->index && end_index == index)
1746                return;
1747
1748        __process_pages_contig(inode->i_mapping, locked_page, index, end_index,
1749                               PAGE_UNLOCK, NULL);
1750}
1751
1752static noinline int lock_delalloc_pages(struct inode *inode,
1753                                        struct page *locked_page,
1754                                        u64 delalloc_start,
1755                                        u64 delalloc_end)
1756{
1757        unsigned long index = delalloc_start >> PAGE_SHIFT;
1758        unsigned long index_ret = index;
1759        unsigned long end_index = delalloc_end >> PAGE_SHIFT;
1760        int ret;
1761
1762        ASSERT(locked_page);
1763        if (index == locked_page->index && index == end_index)
1764                return 0;
1765
1766        ret = __process_pages_contig(inode->i_mapping, locked_page, index,
1767                                     end_index, PAGE_LOCK, &index_ret);
1768        if (ret == -EAGAIN)
1769                __unlock_for_delalloc(inode, locked_page, delalloc_start,
1770                                      (u64)index_ret << PAGE_SHIFT);
1771        return ret;
1772}
1773
1774/*
1775 * Find and lock a contiguous range of bytes in the file marked as delalloc, no
1776 * more than @max_bytes.  @start and @end are used to return the range.
1777 *
1778 * Return: true if we find something
1779 *         false if nothing was in the tree
1780 */
1781EXPORT_FOR_TESTS
1782noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
1783                                    struct page *locked_page, u64 *start,
1784                                    u64 *end)
1785{
1786        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1787        u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
1788        u64 delalloc_start;
1789        u64 delalloc_end;
1790        bool found;
1791        struct extent_state *cached_state = NULL;
1792        int ret;
1793        int loops = 0;
1794
1795again:
1796        /* step one, find a bunch of delalloc bytes starting at start */
1797        delalloc_start = *start;
1798        delalloc_end = 0;
1799        found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1800                                    max_bytes, &cached_state);
1801        if (!found || delalloc_end <= *start) {
1802                *start = delalloc_start;
1803                *end = delalloc_end;
1804                free_extent_state(cached_state);
1805                return false;
1806        }
1807
1808        /*
1809         * start comes from the offset of locked_page.  We have to lock
1810         * pages in order, so we can't process delalloc bytes before
1811         * locked_page
1812         */
1813        if (delalloc_start < *start)
1814                delalloc_start = *start;
1815
1816        /*
1817         * make sure to limit the number of pages we try to lock down
1818         */
1819        if (delalloc_end + 1 - delalloc_start > max_bytes)
1820                delalloc_end = delalloc_start + max_bytes - 1;
1821
1822        /* step two, lock all the pages after the page that has start */
1823        ret = lock_delalloc_pages(inode, locked_page,
1824                                  delalloc_start, delalloc_end);
1825        ASSERT(!ret || ret == -EAGAIN);
1826        if (ret == -EAGAIN) {
1827                /* some of the pages are gone, let's avoid looping by
1828                 * shortening the size of the delalloc range we're searching
1829                 */
1830                free_extent_state(cached_state);
1831                cached_state = NULL;
1832                if (!loops) {
1833                        max_bytes = PAGE_SIZE;
1834                        loops = 1;
1835                        goto again;
1836                } else {
1837                        found = false;
1838                        goto out_failed;
1839                }
1840        }
1841
1842        /* step three, lock the state bits for the whole range */
1843        lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
1844
1845        /* then test to make sure it is all still delalloc */
1846        ret = test_range_bit(tree, delalloc_start, delalloc_end,
1847                             EXTENT_DELALLOC, 1, cached_state);
1848        if (!ret) {
1849                unlock_extent_cached(tree, delalloc_start, delalloc_end,
1850                                     &cached_state);
1851                __unlock_for_delalloc(inode, locked_page,
1852                              delalloc_start, delalloc_end);
1853                cond_resched();
1854                goto again;
1855        }
1856        free_extent_state(cached_state);
1857        *start = delalloc_start;
1858        *end = delalloc_end;
1859out_failed:
1860        return found;
1861}
1862
1863static int __process_pages_contig(struct address_space *mapping,
1864                                  struct page *locked_page,
1865                                  pgoff_t start_index, pgoff_t end_index,
1866                                  unsigned long page_ops, pgoff_t *index_ret)
1867{
1868        unsigned long nr_pages = end_index - start_index + 1;
1869        unsigned long pages_locked = 0;
1870        pgoff_t index = start_index;
1871        struct page *pages[16];
1872        unsigned ret;
1873        int err = 0;
1874        int i;
1875
1876        if (page_ops & PAGE_LOCK) {
1877                ASSERT(page_ops == PAGE_LOCK);
1878                ASSERT(index_ret && *index_ret == start_index);
1879        }
1880
1881        if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
1882                mapping_set_error(mapping, -EIO);
1883
1884        while (nr_pages > 0) {
1885                ret = find_get_pages_contig(mapping, index,
1886                                     min_t(unsigned long,
1887                                     nr_pages, ARRAY_SIZE(pages)), pages);
1888                if (ret == 0) {
1889                        /*
1890                         * Only if we're going to lock these pages,
1891                         * can we find nothing at @index.
1892                         */
1893                        ASSERT(page_ops & PAGE_LOCK);
1894                        err = -EAGAIN;
1895                        goto out;
1896                }
1897
1898                for (i = 0; i < ret; i++) {
1899                        if (page_ops & PAGE_SET_PRIVATE2)
1900                                SetPagePrivate2(pages[i]);
1901
1902                        if (pages[i] == locked_page) {
1903                                put_page(pages[i]);
1904                                pages_locked++;
1905                                continue;
1906                        }
1907                        if (page_ops & PAGE_CLEAR_DIRTY)
1908                                clear_page_dirty_for_io(pages[i]);
1909                        if (page_ops & PAGE_SET_WRITEBACK)
1910                                set_page_writeback(pages[i]);
1911                        if (page_ops & PAGE_SET_ERROR)
1912                                SetPageError(pages[i]);
1913                        if (page_ops & PAGE_END_WRITEBACK)
1914                                end_page_writeback(pages[i]);
1915                        if (page_ops & PAGE_UNLOCK)
1916                                unlock_page(pages[i]);
1917                        if (page_ops & PAGE_LOCK) {
1918                                lock_page(pages[i]);
1919                                if (!PageDirty(pages[i]) ||
1920                                    pages[i]->mapping != mapping) {
1921                                        unlock_page(pages[i]);
1922                                        put_page(pages[i]);
1923                                        err = -EAGAIN;
1924                                        goto out;
1925                                }
1926                        }
1927                        put_page(pages[i]);
1928                        pages_locked++;
1929                }
1930                nr_pages -= ret;
1931                index += ret;
1932                cond_resched();
1933        }
1934out:
1935        if (err && index_ret)
1936                *index_ret = start_index + pages_locked - 1;
1937        return err;
1938}
1939
1940void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
1941                                 u64 delalloc_end, struct page *locked_page,
1942                                 unsigned clear_bits,
1943                                 unsigned long page_ops)
1944{
1945        clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
1946                         NULL);
1947
1948        __process_pages_contig(inode->i_mapping, locked_page,
1949                               start >> PAGE_SHIFT, end >> PAGE_SHIFT,
1950                               page_ops, NULL);
1951}
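
/*
 * A minimal sketch, assuming an error path that tears down a delalloc range
 * by clearing the relevant state bits and releasing the pages in one call
 * (the exact bit and page-op combination is the caller's choice):
 *
 *	extent_clear_unlock_delalloc(inode, start, end, end, locked_page,
 *				     EXTENT_LOCKED | EXTENT_DELALLOC,
 *				     PAGE_UNLOCK | PAGE_END_WRITEBACK);
 */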
1952
1953/*
1954 * count the number of bytes in the tree that have the given bit(s)
1955 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1956 * cached.  The total number of bytes found is returned.
1957 */
1958u64 count_range_bits(struct extent_io_tree *tree,
1959                     u64 *start, u64 search_end, u64 max_bytes,
1960                     unsigned bits, int contig)
1961{
1962        struct rb_node *node;
1963        struct extent_state *state;
1964        u64 cur_start = *start;
1965        u64 total_bytes = 0;
1966        u64 last = 0;
1967        int found = 0;
1968
1969        if (WARN_ON(search_end <= cur_start))
1970                return 0;
1971
1972        spin_lock(&tree->lock);
1973        if (cur_start == 0 && bits == EXTENT_DIRTY) {
1974                total_bytes = tree->dirty_bytes;
1975                goto out;
1976        }
1977        /*
1978         * this search will find all the extents that end after
1979         * our range starts.
1980         */
1981        node = tree_search(tree, cur_start);
1982        if (!node)
1983                goto out;
1984
1985        while (1) {
1986                state = rb_entry(node, struct extent_state, rb_node);
1987                if (state->start > search_end)
1988                        break;
1989                if (contig && found && state->start > last + 1)
1990                        break;
1991                if (state->end >= cur_start && (state->state & bits) == bits) {
1992                        total_bytes += min(search_end, state->end) + 1 -
1993                                       max(cur_start, state->start);
1994                        if (total_bytes >= max_bytes)
1995                                break;
1996                        if (!found) {
1997                                *start = max(cur_start, state->start);
1998                                found = 1;
1999                        }
2000                        last = state->end;
2001                } else if (contig && found) {
2002                        break;
2003                }
2004                node = rb_next(node);
2005                if (!node)
2006                        break;
2007        }
2008out:
2009        spin_unlock(&tree->lock);
2010        return total_bytes;
2011}
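
/*
 * A minimal sketch, assuming a caller that wants the number of contiguous
 * dirty bytes at or after 'range_start' ('range_start' and 'range_end' are
 * hypothetical bounds supplied by the caller):
 *
 *	u64 found_start = range_start;
 *	u64 bytes;
 *
 *	bytes = count_range_bits(tree, &found_start, range_end, (u64)-1,
 *				 EXTENT_DIRTY, 1);
 */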
2012
2013/*
2014 * set the failure record for a given byte offset in the tree.  If there isn't
2015 * an extent_state starting at 'start', -ENOENT is returned.
2016 */
2017static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start,
2018                struct io_failure_record *failrec)
2019{
2020        struct rb_node *node;
2021        struct extent_state *state;
2022        int ret = 0;
2023
2024        spin_lock(&tree->lock);
2025        /*
2026         * this search will find all the extents that end after
2027         * our range starts.
2028         */
2029        node = tree_search(tree, start);
2030        if (!node) {
2031                ret = -ENOENT;
2032                goto out;
2033        }
2034        state = rb_entry(node, struct extent_state, rb_node);
2035        if (state->start != start) {
2036                ret = -ENOENT;
2037                goto out;
2038        }
2039        state->failrec = failrec;
2040out:
2041        spin_unlock(&tree->lock);
2042        return ret;
2043}
2044
2045static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start,
2046                struct io_failure_record **failrec)
2047{
2048        struct rb_node *node;
2049        struct extent_state *state;
2050        int ret = 0;
2051
2052        spin_lock(&tree->lock);
2053        /*
2054         * this search will find all the extents that end after
2055         * our range starts.
2056         */
2057        node = tree_search(tree, start);
2058        if (!node) {
2059                ret = -ENOENT;
2060                goto out;
2061        }
2062        state = rb_entry(node, struct extent_state, rb_node);
2063        if (state->start != start) {
2064                ret = -ENOENT;
2065                goto out;
2066        }
2067        *failrec = state->failrec;
2068out:
2069        spin_unlock(&tree->lock);
2070        return ret;
2071}
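
/*
 * A minimal sketch of the set/get pair, assuming a failure record keyed by
 * the start offset of its extent_state, the same pattern
 * btrfs_get_io_failure_record() uses below:
 *
 *	set_state_failrec(failure_tree, failrec->start, failrec);
 *	...
 *	get_state_failrec(failure_tree, start, &failrec);
 */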
2072
2073/*
2074 * searches a range in the state tree for a given mask.
2075 * If 'filled' == 1, this returns 1 only if the whole range is covered by
2076 * extents that have the bits set.  Otherwise, 1 is returned if any bit in
2077 * the range is found set.
2078 */
2079int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
2080                   unsigned bits, int filled, struct extent_state *cached)
2081{
2082        struct extent_state *state = NULL;
2083        struct rb_node *node;
2084        int bitset = 0;
2085
2086        spin_lock(&tree->lock);
2087        if (cached && extent_state_in_tree(cached) && cached->start <= start &&
2088            cached->end > start)
2089                node = &cached->rb_node;
2090        else
2091                node = tree_search(tree, start);
2092        while (node && start <= end) {
2093                state = rb_entry(node, struct extent_state, rb_node);
2094
2095                if (filled && state->start > start) {
2096                        bitset = 0;
2097                        break;
2098                }
2099
2100                if (state->start > end)
2101                        break;
2102
2103                if (state->state & bits) {
2104                        bitset = 1;
2105                        if (!filled)
2106                                break;
2107                } else if (filled) {
2108                        bitset = 0;
2109                        break;
2110                }
2111
2112                if (state->end == (u64)-1)
2113                        break;
2114
2115                start = state->end + 1;
2116                if (start > end)
2117                        break;
2118                node = rb_next(node);
2119                if (!node) {
2120                        if (filled)
2121                                bitset = 0;
2122                        break;
2123                }
2124        }
2125        spin_unlock(&tree->lock);
2126        return bitset;
2127}
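
/*
 * A minimal sketch, assuming a caller verifying that an entire range is still
 * delalloc (filled == 1), as find_lock_delalloc_range() does above:
 *
 *	if (test_range_bit(tree, start, end, EXTENT_DELALLOC, 1, cached))
 *		... every byte in [start, end] has EXTENT_DELALLOC set ...
 */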
2128
2129/*
2130 * helper function to set a given page up to date if all the
2131 * extents in the tree for that page are up to date
2132 */
2133static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
2134{
2135        u64 start = page_offset(page);
2136        u64 end = start + PAGE_SIZE - 1;
2137        if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
2138                SetPageUptodate(page);
2139}
2140
2141int free_io_failure(struct extent_io_tree *failure_tree,
2142                    struct extent_io_tree *io_tree,
2143                    struct io_failure_record *rec)
2144{
2145        int ret;
2146        int err = 0;
2147
2148        set_state_failrec(failure_tree, rec->start, NULL);
2149        ret = clear_extent_bits(failure_tree, rec->start,
2150                                rec->start + rec->len - 1,
2151                                EXTENT_LOCKED | EXTENT_DIRTY);
2152        if (ret)
2153                err = ret;
2154
2155        ret = clear_extent_bits(io_tree, rec->start,
2156                                rec->start + rec->len - 1,
2157                                EXTENT_DAMAGED);
2158        if (ret && !err)
2159                err = ret;
2160
2161        kfree(rec);
2162        return err;
2163}
2164
2165/*
2166 * this bypasses the standard btrfs submit functions deliberately, as
2167 * the standard behavior is to write all copies in a raid setup. here we only
2168 * want to write the one bad copy. so we do the mapping for ourselves and issue
2169 * submit_bio directly.
2170 * to avoid any synchronization issues, wait for the data after writing, which
2171 * actually prevents the read that triggered the error from finishing.
2172 * currently, there can be no more than two copies of every data bit. thus,
2173 * exactly one rewrite is required.
2174 */
2175int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
2176                      u64 length, u64 logical, struct page *page,
2177                      unsigned int pg_offset, int mirror_num)
2178{
2179        struct bio *bio;
2180        struct btrfs_device *dev;
2181        u64 map_length = 0;
2182        u64 sector;
2183        struct btrfs_bio *bbio = NULL;
2184        int ret;
2185
2186        ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
2187        BUG_ON(!mirror_num);
2188
2189        bio = btrfs_io_bio_alloc(1);
2190        bio->bi_iter.bi_size = 0;
2191        map_length = length;
2192
2193        /*
2194         * Avoid races with device replace and make sure our bbio has devices
2195         * associated to its stripes that don't go away while we are doing the
2196         * read repair operation.
2197         */
2198        btrfs_bio_counter_inc_blocked(fs_info);
2199        if (btrfs_is_parity_mirror(fs_info, logical, length)) {
2200                /*
2201                 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
2202                 * to update all raid stripes, but here we just want to correct the
2203                 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
2204                 * stripe's dev and sector.
2205                 */
2206                ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
2207                                      &map_length, &bbio, 0);
2208                if (ret) {
2209                        btrfs_bio_counter_dec(fs_info);
2210                        bio_put(bio);
2211                        return -EIO;
2212                }
2213                ASSERT(bbio->mirror_num == 1);
2214        } else {
2215                ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
2216                                      &map_length, &bbio, mirror_num);
2217                if (ret) {
2218                        btrfs_bio_counter_dec(fs_info);
2219                        bio_put(bio);
2220                        return -EIO;
2221                }
2222                BUG_ON(mirror_num != bbio->mirror_num);
2223        }
2224
2225        sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
2226        bio->bi_iter.bi_sector = sector;
2227        dev = bbio->stripes[bbio->mirror_num - 1].dev;
2228        btrfs_put_bbio(bbio);
2229        if (!dev || !dev->bdev ||
2230            !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
2231                btrfs_bio_counter_dec(fs_info);
2232                bio_put(bio);
2233                return -EIO;
2234        }
2235        bio_set_dev(bio, dev->bdev);
2236        bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
2237        bio_add_page(bio, page, length, pg_offset);
2238
2239        if (btrfsic_submit_bio_wait(bio)) {
2240                /* try to remap that extent elsewhere? */
2241                btrfs_bio_counter_dec(fs_info);
2242                bio_put(bio);
2243                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
2244                return -EIO;
2245        }
2246
2247        btrfs_info_rl_in_rcu(fs_info,
2248                "read error corrected: ino %llu off %llu (dev %s sector %llu)",
2249                                  ino, start,
2250                                  rcu_str_deref(dev->name), sector);
2251        btrfs_bio_counter_dec(fs_info);
2252        bio_put(bio);
2253        return 0;
2254}
2255
2256int btrfs_repair_eb_io_failure(struct extent_buffer *eb, int mirror_num)
2257{
2258        struct btrfs_fs_info *fs_info = eb->fs_info;
2259        u64 start = eb->start;
2260        int i, num_pages = num_extent_pages(eb);
2261        int ret = 0;
2262
2263        if (sb_rdonly(fs_info->sb))
2264                return -EROFS;
2265
2266        for (i = 0; i < num_pages; i++) {
2267                struct page *p = eb->pages[i];
2268
2269                ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
2270                                        start - page_offset(p), mirror_num);
2271                if (ret)
2272                        break;
2273                start += PAGE_SIZE;
2274        }
2275
2276        return ret;
2277}
2278
2279/*
2280 * each time an IO finishes, we do a fast check in the IO failure tree
2281 * to see if we need to process or clean up an io_failure_record
2282 */
2283int clean_io_failure(struct btrfs_fs_info *fs_info,
2284                     struct extent_io_tree *failure_tree,
2285                     struct extent_io_tree *io_tree, u64 start,
2286                     struct page *page, u64 ino, unsigned int pg_offset)
2287{
2288        u64 private;
2289        struct io_failure_record *failrec;
2290        struct extent_state *state;
2291        int num_copies;
2292        int ret;
2293
2294        private = 0;
2295        ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
2296                               EXTENT_DIRTY, 0);
2297        if (!ret)
2298                return 0;
2299
2300        ret = get_state_failrec(failure_tree, start, &failrec);
2301        if (ret)
2302                return 0;
2303
2304        BUG_ON(!failrec->this_mirror);
2305
2306        if (failrec->in_validation) {
2307                /* there was no real error, just free the record */
2308                btrfs_debug(fs_info,
2309                        "clean_io_failure: freeing dummy error at %llu",
2310                        failrec->start);
2311                goto out;
2312        }
2313        if (sb_rdonly(fs_info->sb))
2314                goto out;
2315
2316        spin_lock(&io_tree->lock);
2317        state = find_first_extent_bit_state(io_tree,
2318                                            failrec->start,
2319                                            EXTENT_LOCKED);
2320        spin_unlock(&io_tree->lock);
2321
2322        if (state && state->start <= failrec->start &&
2323            state->end >= failrec->start + failrec->len - 1) {
2324                num_copies = btrfs_num_copies(fs_info, failrec->logical,
2325                                              failrec->len);
2326                if (num_copies > 1)  {
2327                        repair_io_failure(fs_info, ino, start, failrec->len,
2328                                          failrec->logical, page, pg_offset,
2329                                          failrec->failed_mirror);
2330                }
2331        }
2332
2333out:
2334        free_io_failure(failure_tree, io_tree, failrec);
2335
2336        return 0;
2337}
2338
2339/*
2340 * Can be called when:
2341 * - holding the extent lock
2342 * - under an ordered extent
2343 * - the inode is being freed
2344 */
2345void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
2346{
2347        struct extent_io_tree *failure_tree = &inode->io_failure_tree;
2348        struct io_failure_record *failrec;
2349        struct extent_state *state, *next;
2350
2351        if (RB_EMPTY_ROOT(&failure_tree->state))
2352                return;
2353
2354        spin_lock(&failure_tree->lock);
2355        state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2356        while (state) {
2357                if (state->start > end)
2358                        break;
2359
2360                ASSERT(state->end <= end);
2361
2362                next = next_state(state);
2363
2364                failrec = state->failrec;
2365                free_extent_state(state);
2366                kfree(failrec);
2367
2368                state = next;
2369        }
2370        spin_unlock(&failure_tree->lock);
2371}
2372
2373int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
2374                struct io_failure_record **failrec_ret)
2375{
2376        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2377        struct io_failure_record *failrec;
2378        struct extent_map *em;
2379        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2380        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2381        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2382        int ret;
2383        u64 logical;
2384
2385        ret = get_state_failrec(failure_tree, start, &failrec);
2386        if (ret) {
2387                failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2388                if (!failrec)
2389                        return -ENOMEM;
2390
2391                failrec->start = start;
2392                failrec->len = end - start + 1;
2393                failrec->this_mirror = 0;
2394                failrec->bio_flags = 0;
2395                failrec->in_validation = 0;
2396
2397                read_lock(&em_tree->lock);
2398                em = lookup_extent_mapping(em_tree, start, failrec->len);
2399                if (!em) {
2400                        read_unlock(&em_tree->lock);
2401                        kfree(failrec);
2402                        return -EIO;
2403                }
2404
2405                if (em->start > start || em->start + em->len <= start) {
2406                        free_extent_map(em);
2407                        em = NULL;
2408                }
2409                read_unlock(&em_tree->lock);
2410                if (!em) {
2411                        kfree(failrec);
2412                        return -EIO;
2413                }
2414
2415                logical = start - em->start;
2416                logical = em->block_start + logical;
2417                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2418                        logical = em->block_start;
2419                        failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2420                        extent_set_compress_type(&failrec->bio_flags,
2421                                                 em->compress_type);
2422                }
2423
2424                btrfs_debug(fs_info,
2425                        "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
2426                        logical, start, failrec->len);
2427
2428                failrec->logical = logical;
2429                free_extent_map(em);
2430
2431                /* set the bits in the private failure tree */
2432                ret = set_extent_bits(failure_tree, start, end,
2433                                        EXTENT_LOCKED | EXTENT_DIRTY);
2434                if (ret >= 0)
2435                        ret = set_state_failrec(failure_tree, start, failrec);
2436                /* set the bits in the inode's tree */
2437                if (ret >= 0)
2438                        ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
2439                if (ret < 0) {
2440                        kfree(failrec);
2441                        return ret;
2442                }
2443        } else {
2444                btrfs_debug(fs_info,
2445                        "Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
2446                        failrec->logical, failrec->start, failrec->len,
2447                        failrec->in_validation);
2448                /*
2449                 * when data can be on disk more than twice, add to failrec here
2450                 * (e.g. with a list for failed_mirror) to make
2451                 * clean_io_failure() clean all those errors at once.
2452                 */
2453        }
2454
2455        *failrec_ret = failrec;
2456
2457        return 0;
2458}
2459
2460bool btrfs_check_repairable(struct inode *inode, unsigned failed_bio_pages,
2461                           struct io_failure_record *failrec, int failed_mirror)
2462{
2463        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2464        int num_copies;
2465
2466        num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
2467        if (num_copies == 1) {
2468                /*
2469                 * we only have a single copy of the data, so don't bother with
2470                 * all the retry and error correction code that follows. no
2471                 * matter what the error is, it is very likely to persist.
2472                 */
2473                btrfs_debug(fs_info,
2474                        "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
2475                        num_copies, failrec->this_mirror, failed_mirror);
2476                return false;
2477        }
2478
2479        /*
2480         * there are two premises:
2481         *      a) deliver good data to the caller
2482         *      b) correct the bad sectors on disk
2483         */
2484        if (failed_bio_pages > 1) {
2485                /*
2486                 * to fulfill b), we need to know the exact failing sectors, as
2487                 * we don't want to rewrite any more than the failed ones. thus,
2488                 * we need separate read requests for the failed bio
2489                 *
2490                 * if the following BUG_ON triggers, our validation request got
2491                 * merged. we need separate requests for our algorithm to work.
2492                 */
2493                BUG_ON(failrec->in_validation);
2494                failrec->in_validation = 1;
2495                failrec->this_mirror = failed_mirror;
2496        } else {
2497                /*
2498                 * we're ready to fulfill a) and b) at the same time. get a good
2499                 * copy of the failed sector and if we succeed, we have set up
2500                 * everything for repair_io_failure to do the rest for us.
2501                 */
2502                if (failrec->in_validation) {
2503                        BUG_ON(failrec->this_mirror != failed_mirror);
2504                        failrec->in_validation = 0;
2505                        failrec->this_mirror = 0;
2506                }
2507                failrec->failed_mirror = failed_mirror;
2508                failrec->this_mirror++;
2509                if (failrec->this_mirror == failed_mirror)
2510                        failrec->this_mirror++;
2511        }
2512
2513        if (failrec->this_mirror > num_copies) {
2514                btrfs_debug(fs_info,
2515                        "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
2516                        num_copies, failrec->this_mirror, failed_mirror);
2517                return false;
2518        }
2519
2520        return true;
2521}
2522
2523
2524struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
2525                                    struct io_failure_record *failrec,
2526                                    struct page *page, int pg_offset, int icsum,
2527                                    bio_end_io_t *endio_func, void *data)
2528{
2529        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2530        struct bio *bio;
2531        struct btrfs_io_bio *btrfs_failed_bio;
2532        struct btrfs_io_bio *btrfs_bio;
2533
2534        bio = btrfs_io_bio_alloc(1);
2535        bio->bi_end_io = endio_func;
2536        bio->bi_iter.bi_sector = failrec->logical >> 9;
2537        bio_set_dev(bio, fs_info->fs_devices->latest_bdev);
2538        bio->bi_iter.bi_size = 0;
2539        bio->bi_private = data;
2540
2541        btrfs_failed_bio = btrfs_io_bio(failed_bio);
2542        if (btrfs_failed_bio->csum) {
2543                u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
2544
2545                btrfs_bio = btrfs_io_bio(bio);
2546                btrfs_bio->csum = btrfs_bio->csum_inline;
2547                icsum *= csum_size;
2548                memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
2549                       csum_size);
2550        }
2551
2552        bio_add_page(bio, page, failrec->len, pg_offset);
2553
2554        return bio;
2555}
2556
2557/*
2558 * This is a generic handler for readpage errors. If other copies exist, read
2559 * those and write back good data to the failed position. It does not try to
2560 * remap the failed extent elsewhere, hoping the device will be smart enough
2561 * to do this as needed.
2562 */
2563static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
2564                              struct page *page, u64 start, u64 end,
2565                              int failed_mirror)
2566{
2567        struct io_failure_record *failrec;
2568        struct inode *inode = page->mapping->host;
2569        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2570        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2571        struct bio *bio;
2572        int read_mode = 0;
2573        blk_status_t status;
2574        int ret;
2575        unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
2576
2577        BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2578
2579        ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
2580        if (ret)
2581                return ret;
2582
2583        if (!btrfs_check_repairable(inode, failed_bio_pages, failrec,
2584                                    failed_mirror)) {
2585                free_io_failure(failure_tree, tree, failrec);
2586                return -EIO;
2587        }
2588
2589        if (failed_bio_pages > 1)
2590                read_mode |= REQ_FAILFAST_DEV;
2591
2592        phy_offset >>= inode->i_sb->s_blocksize_bits;
2593        bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
2594                                      start - page_offset(page),
2595                                      (int)phy_offset, failed_bio->bi_end_io,
2596                                      NULL);
2597        bio->bi_opf = REQ_OP_READ | read_mode;
2598
2599        btrfs_debug(btrfs_sb(inode->i_sb),
2600                "Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d",
2601                read_mode, failrec->this_mirror, failrec->in_validation);
2602
2603        status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
2604                                         failrec->bio_flags);
2605        if (status) {
2606                free_io_failure(failure_tree, tree, failrec);
2607                bio_put(bio);
2608                ret = blk_status_to_errno(status);
2609        }
2610
2611        return ret;
2612}
2613
2614/* lots and lots of room for performance fixes in the end_bio funcs */
2615
2616void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2617{
2618        int uptodate = (err == 0);
2619        int ret = 0;
2620
2621        btrfs_writepage_endio_finish_ordered(page, start, end, uptodate);
2622
2623        if (!uptodate) {
2624                ClearPageUptodate(page);
2625                SetPageError(page);
2626                ret = err < 0 ? err : -EIO;
2627                mapping_set_error(page->mapping, ret);
2628        }
2629}
2630
2631/*
2632 * after a writepage IO is done, we need to:
2633 * clear the uptodate bits on error
2634 * clear the writeback bits in the extent tree for this IO
2635 * end_page_writeback if the page has no more pending IO
2636 *
2637 * Scheduling is not allowed, so the extent state tree is expected
2638 * to have one and only one object corresponding to this IO.
2639 */
2640static void end_bio_extent_writepage(struct bio *bio)
2641{
2642        int error = blk_status_to_errno(bio->bi_status);
2643        struct bio_vec *bvec;
2644        u64 start;
2645        u64 end;
2646        struct bvec_iter_all iter_all;
2647
2648        ASSERT(!bio_flagged(bio, BIO_CLONED));
2649        bio_for_each_segment_all(bvec, bio, iter_all) {
2650                struct page *page = bvec->bv_page;
2651                struct inode *inode = page->mapping->host;
2652                struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2653
2654                /* We always issue full-page writes, but if some block
2655                 * in a page fails to write, blk_update_request() will
2656                 * advance bv_offset and adjust bv_len to compensate.
2657                 * Print a warning for nonzero offsets, and an error
2658                 * if they don't add up to a full page.  */
2659                if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
2660                        if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
2661                                btrfs_err(fs_info,
2662                                   "partial page write in btrfs with offset %u and length %u",
2663                                        bvec->bv_offset, bvec->bv_len);
2664                        else
2665                                btrfs_info(fs_info,
2666                                   "incomplete page write in btrfs with offset %u and length %u",
2667                                        bvec->bv_offset, bvec->bv_len);
2668                }
2669
2670                start = page_offset(page);
2671                end = start + bvec->bv_offset + bvec->bv_len - 1;
2672
2673                end_extent_writepage(page, error, start, end);
2674                end_page_writeback(page);
2675        }
2676
2677        bio_put(bio);
2678}
2679
2680static void
2681endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
2682                              int uptodate)
2683{
2684        struct extent_state *cached = NULL;
2685        u64 end = start + len - 1;
2686
2687        if (uptodate && tree->track_uptodate)
2688                set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
2689        unlock_extent_cached_atomic(tree, start, end, &cached);
2690}
2691
2692/*
2693 * after a readpage IO is done, we need to:
2694 * clear the uptodate bits on error
2695 * set the uptodate bits if things worked
2696 * set the page up to date if all extents in the tree are uptodate
2697 * clear the lock bit in the extent tree
2698 * unlock the page if there are no other extents locked for it
2699 *
2700 * Scheduling is not allowed, so the extent state tree is expected
2701 * to have one and only one object corresponding to this IO.
2702 */
2703static void end_bio_extent_readpage(struct bio *bio)
2704{
2705        struct bio_vec *bvec;
2706        int uptodate = !bio->bi_status;
2707        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
2708        struct extent_io_tree *tree, *failure_tree;
2709        u64 offset = 0;
2710        u64 start;
2711        u64 end;
2712        u64 len;
2713        u64 extent_start = 0;
2714        u64 extent_len = 0;
2715        int mirror;
2716        int ret;
2717        struct bvec_iter_all iter_all;
2718
2719        ASSERT(!bio_flagged(bio, BIO_CLONED));
2720        bio_for_each_segment_all(bvec, bio, iter_all) {
2721                struct page *page = bvec->bv_page;
2722                struct inode *inode = page->mapping->host;
2723                struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2724                bool data_inode = btrfs_ino(BTRFS_I(inode))
2725                        != BTRFS_BTREE_INODE_OBJECTID;
2726
2727                btrfs_debug(fs_info,
2728                        "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
2729                        (u64)bio->bi_iter.bi_sector, bio->bi_status,
2730                        io_bio->mirror_num);
2731                tree = &BTRFS_I(inode)->io_tree;
2732                failure_tree = &BTRFS_I(inode)->io_failure_tree;
2733
2734                /* We always issue full-page reads, but if some block
2735                 * in a page fails to read, blk_update_request() will
2736                 * advance bv_offset and adjust bv_len to compensate.
2737                 * Print a warning for nonzero offsets, and an error
2738                 * if they don't add up to a full page.  */
2739                if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
2740                        if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
2741                                btrfs_err(fs_info,
2742                                        "partial page read in btrfs with offset %u and length %u",
2743                                        bvec->bv_offset, bvec->bv_len);
2744                        else
2745                                btrfs_info(fs_info,
2746                                        "incomplete page read in btrfs with offset %u and length %u",
2747                                        bvec->bv_offset, bvec->bv_len);
2748                }
2749
2750                start = page_offset(page);
2751                end = start + bvec->bv_offset + bvec->bv_len - 1;
2752                len = bvec->bv_len;
2753
2754                mirror = io_bio->mirror_num;
2755                if (likely(uptodate)) {
2756                        ret = tree->ops->readpage_end_io_hook(io_bio, offset,
2757                                                              page, start, end,
2758                                                              mirror);
2759                        if (ret)
2760                                uptodate = 0;
2761                        else
2762                                clean_io_failure(BTRFS_I(inode)->root->fs_info,
2763                                                 failure_tree, tree, start,
2764                                                 page,
2765                                                 btrfs_ino(BTRFS_I(inode)), 0);
2766                }
2767
2768                if (likely(uptodate))
2769                        goto readpage_ok;
2770
2771                if (data_inode) {
2772
2773                        /*
2774                         * The generic bio_readpage_error handles errors the
2775                         * following way: If possible, new read requests are
2776                         * created and submitted and will end up in
2777                         * end_bio_extent_readpage as well (if we're lucky,
2778                         * not in the !uptodate case). In that case it returns
2779                         * 0 and we just go on with the next page in our bio.
2780                         * If it can't handle the error it will return -EIO and
2781                         * we remain responsible for that page.
2782                         */
2783                        ret = bio_readpage_error(bio, offset, page, start, end,
2784                                                 mirror);
2785                        if (ret == 0) {
2786                                uptodate = !bio->bi_status;
2787                                offset += len;
2788                                continue;
2789                        }
2790                } else {
2791                        struct extent_buffer *eb;
2792
2793                        eb = (struct extent_buffer *)page->private;
2794                        set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
2795                        eb->read_mirror = mirror;
2796                        atomic_dec(&eb->io_pages);
2797                        if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
2798                                               &eb->bflags))
2799                                btree_readahead_hook(eb, -EIO);
2800                }
2801readpage_ok:
2802                if (likely(uptodate)) {
2803                        loff_t i_size = i_size_read(inode);
2804                        pgoff_t end_index = i_size >> PAGE_SHIFT;
2805                        unsigned off;
2806
2807                        /* Zero out the end if this page straddles i_size */
2808                        off = offset_in_page(i_size);
2809                        if (page->index == end_index && off)
2810                                zero_user_segment(page, off, PAGE_SIZE);
2811                        SetPageUptodate(page);
2812                } else {
2813                        ClearPageUptodate(page);
2814                        SetPageError(page);
2815                }
2816                unlock_page(page);
2817                offset += len;
2818
2819                if (unlikely(!uptodate)) {
2820                        if (extent_len) {
2821                                endio_readpage_release_extent(tree,
2822                                                              extent_start,
2823                                                              extent_len, 1);
2824                                extent_start = 0;
2825                                extent_len = 0;
2826                        }
2827                        endio_readpage_release_extent(tree, start,
2828                                                      end - start + 1, 0);
2829                } else if (!extent_len) {
2830                        extent_start = start;
2831                        extent_len = end + 1 - start;
2832                } else if (extent_start + extent_len == start) {
2833                        extent_len += end + 1 - start;
2834                } else {
2835                        endio_readpage_release_extent(tree, extent_start,
2836                                                      extent_len, uptodate);
2837                        extent_start = start;
2838                        extent_len = end + 1 - start;
2839                }
2840        }
2841
2842        if (extent_len)
2843                endio_readpage_release_extent(tree, extent_start, extent_len,
2844                                              uptodate);
2845        btrfs_io_bio_free_csum(io_bio);
2846        bio_put(bio);
2847}
2848
2849/*
2850 * Initialize the members up to but not including 'bio'. Use after allocating a
2851 * new bio by bio_alloc_bioset, which does not initialize the bytes outside of
2852 * 'bio' since use of __GFP_ZERO is not supported.
2853 */
2854static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
2855{
2856        memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
2857}
2858
2859/*
2860 * The following helpers allocate a bio. As it's backed by a bioset, it'll
2861 * never fail.  We return a struct bio here; callers can use btrfs_io_bio()
2862 * on it for the appropriate container_of magic.
2863 */
2864struct bio *btrfs_bio_alloc(u64 first_byte)
2865{
2866        struct bio *bio;
2867
2868        bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &btrfs_bioset);
2869        bio->bi_iter.bi_sector = first_byte >> 9;
2870        btrfs_io_bio_init(btrfs_io_bio(bio));
2871        return bio;
2872}
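
/*
 * A minimal usage sketch (the 'disk_bytenr' name is only illustrative):
 *
 *	struct bio *bio = btrfs_bio_alloc(disk_bytenr);
 *	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
 *
 * after which io_bio can carry the extra per-bio state (e.g. the csums that
 * end_bio_extent_readpage() releases via btrfs_io_bio_free_csum()).
 */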
2873
2874struct bio *btrfs_bio_clone(struct bio *bio)
2875{
2876        struct btrfs_io_bio *btrfs_bio;
2877        struct bio *new;
2878
2879        /* Bio allocation backed by a bioset does not fail */
2880        new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset);
2881        btrfs_bio = btrfs_io_bio(new);
2882        btrfs_io_bio_init(btrfs_bio);
2883        btrfs_bio->iter = bio->bi_iter;
2884        return new;
2885}
2886
2887struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
2888{
2889        struct bio *bio;
2890
2891        /* Bio allocation backed by a bioset does not fail */
2892        bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset);
2893        btrfs_io_bio_init(btrfs_io_bio(bio));
2894        return bio;
2895}
2896
2897struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size)
2898{
2899        struct bio *bio;
2900        struct btrfs_io_bio *btrfs_bio;
2901
2902        /* this will never fail when it's backed by a bioset */
2903        bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset);
2904        ASSERT(bio);
2905
2906        btrfs_bio = btrfs_io_bio(bio);
2907        btrfs_io_bio_init(btrfs_bio);
2908
2909        bio_trim(bio, offset >> 9, size >> 9);
2910        btrfs_bio->iter = bio->bi_iter;
2911        return bio;
2912}
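
/*
 * Note that bio_trim() above works in 512 byte sectors, so both @offset and
 * @size are expected to be multiples of 512.
 */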
2913
2914/*
2915 * @opf:        bio REQ_OP_* and REQ_* flags as one value
2916 * @tree:       tree so we can call our merge_bio hook
2917 * @wbc:        optional writeback control for io accounting
2918 * @page:       page to add to the bio
2919 * @pg_offset:  offset within the page at which the data to add starts
2920 * @size:       number of bytes of the page that we want to add to the bio
2921 * @offset:     logical byte offset on disk; it sets the sector of a new bio
2922 *              and is used to check contiguity with the previous bio
2923 * @bdev:       attach newly created bios to this bdev
2924 * @bio_ret:    must be valid pointer, newly allocated bio will be stored there
2925 * @end_io_func:     end_io callback for new bio
2926 * @mirror_num:      desired mirror to read/write
2927 * @prev_bio_flags:  flags of previous bio to see if we can merge the current one
2928 * @bio_flags:  flags of the current bio to see if we can merge them
2929 */
2930static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
2931                              struct writeback_control *wbc,
2932                              struct page *page, u64 offset,
2933                              size_t size, unsigned long pg_offset,
2934                              struct block_device *bdev,
2935                              struct bio **bio_ret,
2936                              bio_end_io_t end_io_func,
2937                              int mirror_num,
2938                              unsigned long prev_bio_flags,
2939                              unsigned long bio_flags,
2940                              bool force_bio_submit)
2941{
2942        int ret = 0;
2943        struct bio *bio;
2944        size_t page_size = min_t(size_t, size, PAGE_SIZE);
2945        sector_t sector = offset >> 9;
2946
2947        ASSERT(bio_ret);
2948
2949        if (*bio_ret) {
2950                bool contig;
2951                bool can_merge = true;
2952
2953                bio = *bio_ret;
2954                if (prev_bio_flags & EXTENT_BIO_COMPRESSED)
2955                        contig = bio->bi_iter.bi_sector == sector;
2956                else
2957                        contig = bio_end_sector(bio) == sector;
2958
2959                ASSERT(tree->ops);
2960                if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
2961                        can_merge = false;
2962
2963                if (prev_bio_flags != bio_flags || !contig || !can_merge ||
2964                    force_bio_submit ||
2965                    bio_add_page(bio, page, page_size, pg_offset) < page_size) {
2966                        ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
2967                        if (ret < 0) {
2968                                *bio_ret = NULL;
2969                                return ret;
2970                        }
2971                        bio = NULL;
2972                } else {
2973                        if (wbc)
2974                                wbc_account_cgroup_owner(wbc, page, page_size);
2975                        return 0;
2976                }
2977        }
2978
2979        bio = btrfs_bio_alloc(offset);
2980        bio_set_dev(bio, bdev);
2981        bio_add_page(bio, page, page_size, pg_offset);
2982        bio->bi_end_io = end_io_func;
2983        bio->bi_private = tree;
2984        bio->bi_write_hint = page->mapping->host->i_write_hint;
2985        bio->bi_opf = opf;
2986        if (wbc) {
2987                wbc_init_bio(wbc, bio);
2988                wbc_account_cgroup_owner(wbc, page, page_size);
2989        }
2990
2991        *bio_ret = bio;
2992
2993        return ret;
2994}
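
/*
 * A minimal sketch of how the read path drives this helper (mirroring the
 * call in __do_readpage(); the local variable names are illustrative only):
 *
 *	ret = submit_extent_page(REQ_OP_READ, tree, NULL, page, disk_offset,
 *				 iosize, pg_offset, bdev, &bio,
 *				 end_bio_extent_readpage, mirror_num,
 *				 prev_bio_flags, this_bio_flag, false);
 *	if (ret)
 *		SetPageError(page);
 */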
2995
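/*
 * page->private is used in two ways below: btree pages point back at their
 * extent_buffer (attach_extent_buffer_page()), while data pages only carry
 * the EXTENT_PAGE_PRIVATE marker (set_page_extent_mapped()).  Both cases
 * take an extra page reference for the private data.
 */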
2996static void attach_extent_buffer_page(struct extent_buffer *eb,
2997                                      struct page *page)
2998{
2999        if (!PagePrivate(page)) {
3000                SetPagePrivate(page);
3001                get_page(page);
3002                set_page_private(page, (unsigned long)eb);
3003        } else {
3004                WARN_ON(page->private != (unsigned long)eb);
3005        }
3006}
3007
3008void set_page_extent_mapped(struct page *page)
3009{
3010        if (!PagePrivate(page)) {
3011                SetPagePrivate(page);
3012                get_page(page);
3013                set_page_private(page, EXTENT_PAGE_PRIVATE);
3014        }
3015}
3016
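/*
 * __get_extent_map() lets contiguous_readpages() reuse a single extent map
 * lookup across adjacent pages via 'em_cached': the cached map is returned
 * as long as 'start' still falls inside it, and is dropped and replaced as
 * soon as a page falls outside of it.
 */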
3017static struct extent_map *
3018__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
3019                 u64 start, u64 len, get_extent_t *get_extent,
3020                 struct extent_map **em_cached)
3021{
3022        struct extent_map *em;
3023
3024        if (em_cached && *em_cached) {
3025                em = *em_cached;
3026                if (extent_map_in_tree(em) && start >= em->start &&
3027                    start < extent_map_end(em)) {
3028                        refcount_inc(&em->refs);
3029                        return em;
3030                }
3031
3032                free_extent_map(em);
3033                *em_cached = NULL;
3034        }
3035
3036        em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0);
3037        if (em_cached && !IS_ERR_OR_NULL(em)) {
3038                BUG_ON(*em_cached);
3039                refcount_inc(&em->refs);
3040                *em_cached = em;
3041        }
3042        return em;
3043}
3044/*
3045 * Basic readpage implementation.  Locked extent state structs are inserted
3046 * into the tree and removed when the IO is done (by the end_io
3047 * handlers).
3048 * XXX JDM: This needs looking at to ensure proper page locking
3049 * Return 0 on success, otherwise return an error.
3050 */
3051static int __do_readpage(struct extent_io_tree *tree,
3052                         struct page *page,
3053                         get_extent_t *get_extent,
3054                         struct extent_map **em_cached,
3055                         struct bio **bio, int mirror_num,
3056                         unsigned long *bio_flags, unsigned int read_flags,
3057                         u64 *prev_em_start)
3058{
3059        struct inode *inode = page->mapping->host;
3060        u64 start = page_offset(page);
3061        const u64 end = start + PAGE_SIZE - 1;
3062        u64 cur = start;
3063        u64 extent_offset;
3064        u64 last_byte = i_size_read(inode);
3065        u64 block_start;
3066        u64 cur_end;
3067        struct extent_map *em;
3068        struct block_device *bdev;
3069        int ret = 0;
3070        int nr = 0;
3071        size_t pg_offset = 0;
3072        size_t iosize;
3073        size_t disk_io_size;
3074        size_t blocksize = inode->i_sb->s_blocksize;
3075        unsigned long this_bio_flag = 0;
3076
3077        set_page_extent_mapped(page);
3078
3079        if (!PageUptodate(page)) {
3080                if (cleancache_get_page(page) == 0) {
3081                        BUG_ON(blocksize != PAGE_SIZE);
3082                        unlock_extent(tree, start, end);
3083                        goto out;
3084                }
3085        }
3086
3087        if (page->index == last_byte >> PAGE_SHIFT) {
3088                char *userpage;
3089                size_t zero_offset = offset_in_page(last_byte);
3090
3091                if (zero_offset) {
3092                        iosize = PAGE_SIZE - zero_offset;
3093                        userpage = kmap_atomic(page);
3094                        memset(userpage + zero_offset, 0, iosize);
3095                        flush_dcache_page(page);
3096                        kunmap_atomic(userpage);
3097                }
3098        }
3099        while (cur <= end) {
3100                bool force_bio_submit = false;
3101                u64 offset;
3102
3103                if (cur >= last_byte) {
3104                        char *userpage;
3105                        struct extent_state *cached = NULL;
3106
3107                        iosize = PAGE_SIZE - pg_offset;
3108                        userpage = kmap_atomic(page);
3109                        memset(userpage + pg_offset, 0, iosize);
3110                        flush_dcache_page(page);
3111                        kunmap_atomic(userpage);
3112                        set_extent_uptodate(tree, cur, cur + iosize - 1,
3113                                            &cached, GFP_NOFS);
3114                        unlock_extent_cached(tree, cur,
3115                                             cur + iosize - 1, &cached);
3116                        break;
3117                }
3118                em = __get_extent_map(inode, page, pg_offset, cur,
3119                                      end - cur + 1, get_extent, em_cached);
3120                if (IS_ERR_OR_NULL(em)) {
3121                        SetPageError(page);
3122                        unlock_extent(tree, cur, end);
3123                        break;
3124                }
3125                extent_offset = cur - em->start;
3126                BUG_ON(extent_map_end(em) <= cur);
3127                BUG_ON(end < cur);
3128
3129                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
3130                        this_bio_flag |= EXTENT_BIO_COMPRESSED;
3131                        extent_set_compress_type(&this_bio_flag,
3132                                                 em->compress_type);
3133                }
3134
3135                iosize = min(extent_map_end(em) - cur, end - cur + 1);
3136                cur_end = min(extent_map_end(em) - 1, end);
3137                iosize = ALIGN(iosize, blocksize);
3138                if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
3139                        disk_io_size = em->block_len;
3140                        offset = em->block_start;
3141                } else {
3142                        offset = em->block_start + extent_offset;
3143                        disk_io_size = iosize;
3144                }
3145                bdev = em->bdev;
3146                block_start = em->block_start;
3147                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3148                        block_start = EXTENT_MAP_HOLE;
3149
3150                /*
3151                 * If we have a file range that points to a compressed extent
3152                 * and it's followed by a consecutive file range that points
3153                 * to the same compressed extent (possibly with a different
3154                 * offset and/or length, so it either points to the whole extent
3155                 * or only part of it), we must make sure we do not submit a
3156                 * single bio to populate the pages for the 2 ranges because
3157                 * this makes the compressed extent read zero out the pages
3158                 * belonging to the 2nd range. Imagine the following scenario:
3159                 *
3160                 *  File layout
3161                 *  [0 - 8K]                     [8K - 24K]
3162                 *    |                               |
3163                 *    |                               |
3164                 * points to extent X,         points to extent X,
3165                 * offset 4K, length of 8K     offset 0, length 16K
3166                 *
3167                 * [extent X, compressed length = 4K uncompressed length = 16K]
3168                 *
3169                 * If the bio to read the compressed extent covers both ranges,
3170                 * it will decompress extent X into the pages belonging to the
3171                 * first range and then it will stop, zeroing out the remaining
3172                 * pages that belong to the other range that points to extent X.
3173                 * So here we make sure we submit 2 bios, one for the first
3174                 * range and another one for the second range. Both will target
3175                 * the same physical extent from disk, but we can't currently
3176                 * make the compressed bio endio callback populate the pages
3177                 * for both ranges because each compressed bio is tightly
3178                 * coupled with a single extent map, and each range can have
3179                 * an extent map with a different offset value relative to the
3180                 * uncompressed data of our extent and different lengths. This
3181                 * is a corner case so we prioritize correctness over
3182                 * non-optimal behavior (submitting 2 bios for the same extent).
3183                 */
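                /*
                 * prev_em_start, when provided by the readahead path, is
                 * expected to start out as (u64)-1, i.e. no extent map seen
                 * yet for this batch of pages.
                 */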
3184                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
3185                    prev_em_start && *prev_em_start != (u64)-1 &&
3186                    *prev_em_start != em->start)
3187                        force_bio_submit = true;
3188
3189                if (prev_em_start)
3190                        *prev_em_start = em->start;
3191
3192                free_extent_map(em);
3193                em = NULL;
3194
3195                /* we've found a hole, just zero and go on */
3196                if (block_start == EXTENT_MAP_HOLE) {
3197                        char *userpage;
3198                        struct extent_state *cached = NULL;
3199
3200                        userpage = kmap_atomic(page);
3201                        memset(userpage + pg_offset, 0, iosize);
3202                        flush_dcache_page(page);
3203                        kunmap_atomic(userpage);
3204
3205                        set_extent_uptodate(tree, cur, cur + iosize - 1,
3206                                            &cached, GFP_NOFS);
3207                        unlock_extent_cached(tree, cur,
3208                                             cur + iosize - 1, &cached);
3209                        cur = cur + iosize;
3210                        pg_offset += iosize;
3211                        continue;
3212                }
3213                /* the get_extent function already copied into the page */
3214                if (test_range_bit(tree, cur, cur_end,
3215                                   EXTENT_UPTODATE, 1, NULL)) {
3216                        check_page_uptodate(tree, page);
3217                        unlock_extent(tree, cur, cur + iosize - 1);
3218                        cur = cur + iosize;
3219                        pg_offset += iosize;
3220                        continue;
3221                }
3222                /* we have an inline extent but it didn't get marked up
3223                 * to date.  Error out
3224                 */
3225                if (block_start == EXTENT_MAP_INLINE) {
3226                        SetPageError(page);
3227                        unlock_extent(tree, cur, cur + iosize - 1);
3228                        cur = cur + iosize;
3229                        pg_offset += iosize;
3230                        continue;
3231                }
3232
3233                ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL,
3234                                         page, offset, disk_io_size,
3235                                         pg_offset, bdev, bio,
3236                                         end_bio_extent_readpage, mirror_num,
3237                                         *bio_flags,
3238                                         this_bio_flag,
3239                                         force_bio_submit);
3240                if (!ret) {
3241                        nr++;
3242                        *bio_flags = this_bio_flag;
3243                } else {
3244                        SetPageError(page);
3245                        unlock_extent(tree, cur, cur + iosize - 1);
3246                        goto out;
3247                }
3248                cur = cur + iosize;
3249                pg_offset += iosize;
3250        }
3251out:
3252        if (!nr) {
3253                if (!PageError(page))
3254                        SetPageUptodate(page);
3255                unlock_page(page);
3256        }
3257        return ret;
3258}
3259
3260static inline void contiguous_readpages(struct extent_io_tree *tree,
3261                                             struct page *pages[], int nr_pages,
3262                                             u64 start, u64 end,
3263                                             struct extent_map **em_cached,
3264                                             struct bio **bio,
3265                                             unsigned long *bio_flags,
3266                                             u64 *prev_em_start)
3267{
3268        struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
3269        int index;
3270
3271        btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
3272
3273        for (index = 0; index < nr_pages; index++) {
3274                __do_readpage(tree, pages[index], btrfs_get_extent, em_cached,
3275                                bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);
3276                put_page(pages[index]);
3277        }
3278}
3279
3280static int __extent_read_full_page(struct extent_io_tree *tree,
3281                                   struct page *page,
3282                                   get_extent_t *get_extent,
3283                                   struct bio **bio, int mirror_num,
3284                                   unsigned long *bio_flags,
3285                                   unsigned int read_flags)
3286{
3287        struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
3288        u64 start = page_offset(page);
3289        u64 end = start + PAGE_SIZE - 1;
3290        int ret;
3291
3292        btrfs_lock_and_flush_ordered_range(tree, inode, start, end, NULL);
3293
3294        ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
3295                            bio_flags, read_flags, NULL);
3296        return ret;
3297}
3298
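/*
 * Read a single page; typically reached from the ->readpage address_space
 * callback with the page already locked.  The page is unlocked either
 * directly by __do_readpage() (when no bio needs to be submitted) or by
 * end_bio_extent_readpage() once the IO completes.
 */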
3299int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
3300                            get_extent_t *get_extent, int mirror_num)
3301{
3302        struct bio *bio = NULL;
3303        unsigned long bio_flags = 0;
3304        int ret;
3305
3306        ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
3307                                      &bio_flags, 0);
3308        if (bio)
3309                ret = submit_one_bio(bio, mirror_num, bio_flags);
3310        return ret;
3311}
3312
3313static void update_nr_written(struct writeback_control *wbc,
3314                              unsigned long nr_written)
3315{
3316        wbc->nr_to_write -= nr_written;
3317}
3318
3319/*
3320 * helper for __extent_writepage, doing all of the delayed allocation setup.
3321 *
3322 * This returns 1 if btrfs_run_delalloc_range() did all the work required
3323 * to write the page (copy into inline extent).  In this case the IO has
3324 * been started and the page is already unlocked.
3325 *
3326 * This returns 0 if all went well (page still locked)
3327 * This returns < 0 if there were errors (page still locked)
3328 */
3329static noinline_for_stack int writepage_delalloc(struct inode *inode,
3330                struct page *page, struct writeback_control *wbc,
3331                u64 delalloc_start, unsigned long *nr_written)
3332{
3333        u64 page_end = delalloc_start + PAGE_SIZE - 1;
3334        bool found;
3335        u64 delalloc_to_write = 0;
3336        u64 delalloc_end = 0;
3337        int ret;
3338        int page_started = 0;
3339
3340
3341        while (delalloc_end < page_end) {
3342                found = find_lock_delalloc_range(inode, page,
3343                                               &delalloc_start,
3344                                               &delalloc_end);
3345                if (!found) {
3346                        delalloc_start = delalloc_end + 1;
3347                        continue;
3348                }
3349                ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
3350                                delalloc_end, &page_started, nr_written, wbc);
3351                if (ret) {
3352                        SetPageError(page);
3353                        /*
3354                         * btrfs_run_delalloc_range should return < 0 for error
3355                         * but just in case, we use > 0 here meaning the IO is
3356                         * started, so we don't want to return > 0 unless
3357                         * things are going well.
3358                         */
3359                        ret = ret < 0 ? ret : -EIO;
3360                        goto done;
3361                }
3362                /*
3363                 * delalloc_end is already one less than the total length, so
3364                 * we don't subtract one from PAGE_SIZE
3365                 */
3366                delalloc_to_write += (delalloc_end - delalloc_start +
3367                                      PAGE_SIZE) >> PAGE_SHIFT;
3368                delalloc_start = delalloc_end + 1;
3369        }
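        /*
         * If we ended up with more delalloc to write than wbc asked for,
         * raise nr_to_write (subject to a heuristic cap) so the rest of the
         * delalloc range can still be written out in this pass.
         */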
3370        if (wbc->nr_to_write < delalloc_to_write) {
3371                int thresh = 8192;
3372
3373                if (delalloc_to_write < thresh * 2)
3374                        thresh = delalloc_to_write;
3375                wbc->nr_to_write = min_t(u64, delalloc_to_write,
3376                                         thresh);
3377        }
3378
3379        /* did btrfs_run_delalloc_range() already unlock and start
3380         * the IO?
3381         */
3382        if (page_started) {
3383                /*
3384                 * we've unlocked the page, so we can't update
3385                 * the mapping's writeback index, just update
3386                 * nr_to_write.
3387                 */
3388                wbc->nr_to_write -= *nr_written;
3389                return 1;
3390        }
3391
3392        ret = 0;
3393
3394done:
3395        return ret;
3396}
3397
3398/*
3399 * helper for __extent_writepage.  This calls the writepage start hooks,
3400 * and does the loop to map the page into extents and bios.
3401 *
3402 * We return 1 if the IO is started and the page is unlocked,
3403 * 0 if all went well (page still locked)
3404 * < 0 if there were errors (page still locked)
3405 */
3406static noinline_for_stack int __extent_writepage_io(struct inode *inode,
3407                                 struct page *page,
3408                                 struct writeback_control *wbc,
3409                                 struct extent_page_data *epd,
3410                                 loff_t i_size,
3411                                 unsigned long nr_written,
3412                                 unsigned int write_flags, int *nr_ret)
3413{
3414        struct extent_io_tree *tree = epd->tree;
3415        u64 start = page_offset(page);
3416        u64 page_end = start + PAGE_SIZE - 1;
3417        u64 end;
3418        u64 cur = start;
3419        u64 extent_offset;
3420        u64 block_start;
3421        u64 iosize;
3422        struct extent_map *em;
3423        struct block_device *bdev;
3424        size_t pg_offset = 0;
3425        size_t blocksize;
3426        int ret = 0;
3427        int nr = 0;
3428        bool compressed;
3429
3430        ret = btrfs_writepage_cow_fixup(page, start, page_end);
3431        if (ret) {
3432                /* Fixup worker will requeue */
3433                if (ret == -EBUSY)
3434                        wbc->pages_skipped++;
3435                else
3436                        redirty_page_for_writepage(wbc, page);
3437
3438                update_nr_written(wbc, nr_written);
3439                unlock_page(page);
3440                return 1;
3441        }
3442
3443        /*
3444         * we don't want to touch the inode after unlocking the page,
3445         * so we update the mapping writeback index now
3446         */
3447        update_nr_written(wbc, nr_written + 1);
3448
3449        end = page_end;
3450        if (i_size <= start) {
3451                btrfs_writepage_endio_finish_ordered(page, start, page_end, 1);
3452                goto done;
3453        }
3454
3455        blocksize = inode->i_sb->s_blocksize;
3456
3457        while (cur <= end) {
3458                u64 em_end;
3459                u64 offset;
3460
3461                if (cur >= i_size) {
3462                        btrfs_writepage_endio_finish_ordered(page, cur,
3463                                                             page_end, 1);
3464                        break;
3465                }
3466                em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur,
3467                                     end - cur + 1, 1);
3468                if (IS_ERR_OR_NULL(em)) {
3469                        SetPageError(page);
3470                        ret = PTR_ERR_OR_ZERO(em);
3471                        break;
3472                }
3473
3474                extent_offset = cur - em->start;
3475                em_end = extent_map_end(em);
3476                BUG_ON(em_end <= cur);
3477                BUG_ON(end < cur);
3478                iosize = min(em_end - cur, end - cur + 1);
3479                iosize = ALIGN(iosize, blocksize);
3480                offset = em->block_start + extent_offset;
3481                bdev = em->bdev;
3482                block_start = em->block_start;
3483                compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
3484                free_extent_map(em);
3485                em = NULL;
3486
3487                /*
3488                 * compressed and inline extents are written through other
3489                 * paths in the FS
3490                 */
3491                if (compressed || block_start == EXTENT_MAP_HOLE ||
3492                    block_start == EXTENT_MAP_INLINE) {
3493                        /*
3494                         * end_io notification does not happen here for
3495                         * compressed extents
3496                         */
3497                        if (!compressed)
3498                                btrfs_writepage_endio_finish_ordered(page, cur,
3499                                                            cur + iosize - 1,
3500                                                            1);
3501                        else if (compressed) {
3502                                /* we don't want to end_page_writeback on
3503                                 * a compressed extent.  this happens
3504                                 * elsewhere
3505                                 */
3506                                nr++;
3507                        }
3508
3509                        cur += iosize;
3510                        pg_offset += iosize;
3511                        continue;
3512                }
3513
3514                btrfs_set_range_writeback(tree, cur, cur + iosize - 1);
3515                if (!PageWriteback(page)) {
3516                        btrfs_err(BTRFS_I(inode)->root->fs_info,
3517                                   "page %lu not writeback, cur %llu end %llu",
3518                               page->index, cur, end);
3519                }
3520
3521                ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
3522                                         page, offset, iosize, pg_offset,
3523                                         bdev, &epd->bio,
3524                                         end_bio_extent_writepage,
3525                                         0, 0, 0, false);
3526                if (ret) {
3527                        SetPageError(page);
3528                        if (PageWriteback(page))
3529                                end_page_writeback(page);
3530                }
3531
3532                cur = cur + iosize;
3533                pg_offset += iosize;
3534                nr++;
3535        }
3536done:
3537        *nr_ret = nr;
3538        return ret;
3539}
3540
3541/*
3542 * The writepage semantics are similar to regular writepage.  Extent
3543 * records are inserted to lock ranges in the tree, and as dirty areas
3544 * are found, they are marked writeback.  Then the lock bits are removed
3545 * and the end_io handler clears the writeback ranges.
3546 *
3547 * Return 0 if everything goes well.
3548 * Return <0 for error.
3549 */
3550static int __extent_writepage(struct page *page, struct writeback_control *wbc,
3551                              struct extent_page_data *epd)
3552{
3553        struct inode *inode = page->mapping->host;
3554        u64 start = page_offset(page);
3555        u64 page_end = start + PAGE_SIZE - 1;
3556        int ret;
3557        int nr = 0;
3558        size_t pg_offset = 0;
3559        loff_t i_size = i_size_read(inode);
3560        unsigned long end_index = i_size >> PAGE_SHIFT;
3561        unsigned int write_flags = 0;
3562        unsigned long nr_written = 0;
3563
3564        write_flags = wbc_to_write_flags(wbc);
3565
3566        trace___extent_writepage(page, inode, wbc);
3567
3568        WARN_ON(!PageLocked(page));
3569
3570        ClearPageError(page);
3571
3572        pg_offset = offset_in_page(i_size);
3573        if (page->index > end_index ||
3574           (page->index == end_index && !pg_offset)) {
3575                page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
3576                unlock_page(page);
3577                return 0;
3578        }
3579
3580        if (page->index == end_index) {
3581                char *userpage;
3582
3583                userpage = kmap_atomic(page);
3584                memset(userpage + pg_offset, 0,
3585                       PAGE_SIZE - pg_offset);
3586                kunmap_atomic(userpage);
3587                flush_dcache_page(page);
3588        }
3589
3590        pg_offset = 0;
3591
3592        set_page_extent_mapped(page);
3593
3594        if (!epd->extent_locked) {
3595                ret = writepage_delalloc(inode, page, wbc, start, &nr_written);
3596                if (ret == 1)
3597                        goto done_unlocked;
3598                if (ret)
3599                        goto done;
3600        }
3601
3602        ret = __extent_writepage_io(inode, page, wbc, epd,
3603                                    i_size, nr_written, write_flags, &nr);
3604        if (ret == 1)
3605                goto done_unlocked;
3606
3607done:
3608        if (nr == 0) {
3609                /* make sure the mapping tag for page dirty gets cleared */
3610                set_page_writeback(page);
3611                end_page_writeback(page);
3612        }
3613        if (PageError(page)) {
3614                ret = ret < 0 ? ret : -EIO;
3615                end_extent_writepage(page, ret, start, page_end);
3616        }
3617        unlock_page(page);
3618        ASSERT(ret <= 0);
3619        return ret;
3620
3621done_unlocked:
3622        return 0;
3623}
3624
3625void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
3626{
3627        wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
3628                       TASK_UNINTERRUPTIBLE);
3629}
3630
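/*
 * Pairs with wait_on_extent_buffer_writeback(): the barrier below makes sure
 * the EXTENT_BUFFER_WRITEBACK bit is visibly cleared before wake_up_bit()
 * checks for waiters, so a waiter cannot miss the wakeup.
 */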
3631static void end_extent_buffer_writeback(struct extent_buffer *eb)
3632{
3633        clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3634        smp_mb__after_atomic();
3635        wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3636}
3637
3638/*
3639 * Lock eb pages and flush the bio if we can't get the locks
3640 *
3641 * Return  0 if nothing went wrong
3642 * Return >0 is the same as 0, except the bio is not submitted
3643 * Return <0 if something went wrong, no page is locked
3644 */
3645static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
3646                          struct extent_page_data *epd)
3647{
3648        struct btrfs_fs_info *fs_info = eb->fs_info;
3649        int i, num_pages, failed_page_nr;
3650        int flush = 0;
3651        int ret = 0;
3652
3653        if (!btrfs_try_tree_write_lock(eb)) {
3654                ret = flush_write_bio(epd);
3655                if (ret < 0)
3656                        return ret;
3657                flush = 1;
3658                btrfs_tree_lock(eb);
3659        }
3660
3661        if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
3662                btrfs_tree_unlock(eb);
3663                if (!epd->sync_io)
3664                        return 0;
3665                if (!flush) {
3666                        ret = flush_write_bio(epd);
3667                        if (ret < 0)
3668                                return ret;
3669                        flush = 1;
3670                }
3671                while (1) {
3672                        wait_on_extent_buffer_writeback(eb);
3673                        btrfs_tree_lock(eb);
3674                        if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
3675                                break;
3676                        btrfs_tree_unlock(eb);
3677                }
3678        }
3679
3680        /*
3681         * We need to do this to prevent races with anyone who checks whether the
3682         * eb is under IO, since we can end up having no IO bits set for a short
3683         * period of time.
3684         */
3685        spin_lock(&eb->refs_lock);
3686        if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3687                set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3688                spin_unlock(&eb->refs_lock);
3689                btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3690                percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
3691                                         -eb->len,
3692                                         fs_info->dirty_metadata_batch);
3693                ret = 1;
3694        } else {
3695                spin_unlock(&eb->refs_lock);
3696        }
3697
3698        btrfs_tree_unlock(eb);
3699
3700        if (!ret)
3701                return ret;
3702
3703        num_pages = num_extent_pages(eb);
3704        for (i = 0; i < num_pages; i++) {
3705                struct page *p = eb->pages[i];
3706
3707                if (!trylock_page(p)) {
3708                        if (!flush) {
3709                                int err;
3710
3711                                err = flush_write_bio(epd);
3712                                if (err < 0) {
3713                                        ret = err;
3714                                        failed_page_nr = i;
3715                                        goto err_unlock;
3716                                }
3717                                flush = 1;
3718                        }
3719                        lock_page(p);
3720                }
3721        }
3722
3723        return ret;
3724err_unlock:
3725        /* Unlock already locked pages */
3726        for (i = 0; i < failed_page_nr; i++)
3727                unlock_page(eb->pages[i]);
3728        /*
3729         * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it.
3730         * Also set EXTENT_BUFFER_DIRTY back so future write attempts on this eb
3731         * can be made, undoing everything done above.
3732         */
3733        btrfs_tree_lock(eb);
3734        spin_lock(&eb->refs_lock);
3735        set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3736        end_extent_buffer_writeback(eb);
3737        spin_unlock(&eb->refs_lock);
3738        percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len,
3739                                 fs_info->dirty_metadata_batch);
3740        btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3741        btrfs_tree_unlock(eb);
3742        return ret;
3743}
3744
3745static void set_btree_ioerr(struct page *page)
3746{
3747        struct extent_buffer *eb = (struct extent_buffer *)page->private;
3748
3749        SetPageError(page);
3750        if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
3751                return;
3752
3753        /*
3754         * If writeback for a btree extent that doesn't belong to a log tree
3755         * failed, increment the counter transaction->eb_write_errors.
3756         * We do this because while the transaction is running and before it's
3757         * committing (when we call filemap_fdata[write|wait]_range against
3758         * the btree inode), we might have
3759         * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
3760         * returns an error or an error happens during writeback, when we're
3761         * committing the transaction we wouldn't know about it, since the pages
3762         * can be no longer dirty nor marked anymore for writeback (if a
3763         * subsequent modification to the extent buffer didn't happen before the
3764         * transaction commit), which makes filemap_fdata[write|wait]_range not
3765         * able to find the pages tagged with SetPageError at transaction
3766         * commit time. So if this happens we must abort the transaction,
3767         * otherwise we commit a super block with btree roots that point to
3768         * btree nodes/leafs whose content on disk is invalid - either garbage
3769         * or the content of some node/leaf from a past generation that got
3770         * cowed or deleted and is no longer valid.
3771         *
3772         * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
3773         * not be enough - we need to distinguish between log tree extents vs
3774         * non-log tree extents, and the next filemap_fdatawait_range() call
3775         * will catch and clear such errors in the mapping - and that call might
3776         * be from a log sync and not from a transaction commit. Also, checking
3777         * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
3778         * not done and would not be reliable - the eb might have been released
3779         * from memory and reading it back again means that flag would not be
3780         * set (since it's a runtime flag, not persisted on disk).
3781         *
3782         * Using the flags below in the btree inode also covers the case where
3783         * writepages() returned success and writeback was started for all the
3784         * dirty pages, but, before filemap_fdatawait_range() gets called, that
3785         * writeback has already finished with errors - because we were not
3786         * using AS_EIO/AS_ENOSPC, filemap_fdatawait_range() would return
3787         * success in that case too, as it could not know that writeback
3788         * errors happened (the pages were no longer tagged for
3789         * writeback).
3790         */
3791        switch (eb->log_index) {
3792        case -1:
3793                set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
3794                break;
3795        case 0:
3796                set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
3797                break;
3798        case 1:
3799                set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
3800                break;
3801        default:
3802                BUG(); /* unexpected, logic error */
3803        }
3804}
3805
3806static void end_bio_extent_buffer_writepage(struct bio *bio)
3807{
3808        struct bio_vec *bvec;
3809        struct extent_buffer *eb;
3810        int done;
3811        struct bvec_iter_all iter_all;
3812
3813        ASSERT(!bio_flagged(bio, BIO_CLONED));
3814        bio_for_each_segment_all(bvec, bio, iter_all) {
3815                struct page *page = bvec->bv_page;
3816
3817                eb = (struct extent_buffer *)page->private;
3818                BUG_ON(!eb);
3819                done = atomic_dec_and_test(&eb->io_pages);
3820
3821                if (bio->bi_status ||
3822                    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
3823                        ClearPageUptodate(page);
3824                        set_btree_ioerr(page);
3825                }
3826
3827                end_page_writeback(page);
3828
3829                if (!done)
3830                        continue;
3831
3832                end_extent_buffer_writeback(eb);
3833        }
3834
3835        bio_put(bio);
3836}
3837
3838static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
3839                        struct writeback_control *wbc,
3840                        struct extent_page_data *epd)
3841{
3842        struct btrfs_fs_info *fs_info = eb->fs_info;
3843        struct block_device *bdev = fs_info->fs_devices->latest_bdev;
3844        struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
3845        u64 offset = eb->start;
3846        u32 nritems;
3847        int i, num_pages;
3848        unsigned long start, end;
3849        unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
3850        int ret = 0;
3851
3852        clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
3853        num_pages = num_extent_pages(eb);
3854        atomic_set(&eb->io_pages, num_pages);
3855
3856        /* Zero out btree blocks beyond nritems to avoid stale content. */
3857        nritems = btrfs_header_nritems(eb);
3858        if (btrfs_header_level(eb) > 0) {
3859                end = btrfs_node_key_ptr_offset(nritems);
3860
3861                memzero_extent_buffer(eb, end, eb->len - end);
3862        } else {
3863                /*
3864                 * leaf:
3865                 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
3866                 */
3867                start = btrfs_item_nr_offset(nritems);
3868                end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
3869                memzero_extent_buffer(eb, start, end - start);
3870        }
3871
3872        for (i = 0; i < num_pages; i++) {
3873                struct page *p = eb->pages[i];
3874
3875                clear_page_dirty_for_io(p);
3876                set_page_writeback(p);
3877                ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc,
3878                                         p, offset, PAGE_SIZE, 0, bdev,
3879                                         &epd->bio,
3880                                         end_bio_extent_buffer_writepage,
3881                                         0, 0, 0, false);
3882                if (ret) {
3883                        set_btree_ioerr(p);
3884                        if (PageWriteback(p))
3885                                end_page_writeback(p);
3886                        if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
3887                                end_extent_buffer_writeback(eb);
3888                        ret = -EIO;
3889                        break;
3890                }
3891                offset += PAGE_SIZE;
3892                update_nr_written(wbc, 1);
3893                unlock_page(p);
3894        }
3895
3896        if (unlikely(ret)) {
3897                for (; i < num_pages; i++) {
3898                        struct page *p = eb->pages[i];
3899                        clear_page_dirty_for_io(p);
3900                        unlock_page(p);
3901                }
3902        }
3903
3904        return ret;
3905}
3906
3907int btree_write_cache_pages(struct address_space *mapping,
3908                                   struct writeback_control *wbc)
3909{
3910        struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
3911        struct extent_buffer *eb, *prev_eb = NULL;
3912        struct extent_page_data epd = {
3913                .bio = NULL,
3914                .tree = tree,
3915                .extent_locked = 0,
3916                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3917        };
3918        int ret = 0;
3919        int done = 0;
3920        int nr_to_write_done = 0;
3921        struct pagevec pvec;
3922        int nr_pages;
3923        pgoff_t index;
3924        pgoff_t end;            /* Inclusive */
3925        int scanned = 0;
3926        xa_mark_t tag;
3927
3928        pagevec_init(&pvec);
3929        if (wbc->range_cyclic) {
3930                index = mapping->writeback_index; /* Start from prev offset */
3931                end = -1;
3932        } else {
3933                index = wbc->range_start >> PAGE_SHIFT;
3934                end = wbc->range_end >> PAGE_SHIFT;
3935                scanned = 1;
3936        }
3937        if (wbc->sync_mode == WB_SYNC_ALL)
3938                tag = PAGECACHE_TAG_TOWRITE;
3939        else
3940                tag = PAGECACHE_TAG_DIRTY;
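        /*
         * For data integrity (WB_SYNC_ALL) writeback, tag_pages_for_writeback()
         * below snapshots the currently dirty pages with PAGECACHE_TAG_TOWRITE,
         * so pages dirtied after this point cannot keep us looping forever.
         */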
3941retry:
3942        if (wbc->sync_mode == WB_SYNC_ALL)
3943                tag_pages_for_writeback(mapping, index, end);
3944        while (!done && !nr_to_write_done && (index <= end) &&
3945               (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
3946                        tag))) {
3947                unsigned i;
3948
3949                scanned = 1;
3950                for (i = 0; i < nr_pages; i++) {
3951                        struct page *page = pvec.pages[i];
3952
3953                        if (!PagePrivate(page))
3954                                continue;
3955
3956                        spin_lock(&mapping->private_lock);
3957                        if (!PagePrivate(page)) {
3958                                spin_unlock(&mapping->private_lock);
3959                                continue;
3960                        }
3961
3962                        eb = (struct extent_buffer *)page->private;
3963
3964                        /*
3965                         * Shouldn't happen and normally this would be a BUG_ON
3966                         * but no sense in crashing the user's box for something
3967                         * we can survive anyway.
3968                         */
3969                        if (WARN_ON(!eb)) {
3970                                spin_unlock(&mapping->private_lock);
3971                                continue;
3972                        }
3973
3974                        if (eb == prev_eb) {
3975                                spin_unlock(&mapping->private_lock);
3976                                continue;
3977                        }
3978
3979                        ret = atomic_inc_not_zero(&eb->refs);
3980                        spin_unlock(&mapping->private_lock);
3981                        if (!ret)
3982                                continue;
3983
3984                        prev_eb = eb;
3985                        ret = lock_extent_buffer_for_io(eb, &epd);
3986                        if (!ret) {
3987                                free_extent_buffer(eb);
3988                                continue;
3989                        }
3990
3991                        ret = write_one_eb(eb, wbc, &epd);
3992                        if (ret) {
3993                                done = 1;
3994                                free_extent_buffer(eb);
3995                                break;
3996                        }
3997                        free_extent_buffer(eb);
3998
3999                        /*
4000                         * the filesystem may choose to bump up nr_to_write.
4001                         * We have to make sure to honor the new nr_to_write
4002                         * at any time
4003                         */
4004                        nr_to_write_done = wbc->nr_to_write <= 0;
4005                }
4006                pagevec_release(&pvec);
4007                cond_resched();
4008        }
4009        if (!scanned && !done) {
4010                /*
4011                 * We hit the last page and there is more work to be done: wrap
4012                 * back to the start of the file
4013                 */
4014                scanned = 1;
4015                index = 0;
4016                goto retry;
4017        }
4018        ASSERT(ret <= 0);
4019        if (ret < 0) {
4020                end_write_bio(&epd, ret);
4021                return ret;
4022        }
4023        ret = flush_write_bio(&epd);
4024        return ret;
4025}
4026
4027/**
4028 * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
4029 * @mapping: address space structure to write
4030 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
4031 * @epd: extent_page_data holding the partially built bio and write state for __extent_writepage()
4032 *
4033 * If a page is already under I/O, write_cache_pages() skips it, even
4034 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
4035 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
4036 * and msync() need to guarantee that all the data which was dirty at the time
4037 * the call was made get new I/O started against them.  If wbc->sync_mode is
4038 * WB_SYNC_ALL then we were called for data integrity and we must wait for
4039 * existing IO to complete.
4040 */
4041static int extent_write_cache_pages(struct address_space *mapping,
4042                             struct writeback_control *wbc,
4043                             struct extent_page_data *epd)
4044{
4045        struct inode *inode = mapping->host;
4046        int ret = 0;
4047        int done = 0;
4048        int nr_to_write_done = 0;
4049        struct pagevec pvec;
4050        int nr_pages;
4051        pgoff_t index;
4052        pgoff_t end;            /* Inclusive */
4053        pgoff_t done_index;
4054        int range_whole = 0;
4055        int scanned = 0;
4056        xa_mark_t tag;
4057
4058        /*
4059         * We have to hold onto the inode so that ordered extents can do their
4060         * work when the IO finishes.  The alternative to this is failing to add
4061         * an ordered extent if the igrab() fails there and that is a huge pain
4062         * to deal with, so instead just hold onto the inode throughout the
4063         * writepages operation.  If it fails here we are freeing up the inode
4064         * anyway and we'd rather not waste our time writing out stuff that is
4065         * going to be truncated anyway.
4066         */
4067        if (!igrab(inode))
4068                return 0;
4069
4070        pagevec_init(&pvec);
4071        if (wbc->range_cyclic) {
4072                index = mapping->writeback_index; /* Start from prev offset */
4073                end = -1;
4074        } else {
4075                index = wbc->range_start >> PAGE_SHIFT;
4076                end = wbc->range_end >> PAGE_SHIFT;
4077                if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
4078                        range_whole = 1;
4079                scanned = 1;
4080        }
4081
4082        /*
4083         * We do the tagged writepage as long as the snapshot flush bit is set
4084         * and we are the first one to do the filemap_flush() on this inode.
4085         *
4086         * The nr_to_write == LONG_MAX is needed to make sure other flushers do
4087         * not race in and drop the bit.
4088         */
4089        if (range_whole && wbc->nr_to_write == LONG_MAX &&
4090            test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
4091                               &BTRFS_I(inode)->runtime_flags))
4092                wbc->tagged_writepages = 1;
4093
4094        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
4095                tag = PAGECACHE_TAG_TOWRITE;
4096        else
4097                tag = PAGECACHE_TAG_DIRTY;
4098retry:
4099        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
4100                tag_pages_for_writeback(mapping, index, end);
4101        done_index = index;
4102        while (!done && !nr_to_write_done && (index <= end) &&
4103                        (nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
4104                                                &index, end, tag))) {
4105                unsigned i;
4106
4107                scanned = 1;
4108                for (i = 0; i < nr_pages; i++) {
4109                        struct page *page = pvec.pages[i];
4110
4111                        done_index = page->index;
4112                        /*
4113                         * At this point we hold neither the i_pages lock nor
4114                         * the page lock: the page may be truncated or
4115                         * invalidated (changing page->mapping to NULL),
4116                         * or even swizzled back from swapper_space to
4117                         * tmpfs file mapping
4118                         */
4119                        if (!trylock_page(page)) {
4120                                ret = flush_write_bio(epd);
4121                                BUG_ON(ret < 0);
4122                                lock_page(page);
4123                        }
4124
4125                        if (unlikely(page->mapping != mapping)) {
4126                                unlock_page(page);
4127                                continue;
4128                        }
4129
4130                        if (wbc->sync_mode != WB_SYNC_NONE) {
4131                                if (PageWriteback(page)) {
4132                                        ret = flush_write_bio(epd);
4133                                        BUG_ON(ret < 0);
4134                                }
4135                                wait_on_page_writeback(page);
4136                        }
4137
4138                        if (PageWriteback(page) ||
4139                            !clear_page_dirty_for_io(page)) {
4140                                unlock_page(page);
4141                                continue;
4142                        }
4143
4144                        ret = __extent_writepage(page, wbc, epd);
4145                        if (ret < 0) {
4146                                /*
4147                                 * done_index is set past this page,
4148                                 * so media errors will not choke
4149                                 * background writeout for the entire
4150                                 * file. This has consequences for
4151                                 * range_cyclic semantics (ie. it may
4152                                 * not be suitable for data integrity
4153                                 * writeout).
4154                                 */
4155                                done_index = page->index + 1;
4156                                done = 1;
4157                                break;
4158                        }
4159
4160                        /*
4161                         * the filesystem may choose to bump up nr_to_write.
4162                         * We have to make sure to honor the new nr_to_write
4163                         * at any time
4164                         */
4165                        nr_to_write_done = wbc->nr_to_write <= 0;
4166                }
4167                pagevec_release(&pvec);
4168                cond_resched();
4169        }
4170        if (!scanned && !done) {
4171                /*
4172                 * We hit the last page and there is more work to be done: wrap
4173                 * back to the start of the file
4174                 */
4175                scanned = 1;
4176                index = 0;
4177                goto retry;
4178        }
4179
4180        if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
4181                mapping->writeback_index = done_index;
4182
4183        btrfs_add_delayed_iput(inode);
4184        return ret;
4185}
4186
4187int extent_write_full_page(struct page *page, struct writeback_control *wbc)
4188{
4189        int ret;
4190        struct extent_page_data epd = {
4191                .bio = NULL,
4192                .tree = &BTRFS_I(page->mapping->host)->io_tree,
4193                .extent_locked = 0,
4194                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4195        };
4196
4197        ret = __extent_writepage(page, wbc, &epd);
4198        ASSERT(ret <= 0);
4199        if (ret < 0) {
4200                end_write_bio(&epd, ret);
4201                return ret;
4202        }
4203
4204        ret = flush_write_bio(&epd);
4205        ASSERT(ret <= 0);
4206        return ret;
4207}
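
/*
 * Example usage (a minimal sketch, not taken from the tree): a ->writepage
 * style caller hands in a locked, dirty page of a btrfs data inode together
 * with a writeback_control describing how much to write:
 *
 *        struct writeback_control wbc = {
 *                .sync_mode      = WB_SYNC_ALL,
 *                .nr_to_write    = 1,
 *        };
 *        int ret = extent_write_full_page(page, &wbc);
 *
 * On error the partially built bio has already been ended by end_write_bio(),
 * so the caller only has to propagate ret.
 */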
4208
4209int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
4210                              int mode)
4211{
4212        int ret = 0;
4213        struct address_space *mapping = inode->i_mapping;
4214        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
4215        struct page *page;
4216        unsigned long nr_pages = (end - start + PAGE_SIZE) >>
4217                PAGE_SHIFT;
4218
4219        struct extent_page_data epd = {
4220                .bio = NULL,
4221                .tree = tree,
4222                .extent_locked = 1,
4223                .sync_io = mode == WB_SYNC_ALL,
4224        };
4225        struct writeback_control wbc_writepages = {
4226                .sync_mode      = mode,
4227                .nr_to_write    = nr_pages * 2,
4228                .range_start    = start,
4229                .range_end      = end + 1,
4230        };
4231
4232        while (start <= end) {
4233                page = find_get_page(mapping, start >> PAGE_SHIFT);
4234                if (clear_page_dirty_for_io(page))
4235                        ret = __extent_writepage(page, &wbc_writepages, &epd);
4236                else {
4237                        btrfs_writepage_endio_finish_ordered(page, start,
4238                                                    start + PAGE_SIZE - 1, 1);
4239                        unlock_page(page);
4240                }
4241                put_page(page);
4242                start += PAGE_SIZE;
4243        }
4244
4245        ASSERT(ret <= 0);
4246        if (ret < 0) {
4247                end_write_bio(&epd, ret);
4248                return ret;
4249        }
4250        ret = flush_write_bio(&epd);
4251        return ret;
4252}
4253
4254int extent_writepages(struct address_space *mapping,
4255                      struct writeback_control *wbc)
4256{
4257        int ret = 0;
4258        struct extent_page_data epd = {
4259                .bio = NULL,
4260                .tree = &BTRFS_I(mapping->host)->io_tree,
4261                .extent_locked = 0,
4262                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
4263        };
4264
4265        ret = extent_write_cache_pages(mapping, wbc, &epd);
4266        ASSERT(ret <= 0);
4267        if (ret < 0) {
4268                end_write_bio(&epd, ret);
4269                return ret;
4270        }
4271        ret = flush_write_bio(&epd);
4272        return ret;
4273}
4274
4275int extent_readpages(struct address_space *mapping, struct list_head *pages,
4276                     unsigned nr_pages)
4277{
4278        struct bio *bio = NULL;
4279        unsigned long bio_flags = 0;
4280        struct page *pagepool[16];
4281        struct extent_map *em_cached = NULL;
4282        struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
4283        int nr = 0;
4284        u64 prev_em_start = (u64)-1;
4285
4286        while (!list_empty(pages)) {
4287                u64 contig_end = 0;
4288
4289                for (nr = 0; nr < ARRAY_SIZE(pagepool) && !list_empty(pages);) {
4290                        struct page *page = lru_to_page(pages);
4291
4292                        prefetchw(&page->flags);
4293                        list_del(&page->lru);
4294                        if (add_to_page_cache_lru(page, mapping, page->index,
4295                                                readahead_gfp_mask(mapping))) {
4296                                put_page(page);
4297                                break;
4298                        }
4299
4300                        pagepool[nr++] = page;
4301                        contig_end = page_offset(page) + PAGE_SIZE - 1;
4302                }
4303
4304                if (nr) {
4305                        u64 contig_start = page_offset(pagepool[0]);
4306
4307                        ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
4308
4309                        contiguous_readpages(tree, pagepool, nr, contig_start,
4310                                     contig_end, &em_cached, &bio, &bio_flags,
4311                                     &prev_em_start);
4312                }
4313        }
4314
4315        if (em_cached)
4316                free_extent_map(em_cached);
4317
4318        if (bio)
4319                return submit_one_bio(bio, 0, bio_flags);
4320        return 0;
4321}
4322
4323/*
4324 * Basic invalidatepage code: this waits on any locked or writeback
4325 * ranges corresponding to the page, and then deletes any extent state
4326 * records from the tree.
4327 */
4328int extent_invalidatepage(struct extent_io_tree *tree,
4329                          struct page *page, unsigned long offset)
4330{
4331        struct extent_state *cached_state = NULL;
4332        u64 start = page_offset(page);
4333        u64 end = start + PAGE_SIZE - 1;
4334        size_t blocksize = page->mapping->host->i_sb->s_blocksize;
4335
4336        start += ALIGN(offset, blocksize);
4337        if (start > end)
4338                return 0;
4339
4340        lock_extent_bits(tree, start, end, &cached_state);
4341        wait_on_page_writeback(page);
4342        clear_extent_bit(tree, start, end,
4343                         EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
4344                         EXTENT_DO_ACCOUNTING,
4345                         1, 1, &cached_state);
4346        return 0;
4347}
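
/*
 * Worked example (assuming PAGE_SIZE == 4096 and a 4K block size): for a page
 * at file offset 1M, invalidating from offset 0 clears extent state for
 * [1M, 1M + 4095], while invalidating from a partial offset such as 512
 * rounds the start up to the next block (ALIGN(512, 4096) == 4096), which is
 * past the end of the page, so the function returns 0 without touching the
 * tree.
 */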
4348
4349/*
4350 * A helper for releasepage: this tests for areas of the page that
4351 * are locked or under IO and drops the related state bits if it is safe
4352 * to drop the page.
4353 */
4354static int try_release_extent_state(struct extent_io_tree *tree,
4355                                    struct page *page, gfp_t mask)
4356{
4357        u64 start = page_offset(page);
4358        u64 end = start + PAGE_SIZE - 1;
4359        int ret = 1;
4360
4361        if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
4362                ret = 0;
4363        } else {
4364                /*
4365                 * at this point we can safely clear everything except the
4366                 * locked bit and the nodatasum bit
4367                 */
4368                ret = __clear_extent_bit(tree, start, end,
4369                                 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
4370                                 0, 0, NULL, mask, NULL);
4371
4372                /* If clear_extent_bit() failed due to ENOMEM, we can't
4373                 * allow the release to continue.
4374                 */
4375                if (ret < 0)
4376                        ret = 0;
4377                else
4378                        ret = 1;
4379        }
4380        return ret;
4381}
4382
4383/*
4384 * a helper for releasepage.  As long as there are no locked extents
4385 * in the range corresponding to the page, both state records and extent
4386 * map records are removed
4387 */
4388int try_release_extent_mapping(struct page *page, gfp_t mask)
4389{
4390        struct extent_map *em;
4391        u64 start = page_offset(page);
4392        u64 end = start + PAGE_SIZE - 1;
4393        struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
4394        struct extent_io_tree *tree = &btrfs_inode->io_tree;
4395        struct extent_map_tree *map = &btrfs_inode->extent_tree;
4396
4397        if (gfpflags_allow_blocking(mask) &&
4398            page->mapping->host->i_size > SZ_16M) {
4399                u64 len;
4400                while (start <= end) {
4401                        len = end - start + 1;
4402                        write_lock(&map->lock);
4403                        em = lookup_extent_mapping(map, start, len);
4404                        if (!em) {
4405                                write_unlock(&map->lock);
4406                                break;
4407                        }
4408                        if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
4409                            em->start != start) {
4410                                write_unlock(&map->lock);
4411                                free_extent_map(em);
4412                                break;
4413                        }
4414                        if (!test_range_bit(tree, em->start,
4415                                            extent_map_end(em) - 1,
4416                                            EXTENT_LOCKED, 0, NULL)) {
4417                                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4418                                        &btrfs_inode->runtime_flags);
4419                                remove_extent_mapping(map, em);
4420                                /* once for the rb tree */
4421                                free_extent_map(em);
4422                        }
4423                        start = extent_map_end(em);
4424                        write_unlock(&map->lock);
4425
4426                        /* once for us */
4427                        free_extent_map(em);
4428                }
4429        }
4430        return try_release_extent_state(tree, page, mask);
4431}
4432
4433/*
4434 * Helper function for fiemap, which doesn't want to see any holes.
4435 * This maps until we find something past 'last'.
4436 */
4437static struct extent_map *get_extent_skip_holes(struct inode *inode,
4438                                                u64 offset, u64 last)
4439{
4440        u64 sectorsize = btrfs_inode_sectorsize(inode);
4441        struct extent_map *em;
4442        u64 len;
4443
4444        if (offset >= last)
4445                return NULL;
4446
4447        while (1) {
4448                len = last - offset;
4449                if (len == 0)
4450                        break;
4451                len = ALIGN(len, sectorsize);
4452                em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
4453                if (IS_ERR_OR_NULL(em))
4454                        return em;
4455
4456                /* if this isn't a hole return it */
4457                if (em->block_start != EXTENT_MAP_HOLE)
4458                        return em;
4459
4460                /* this is a hole, advance to the next extent */
4461                offset = extent_map_end(em);
4462                free_extent_map(em);
4463                if (offset >= last)
4464                        break;
4465        }
4466        return NULL;
4467}
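
/*
 * Illustrative example (hypothetical file layout): with a hole over [0, 64K)
 * followed by a real extent at [64K, 128K), get_extent_skip_holes(inode, 0, 1M)
 * loops once over the hole mapping and returns the extent map starting at 64K.
 * If nothing but holes exists before @last, NULL is returned and the caller
 * knows there is nothing more to report.
 */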
4468
4469/*
4470 * Cache for the previous fiemap extent.
4471 *
4472 * Used for merging adjacent fiemap extents.
4473 */
4474struct fiemap_cache {
4475        u64 offset;
4476        u64 phys;
4477        u64 len;
4478        u32 flags;
4479        bool cached;
4480};
4481
4482/*
4483 * Helper to submit a fiemap extent.
4484 *
4485 * Will try to merge the current fiemap extent, specified by @offset, @phys,
4486 * @len and @flags, with the cached one.
4487 * Only when the merge fails is the cached extent submitted as a
4488 * fiemap extent.
4489 *
4490 * Return value is the same as fiemap_fill_next_extent().
4491 */
4492static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
4493                                struct fiemap_cache *cache,
4494                                u64 offset, u64 phys, u64 len, u32 flags)
4495{
4496        int ret = 0;
4497
4498        if (!cache->cached)
4499                goto assign;
4500
4501        /*
4502         * Sanity check: extent_fiemap() should have ensured that the new
4503         * fiemap extent won't overlap with the cached one.
4504         * Not recoverable.
4505         *
4506         * NOTE: Physical addresses can overlap, due to compression.
4507         */
4508        if (cache->offset + cache->len > offset) {
4509                WARN_ON(1);
4510                return -EINVAL;
4511        }
4512
4513        /*
4514         * Only merge fiemap extents if:
4515         * 1) Their logical addresses are contiguous
4516         *
4517         * 2) Their physical addresses are contiguous
4518         *    So truly compressed extents (physical size smaller than
4519         *    logical size) won't get merged with each other
4520         *
4521         * 3) They share the same flags except FIEMAP_EXTENT_LAST
4522         *    So a regular extent won't get merged with a prealloc extent
4523         */
4524        if (cache->offset + cache->len  == offset &&
4525            cache->phys + cache->len == phys  &&
4526            (cache->flags & ~FIEMAP_EXTENT_LAST) ==
4527                        (flags & ~FIEMAP_EXTENT_LAST)) {
4528                cache->len += len;
4529                cache->flags |= flags;
4530                goto try_submit_last;
4531        }
4532
4533        /* Not mergeable, need to submit cached one */
4534        ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
4535                                      cache->len, cache->flags);
4536        cache->cached = false;
4537        if (ret)
4538                return ret;
4539assign:
4540        cache->cached = true;
4541        cache->offset = offset;
4542        cache->phys = phys;
4543        cache->len = len;
4544        cache->flags = flags;
4545try_submit_last:
4546        if (cache->flags & FIEMAP_EXTENT_LAST) {
4547                ret = fiemap_fill_next_extent(fieinfo, cache->offset,
4548                                cache->phys, cache->len, cache->flags);
4549                cache->cached = false;
4550        }
4551        return ret;
4552}
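
/*
 * Worked example of the merge rules above (hypothetical extents): logical
 * [0, 4K) at physical 128M followed by logical [4K, 8K) at physical 128M + 4K
 * with identical flags only grows the cached entry to an 8K extent and
 * submits nothing yet.  If the second extent instead sat at physical 256M, or
 * carried FIEMAP_EXTENT_UNWRITTEN while the first did not, the cached 4K
 * entry would be submitted via fiemap_fill_next_extent() and the new extent
 * would replace it in the cache.
 */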
4553
4554/*
4555 * Emit last fiemap cache
4556 *
4557 * The last fiemap cache may still be cached in the following case:
4558 * 0                  4k                    8k
4559 * |<- Fiemap range ->|
4560 * |<------------  First extent ----------->|
4561 *
4562 * In this case, the first extent range will be cached but not emitted.
4563 * So we must emit it before ending extent_fiemap().
4564 */
4565static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
4566                                  struct fiemap_cache *cache)
4567{
4568        int ret;
4569
4570        if (!cache->cached)
4571                return 0;
4572
4573        ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
4574                                      cache->len, cache->flags);
4575        cache->cached = false;
4576        if (ret > 0)
4577                ret = 0;
4578        return ret;
4579}
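
/*
 * The two helpers above are meant to be used together.  A minimal sketch of
 * the calling pattern (extent_fiemap() below is the real user): a return of 1
 * from emit_fiemap_extent() means the fiemap buffer is full, a negative value
 * from either helper is an error.
 *
 *        struct fiemap_cache cache = { 0 };
 *        int ret = 0;
 *
 *        while (more extents to report in the requested range) {
 *                ret = emit_fiemap_extent(fieinfo, &cache, offset, phys,
 *                                         len, flags);
 *                if (ret)
 *                        break;
 *        }
 *        if (!ret)
 *                ret = emit_last_fiemap_cache(fieinfo, &cache);
 */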
4580
4581int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4582                __u64 start, __u64 len)
4583{
4584        int ret = 0;
4585        u64 off = start;
4586        u64 max = start + len;
4587        u32 flags = 0;
4588        u32 found_type;
4589        u64 last;
4590        u64 last_for_get_extent = 0;
4591        u64 disko = 0;
4592        u64 isize = i_size_read(inode);
4593        struct btrfs_key found_key;
4594        struct extent_map *em = NULL;
4595        struct extent_state *cached_state = NULL;
4596        struct btrfs_path *path;
4597        struct btrfs_root *root = BTRFS_I(inode)->root;
4598        struct fiemap_cache cache = { 0 };
4599        struct ulist *roots;
4600        struct ulist *tmp_ulist;
4601        int end = 0;
4602        u64 em_start = 0;
4603        u64 em_len = 0;
4604        u64 em_end = 0;
4605
4606        if (len == 0)
4607                return -EINVAL;
4608
4609        path = btrfs_alloc_path();
4610        if (!path)
4611                return -ENOMEM;
4612        path->leave_spinning = 1;
4613
4614        roots = ulist_alloc(GFP_KERNEL);
4615        tmp_ulist = ulist_alloc(GFP_KERNEL);
4616        if (!roots || !tmp_ulist) {
4617                ret = -ENOMEM;
4618                goto out_free_ulist;
4619        }
4620
4621        start = round_down(start, btrfs_inode_sectorsize(inode));
4622        len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
4623
4624        /*
4625         * lookup the last file extent.  We're not using i_size here
4626         * because there might be preallocation past i_size
4627         */
4628        ret = btrfs_lookup_file_extent(NULL, root, path,
4629                        btrfs_ino(BTRFS_I(inode)), -1, 0);
4630        if (ret < 0) {
4631                goto out_free_ulist;
4632        } else {
4633                WARN_ON(!ret);
4634                if (ret == 1)
4635                        ret = 0;
4636        }
4637
4638        path->slots[0]--;
4639        btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
4640        found_type = found_key.type;
4641
4642        /* No extents, but there might be delalloc bits */
4643        if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) ||
4644            found_type != BTRFS_EXTENT_DATA_KEY) {
4645                /* have to trust i_size as the end */
4646                last = (u64)-1;
4647                last_for_get_extent = isize;
4648        } else {
4649                /*
4650                 * Remember the start of the last extent.  There are a bunch
4651                 * of different factors that go into the length of the extent,
4652                 * so it's much less complex to remember where it started.
4653                 */
4654                last = found_key.offset;
4655                last_for_get_extent = last + 1;
4656        }
4657        btrfs_release_path(path);
4658
4659        /*
4660         * we might have some extents allocated but more delalloc past those
4661         * extents.  so, we trust isize unless the start of the last extent is
4662         * beyond isize
4663         */
4664        if (last < isize) {
4665                last = (u64)-1;
4666                last_for_get_extent = isize;
4667        }
4668
4669        lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
4670                         &cached_state);
4671
4672        em = get_extent_skip_holes(inode, start, last_for_get_extent);
4673        if (!em)
4674                goto out;
4675        if (IS_ERR(em)) {
4676                ret = PTR_ERR(em);
4677                goto out;
4678        }
4679
4680        while (!end) {
4681                u64 offset_in_extent = 0;
4682
4683                /* break if the extent we found is outside the range */
4684                if (em->start >= max || extent_map_end(em) < off)
4685                        break;
4686
4687                /*
4688                 * get_extent may return an extent that starts before our
4689                 * requested range.  We have to make sure the ranges
4690                 * we return to fiemap always move forward and don't
4691                 * overlap, so adjust the offsets here
4692                 */
4693                em_start = max(em->start, off);
4694
4695                /*
4696                 * Record the offset from the start of the extent
4697                 * for adjusting the disk offset below.  Only do this if the
4698                 * extent isn't compressed, since our in-memory offset may be
4699                 * past what we have actually allocated on disk.
4700                 */
4701                if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4702                        offset_in_extent = em_start - em->start;
4703                em_end = extent_map_end(em);
4704                em_len = em_end - em_start;
4705                flags = 0;
4706                if (em->block_start < EXTENT_MAP_LAST_BYTE)
4707                        disko = em->block_start + offset_in_extent;
4708                else
4709                        disko = 0;
4710
4711                /*
4712                 * bump off for our next call to get_extent
4713                 */
4714                off = extent_map_end(em);
4715                if (off >= max)
4716                        end = 1;
4717
4718                if (em->block_start == EXTENT_MAP_LAST_BYTE) {
4719                        end = 1;
4720                        flags |= FIEMAP_EXTENT_LAST;
4721                } else if (em->block_start == EXTENT_MAP_INLINE) {
4722                        flags |= (FIEMAP_EXTENT_DATA_INLINE |
4723                                  FIEMAP_EXTENT_NOT_ALIGNED);
4724                } else if (em->block_start == EXTENT_MAP_DELALLOC) {
4725                        flags |= (FIEMAP_EXTENT_DELALLOC |
4726                                  FIEMAP_EXTENT_UNKNOWN);
4727                } else if (fieinfo->fi_extents_max) {
4728                        u64 bytenr = em->block_start -
4729                                (em->start - em->orig_start);
4730
4731                        /*
4732                         * As btrfs supports shared space, this information
4733                         * can be exported to userspace tools via the
4734                         * FIEMAP_EXTENT_SHARED flag.  If fi_extents_max == 0
4735                         * then we're just getting a count and we can skip the
4736                         * lookup stuff.
4737                         */
4738                        ret = btrfs_check_shared(root,
4739                                                 btrfs_ino(BTRFS_I(inode)),
4740                                                 bytenr, roots, tmp_ulist);
4741                        if (ret < 0)
4742                                goto out_free;
4743                        if (ret)
4744                                flags |= FIEMAP_EXTENT_SHARED;
4745                        ret = 0;
4746                }
4747                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4748                        flags |= FIEMAP_EXTENT_ENCODED;
4749                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4750                        flags |= FIEMAP_EXTENT_UNWRITTEN;
4751
4752                free_extent_map(em);
4753                em = NULL;
4754                if ((em_start >= last) || em_len == (u64)-1 ||
4755                   (last == (u64)-1 && isize <= em_end)) {
4756                        flags |= FIEMAP_EXTENT_LAST;
4757                        end = 1;
4758                }
4759
4760                /* now scan forward to see if this is really the last extent. */
4761                em = get_extent_skip_holes(inode, off, last_for_get_extent);
4762                if (IS_ERR(em)) {
4763                        ret = PTR_ERR(em);
4764                        goto out;
4765                }
4766                if (!em) {
4767                        flags |= FIEMAP_EXTENT_LAST;
4768                        end = 1;
4769                }
4770                ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
4771                                           em_len, flags);
4772                if (ret) {
4773                        if (ret == 1)
4774                                ret = 0;
4775                        goto out_free;
4776                }
4777        }
4778out_free:
4779        if (!ret)
4780                ret = emit_last_fiemap_cache(fieinfo, &cache);
4781        free_extent_map(em);
4782out:
4783        unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
4784                             &cached_state);
4785
4786out_free_ulist:
4787        btrfs_free_path(path);
4788        ulist_free(roots);
4789        ulist_free(tmp_ulist);
4790        return ret;
4791}
4792
4793static void __free_extent_buffer(struct extent_buffer *eb)
4794{
4795        btrfs_leak_debug_del(&eb->leak_list);
4796        kmem_cache_free(extent_buffer_cache, eb);
4797}
4798
4799int extent_buffer_under_io(struct extent_buffer *eb)
4800{
4801        return (atomic_read(&eb->io_pages) ||
4802                test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
4803                test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4804}
4805
4806/*
4807 * Release all pages attached to the extent buffer.
4808 */
4809static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
4810{
4811        int i;
4812        int num_pages;
4813        int mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
4814
4815        BUG_ON(extent_buffer_under_io(eb));
4816
4817        num_pages = num_extent_pages(eb);
4818        for (i = 0; i < num_pages; i++) {
4819                struct page *page = eb->pages[i];
4820
4821                if (!page)
4822                        continue;
4823                if (mapped)
4824                        spin_lock(&page->mapping->private_lock);
4825                /*
4826                 * We do this since we'll remove the pages after we've
4827                 * removed the eb from the radix tree, so we could race
4828                 * and have this page now attached to the new eb.  So
4829                 * only clear page_private if it's still connected to
4830                 * this eb.
4831                 */
4832                if (PagePrivate(page) &&
4833                    page->private == (unsigned long)eb) {
4834                        BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4835                        BUG_ON(PageDirty(page));
4836                        BUG_ON(PageWriteback(page));
4837                        /*
4838                         * We need to make sure we haven't been attached
4839                         * to a new eb.
4840                         */
4841                        ClearPagePrivate(page);
4842                        set_page_private(page, 0);
4843                        /* One for the page private */
4844                        put_page(page);
4845                }
4846
4847                if (mapped)
4848                        spin_unlock(&page->mapping->private_lock);
4849
4850                /* One for when we allocated the page */
4851                put_page(page);
4852        }
4853}
4854
4855/*
4856 * Helper for releasing the extent buffer.
4857 */
4858static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4859{
4860        btrfs_release_extent_buffer_pages(eb);
4861        __free_extent_buffer(eb);
4862}
4863
4864static struct extent_buffer *
4865__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
4866                      unsigned long len)
4867{
4868        struct extent_buffer *eb = NULL;
4869
4870        eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
4871        eb->start = start;
4872        eb->len = len;
4873        eb->fs_info = fs_info;
4874        eb->bflags = 0;
4875        rwlock_init(&eb->lock);
4876        atomic_set(&eb->blocking_readers, 0);
4877        eb->blocking_writers = 0;
4878        eb->lock_nested = false;
4879        init_waitqueue_head(&eb->write_lock_wq);
4880        init_waitqueue_head(&eb->read_lock_wq);
4881
4882        btrfs_leak_debug_add(&eb->leak_list, &buffers);
4883
4884        spin_lock_init(&eb->refs_lock);
4885        atomic_set(&eb->refs, 1);
4886        atomic_set(&eb->io_pages, 0);
4887
4888        /*
4889         * Sanity checks: currently the maximum is 64k, covered by 16x 4k pages
4890         */
4891        BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
4892                > MAX_INLINE_EXTENT_BUFFER_SIZE);
4893        BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
4894
4895#ifdef CONFIG_BTRFS_DEBUG
4896        eb->spinning_writers = 0;
4897        atomic_set(&eb->spinning_readers, 0);
4898        atomic_set(&eb->read_locks, 0);
4899        eb->write_locks = 0;
4900#endif
4901
4902        return eb;
4903}
4904
4905struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
4906{
4907        int i;
4908        struct page *p;
4909        struct extent_buffer *new;
4910        int num_pages = num_extent_pages(src);
4911
4912        new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
4913        if (new == NULL)
4914                return NULL;
4915
4916        for (i = 0; i < num_pages; i++) {
4917                p = alloc_page(GFP_NOFS);
4918                if (!p) {
4919                        btrfs_release_extent_buffer(new);
4920                        return NULL;
4921                }
4922                attach_extent_buffer_page(new, p);
4923                WARN_ON(PageDirty(p));
4924                SetPageUptodate(p);
4925                new->pages[i] = p;
4926                copy_page(page_address(p), page_address(src->pages[i]));
4927        }
4928
4929        set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
4930        set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
4931
4932        return new;
4933}
4934
4935struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
4936                                                  u64 start, unsigned long len)
4937{
4938        struct extent_buffer *eb;
4939        int num_pages;
4940        int i;
4941
4942        eb = __alloc_extent_buffer(fs_info, start, len);
4943        if (!eb)
4944                return NULL;
4945
4946        num_pages = num_extent_pages(eb);
4947        for (i = 0; i < num_pages; i++) {
4948                eb->pages[i] = alloc_page(GFP_NOFS);
4949                if (!eb->pages[i])
4950                        goto err;
4951        }
4952        set_extent_buffer_uptodate(eb);
4953        btrfs_set_header_nritems(eb, 0);
4954        set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
4955
4956        return eb;
4957err:
4958        for (; i > 0; i--)
4959                __free_page(eb->pages[i - 1]);
4960        __free_extent_buffer(eb);
4961        return NULL;
4962}
4963
4964struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
4965                                                u64 start)
4966{
4967        return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
4968}
4969
4970static void check_buffer_tree_ref(struct extent_buffer *eb)
4971{
4972        int refs;
4973        /* The ref bit is tricky.  We have to make sure it is set if we
4974         * have the buffer dirty.  Otherwise the code to free a buffer
4975         * can end up dropping a dirty page.
4976         *
4977         * Once the ref bit is set, it won't go away while the
4978         * buffer is dirty or in writeback, and it also won't
4979         * go away while we have the reference count on the
4980         * eb bumped.
4981         *
4982         * We can't just set the ref bit without bumping the
4983         * ref on the eb because free_extent_buffer might
4984         * see the ref bit and try to clear it.  If this happens
4985         * free_extent_buffer might end up dropping our original
4986         * ref by mistake and freeing the page before we are able
4987         * to add one more ref.
4988         *
4989         * So, under refs_lock, set the TREE_REF bit and take the extra
4990         * reference only if the bit was not already set; the release
4991         * paths clear the bit and drop that reference under the same lock.
4992         */
4993        refs = atomic_read(&eb->refs);
4994        if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4995                return;
4996
4997        spin_lock(&eb->refs_lock);
4998        if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4999                atomic_inc(&eb->refs);
5000        spin_unlock(&eb->refs_lock);
5001}
5002
5003static void mark_extent_buffer_accessed(struct extent_buffer *eb,
5004                struct page *accessed)
5005{
5006        int num_pages, i;
5007
5008        check_buffer_tree_ref(eb);
5009
5010        num_pages = num_extent_pages(eb);
5011        for (i = 0; i < num_pages; i++) {
5012                struct page *p = eb->pages[i];
5013
5014                if (p != accessed)
5015                        mark_page_accessed(p);
5016        }
5017}
5018
5019struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
5020                                         u64 start)
5021{
5022        struct extent_buffer *eb;
5023
5024        rcu_read_lock();
5025        eb = radix_tree_lookup(&fs_info->buffer_radix,
5026                               start >> PAGE_SHIFT);
5027        if (eb && atomic_inc_not_zero(&eb->refs)) {
5028                rcu_read_unlock();
5029                /*
5030                 * Lock our eb's refs_lock to avoid races with
5031                 * free_extent_buffer.  When we got our eb it might have been
5032                 * flagged with EXTENT_BUFFER_STALE, and another task running
5033                 * free_extent_buffer might have seen that flag set, seen
5034                 * eb->refs == 2, seen that the buffer isn't under IO (dirty
5035                 * and writeback flags not set) and seen that it's still in
5036                 * the tree (flag EXTENT_BUFFER_TREE_REF set), and therefore
5037                 * be in the process of dropping the buffer's refcount twice.
5038                 * So here we could race and increment the eb's reference
5039                 * count, clear its stale flag, mark it as dirty and drop our
5040                 * reference before the other task finishes executing
5041                 * free_extent_buffer, which would later result in an attempt
5042                 * to free an extent buffer that is dirty.
5043                 */
5044                if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
5045                        spin_lock(&eb->refs_lock);
5046                        spin_unlock(&eb->refs_lock);
5047                }
5048                mark_extent_buffer_accessed(eb, NULL);
5049                return eb;
5050        }
5051        rcu_read_unlock();
5052
5053        return NULL;
5054}
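
/*
 * Example usage (a minimal sketch): the returned buffer carries an extra
 * reference that the caller must drop, and NULL simply means no buffer for
 * @start is currently cached:
 *
 *        struct extent_buffer *eb;
 *
 *        eb = find_extent_buffer(fs_info, start);
 *        if (eb) {
 *                ... read or lock the buffer as needed ...
 *                free_extent_buffer(eb);
 *        }
 */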
5055
5056#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5057struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
5058                                        u64 start)
5059{
5060        struct extent_buffer *eb, *exists = NULL;
5061        int ret;
5062
5063        eb = find_extent_buffer(fs_info, start);
5064        if (eb)
5065                return eb;
5066        eb = alloc_dummy_extent_buffer(fs_info, start);
5067        if (!eb)
5068                return NULL;
5069        eb->fs_info = fs_info;
5070again:
5071        ret = radix_tree_preload(GFP_NOFS);
5072        if (ret)
5073                goto free_eb;
5074        spin_lock(&fs_info->buffer_lock);
5075        ret = radix_tree_insert(&fs_info->buffer_radix,
5076                                start >> PAGE_SHIFT, eb);
5077        spin_unlock(&fs_info->buffer_lock);
5078        radix_tree_preload_end();
5079        if (ret == -EEXIST) {
5080                exists = find_extent_buffer(fs_info, start);
5081                if (exists)
5082                        goto free_eb;
5083                else
5084                        goto again;
5085        }
5086        check_buffer_tree_ref(eb);
5087        set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
5088
5089        return eb;
5090free_eb:
5091        btrfs_release_extent_buffer(eb);
5092        return exists;
5093}
5094#endif
5095
5096struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
5097                                          u64 start)
5098{
5099        unsigned long len = fs_info->nodesize;
5100        int num_pages;
5101        int i;
5102        unsigned long index = start >> PAGE_SHIFT;
5103        struct extent_buffer *eb;
5104        struct extent_buffer *exists = NULL;
5105        struct page *p;
5106        struct address_space *mapping = fs_info->btree_inode->i_mapping;
5107        int uptodate = 1;
5108        int ret;
5109
5110        if (!IS_ALIGNED(start, fs_info->sectorsize)) {
5111                btrfs_err(fs_info, "bad tree block start %llu", start);
5112                return ERR_PTR(-EINVAL);
5113        }
5114
5115        eb = find_extent_buffer(fs_info, start);
5116        if (eb)
5117                return eb;
5118
5119        eb = __alloc_extent_buffer(fs_info, start, len);
5120        if (!eb)
5121                return ERR_PTR(-ENOMEM);
5122
5123        num_pages = num_extent_pages(eb);
5124        for (i = 0; i < num_pages; i++, index++) {
5125                p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
5126                if (!p) {
5127                        exists = ERR_PTR(-ENOMEM);
5128                        goto free_eb;
5129                }
5130
5131                spin_lock(&mapping->private_lock);
5132                if (PagePrivate(p)) {
5133                        /*
5134                         * We could have already allocated an eb for this page
5135                         * and attached it, so let's see if we can get a ref on
5136                         * the existing eb.  If we can, we know it's good and
5137                         * we can just return that one; otherwise we can safely
5138                         * overwrite page->private.
5139                         */
5140                        exists = (struct extent_buffer *)p->private;
5141                        if (atomic_inc_not_zero(&exists->refs)) {
5142                                spin_unlock(&mapping->private_lock);
5143                                unlock_page(p);
5144                                put_page(p);
5145                                mark_extent_buffer_accessed(exists, p);
5146                                goto free_eb;
5147                        }
5148                        exists = NULL;
5149
5150                        /*
5151                         * Do this so attach doesn't complain, and drop the
5152                         * page reference the old eb held via page->private.
5153                         */
5154                        ClearPagePrivate(p);
5155                        WARN_ON(PageDirty(p));
5156                        put_page(p);
5157                }
5158                attach_extent_buffer_page(eb, p);
5159                spin_unlock(&mapping->private_lock);
5160                WARN_ON(PageDirty(p));
5161                eb->pages[i] = p;
5162                if (!PageUptodate(p))
5163                        uptodate = 0;
5164
5165                /*
5166                 * We can't unlock the pages just yet since the extent buffer
5167                 * hasn't been properly inserted in the radix tree; that
5168                 * opens a race with btree_releasepage, which can free a page
5169                 * while we are still filling in all pages for the buffer and
5170                 * we could crash.
5171                 */
5172        }
5173        if (uptodate)
5174                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5175again:
5176        ret = radix_tree_preload(GFP_NOFS);
5177        if (ret) {
5178                exists = ERR_PTR(ret);
5179                goto free_eb;
5180        }
5181
5182        spin_lock(&fs_info->buffer_lock);
5183        ret = radix_tree_insert(&fs_info->buffer_radix,
5184                                start >> PAGE_SHIFT, eb);
5185        spin_unlock(&fs_info->buffer_lock);
5186        radix_tree_preload_end();
5187        if (ret == -EEXIST) {
5188                exists = find_extent_buffer(fs_info, start);
5189                if (exists)
5190                        goto free_eb;
5191                else
5192                        goto again;
5193        }
5194        /* add one reference for the tree */
5195        check_buffer_tree_ref(eb);
5196        set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
5197
5198        /*
5199         * Now it's safe to unlock the pages because any calls to
5200         * btree_releasepage will correctly detect that a page belongs to a
5201         * live buffer and won't free it prematurely.
5202         */
5203        for (i = 0; i < num_pages; i++)
5204                unlock_page(eb->pages[i]);
5205        return eb;
5206
5207free_eb:
5208        WARN_ON(!atomic_dec_and_test(&eb->refs));
5209        for (i = 0; i < num_pages; i++) {
5210                if (eb->pages[i])
5211                        unlock_page(eb->pages[i]);
5212        }
5213
5214        btrfs_release_extent_buffer(eb);
5215        return exists;
5216}
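
/*
 * Example usage (a minimal sketch): unlike find_extent_buffer(), failures are
 * reported as an ERR_PTR() rather than NULL, so callers check with IS_ERR():
 *
 *        struct extent_buffer *eb;
 *
 *        eb = alloc_extent_buffer(fs_info, start);
 *        if (IS_ERR(eb))
 *                return PTR_ERR(eb);
 *        ...
 *        free_extent_buffer(eb);
 */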
5217
5218static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
5219{
5220        struct extent_buffer *eb =
5221                        container_of(head, struct extent_buffer, rcu_head);
5222
5223        __free_extent_buffer(eb);
5224}
5225
5226static int release_extent_buffer(struct extent_buffer *eb)
5227{
5228        lockdep_assert_held(&eb->refs_lock);
5229
5230        WARN_ON(atomic_read(&eb->refs) == 0);
5231        if (atomic_dec_and_test(&eb->refs)) {
5232                if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
5233                        struct btrfs_fs_info *fs_info = eb->fs_info;
5234
5235                        spin_unlock(&eb->refs_lock);
5236
5237                        spin_lock(&fs_info->buffer_lock);
5238                        radix_tree_delete(&fs_info->buffer_radix,
5239                                          eb->start >> PAGE_SHIFT);
5240                        spin_unlock(&fs_info->buffer_lock);
5241                } else {
5242                        spin_unlock(&eb->refs_lock);
5243                }
5244
5245                /* Should be safe to release our pages at this point */
5246                btrfs_release_extent_buffer_pages(eb);
5247#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
5248                if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
5249                        __free_extent_buffer(eb);
5250                        return 1;
5251                }
5252#endif
5253                call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
5254                return 1;
5255        }
5256        spin_unlock(&eb->refs_lock);
5257
5258        return 0;
5259}
5260
5261void free_extent_buffer(struct extent_buffer *eb)
5262{
5263        int refs;
5264        int old;
5265        if (!eb)
5266                return;
5267
5268        while (1) {
5269                refs = atomic_read(&eb->refs);
5270                if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
5271                    || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
5272                        refs == 1))
5273                        break;
5274                old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
5275                if (old == refs)
5276                        return;
5277        }
5278
5279        spin_lock(&eb->refs_lock);
5280        if (atomic_read(&eb->refs) == 2 &&
5281            test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
5282            !extent_buffer_under_io(eb) &&
5283            test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5284                atomic_dec(&eb->refs);
5285
5286        /*
5287         * I know this is terrible, but it's temporary until we stop tracking
5288         * the uptodate bits and such for the extent buffers.
5289         */
5290        release_extent_buffer(eb);
5291}
5292
5293void free_extent_buffer_stale(struct extent_buffer *eb)
5294{
5295        if (!eb)
5296                return;
5297
5298        spin_lock(&eb->refs_lock);
5299        set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
5300
5301        if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
5302            test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5303                atomic_dec(&eb->refs);
5304        release_extent_buffer(eb);
5305}
5306
5307void clear_extent_buffer_dirty(struct extent_buffer *eb)
5308{
5309        int i;
5310        int num_pages;
5311        struct page *page;
5312
5313        num_pages = num_extent_pages(eb);
5314
5315        for (i = 0; i < num_pages; i++) {
5316                page = eb->pages[i];
5317                if (!PageDirty(page))
5318                        continue;
5319
5320                lock_page(page);
5321                WARN_ON(!PagePrivate(page));
5322
5323                clear_page_dirty_for_io(page);
5324                xa_lock_irq(&page->mapping->i_pages);
5325                if (!PageDirty(page))
5326                        __xa_clear_mark(&page->mapping->i_pages,
5327                                        page_index(page), PAGECACHE_TAG_DIRTY);
5328                xa_unlock_irq(&page->mapping->i_pages);
5329                ClearPageError(page);
5330                unlock_page(page);
5331        }
5332        WARN_ON(atomic_read(&eb->refs) == 0);
5333}
5334
5335bool set_extent_buffer_dirty(struct extent_buffer *eb)
5336{
5337        int i;
5338        int num_pages;
5339        bool was_dirty;
5340
5341        check_buffer_tree_ref(eb);
5342
5343        was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
5344
5345        num_pages = num_extent_pages(eb);
5346        WARN_ON(atomic_read(&eb->refs) == 0);
5347        WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
5348
5349        if (!was_dirty)
5350                for (i = 0; i < num_pages; i++)
5351                        set_page_dirty(eb->pages[i]);
5352
5353#ifdef CONFIG_BTRFS_DEBUG
5354        for (i = 0; i < num_pages; i++)
5355                ASSERT(PageDirty(eb->pages[i]));
5356#endif
5357
5358        return was_dirty;
5359}
5360
5361void clear_extent_buffer_uptodate(struct extent_buffer *eb)
5362{
5363        int i;
5364        struct page *page;
5365        int num_pages;
5366
5367        clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5368        num_pages = num_extent_pages(eb);
5369        for (i = 0; i < num_pages; i++) {
5370                page = eb->pages[i];
5371                if (page)
5372                        ClearPageUptodate(page);
5373        }
5374}
5375
5376void set_extent_buffer_uptodate(struct extent_buffer *eb)
5377{
5378        int i;
5379        struct page *page;
5380        int num_pages;
5381
5382        set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5383        num_pages = num_extent_pages(eb);
5384        for (i = 0; i < num_pages; i++) {
5385                page = eb->pages[i];
5386                SetPageUptodate(page);
5387        }
5388}
5389
5390int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
5391{
5392        int i;
5393        struct page *page;
5394        int err;
5395        int ret = 0;
5396        int locked_pages = 0;
5397        int all_uptodate = 1;
5398        int num_pages;
5399        unsigned long num_reads = 0;
5400        struct bio *bio = NULL;
5401        unsigned long bio_flags = 0;
5402        struct extent_io_tree *tree = &BTRFS_I(eb->fs_info->btree_inode)->io_tree;
5403
5404        if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
5405                return 0;
5406
5407        num_pages = num_extent_pages(eb);
5408        for (i = 0; i < num_pages; i++) {
5409                page = eb->pages[i];
5410                if (wait == WAIT_NONE) {
5411                        if (!trylock_page(page))
5412                                goto unlock_exit;
5413                } else {
5414                        lock_page(page);
5415                }
5416                locked_pages++;
5417        }
5418        /*
5419         * We need to lock all pages first to make sure that
5420         * the uptodate bit of our pages won't be affected by
5421         * clear_extent_buffer_uptodate().
5422         */
5423        for (i = 0; i < num_pages; i++) {
5424                page = eb->pages[i];
5425                if (!PageUptodate(page)) {
5426                        num_reads++;
5427                        all_uptodate = 0;
5428                }
5429        }
5430
5431        if (all_uptodate) {
5432                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
5433                goto unlock_exit;
5434        }
5435
5436        clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
5437        eb->read_mirror = 0;
5438        atomic_set(&eb->io_pages, num_reads);
5439        for (i = 0; i < num_pages; i++) {
5440                page = eb->pages[i];
5441
5442                if (!PageUptodate(page)) {
5443                        if (ret) {
5444                                atomic_dec(&eb->io_pages);
5445                                unlock_page(page);
5446                                continue;
5447                        }
5448
5449                        ClearPageError(page);
5450                        err = __extent_read_full_page(tree, page,
5451                                                      btree_get_extent, &bio,
5452                                                      mirror_num, &bio_flags,
5453                                                      REQ_META);
5454                        if (err) {
5455                                ret = err;
5456                                /*
5457                                 * We passed &bio to __extent_read_full_page
5458                                 * above, so if it returns an error the
5459                                 * current page failed to be added to the bio
5460                                 * and has already been unlocked.
5461                                 *
5462                                 * We must decrement io_pages ourselves.
5463                                 */
5464                                atomic_dec(&eb->io_pages);
5465                        }
5466                } else {
5467                        unlock_page(page);
5468                }
5469        }
5470
5471        if (bio) {
5472                err = submit_one_bio(bio, mirror_num, bio_flags);
5473                if (err)
5474                        return err;
5475        }
5476
5477        if (ret || wait != WAIT_COMPLETE)
5478                return ret;
5479
5480        for (i = 0; i < num_pages; i++) {
5481                page = eb->pages[i];
5482                wait_on_page_locked(page);
5483                if (!PageUptodate(page))
5484                        ret = -EIO;
5485        }
5486
5487        return ret;
5488
5489unlock_exit:
5490        while (locked_pages > 0) {
5491                locked_pages--;
5492                page = eb->pages[locked_pages];
5493                unlock_page(page);
5494        }
5495        return ret;
5496}
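
/*
 * Example usage (a minimal sketch; in-tree metadata readers typically also
 * verify the block contents afterwards):
 *
 *        ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, 0);
 *        if (ret < 0)
 *                return ret;
 *
 * With WAIT_NONE the reads are merely started and the caller must not assume
 * the buffer is uptodate when the call returns.
 */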
5497
5498void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
5499                        unsigned long start, unsigned long len)
5500{
5501        size_t cur;
5502        size_t offset;
5503        struct page *page;
5504        char *kaddr;
5505        char *dst = (char *)dstv;
5506        size_t start_offset = offset_in_page(eb->start);
5507        unsigned long i = (start_offset + start) >> PAGE_SHIFT;
5508
5509        if (start + len > eb->len) {
5510                WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
5511                     eb->start, eb->len, start, len);
5512                memset(dst, 0, len);
5513                return;
5514        }
5515
5516        offset = offset_in_page(start_offset + start);
5517
5518        while (len > 0) {
5519                page = eb->pages[i];
5520
5521                cur = min(len, (PAGE_SIZE - offset));
5522                kaddr = page_address(page);
5523                memcpy(dst, kaddr + offset, cur);
5524
5525                dst += cur;
5526                len -= cur;
5527                offset = 0;
5528                i++;
5529        }
5530}
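
/*
 * Example usage (a minimal sketch): copying the fsid out of the tree block
 * header stored at the start of an extent buffer:
 *
 *        u8 fsid[BTRFS_FSID_SIZE];
 *
 *        read_extent_buffer(eb, fsid,
 *                           offsetof(struct btrfs_header, fsid),
 *                           BTRFS_FSID_SIZE);
 *
 * The helper hides the fact that the requested range may straddle a page
 * boundary between the pages backing @eb.
 */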
5531
5532int read_extent_buffer_to_user(const struct extent_buffer *eb,
5533                               void __user *dstv,
5534                               unsigned long start, unsigned long len)
5535{
5536        size_t cur;
5537        size_t offset;
5538        struct page *page;
5539        char *kaddr;
5540        char __user *dst = (char __user *)dstv;
5541        size_t start_offset = offset_in_page(eb->start);
5542        unsigned long i = (start_offset + start) >> PAGE_SHIFT;
5543        int ret = 0;
5544
5545        WARN_ON(start > eb->len);
5546        WARN_ON(start + len > eb->start + eb->len);
5547
5548        offset = offset_in_page(start_offset + start);
5549
5550        while (len > 0) {
5551                page = eb->pages[i];
5552
5553                cur = min(len, (PAGE_SIZE - offset));
5554                kaddr = page_address(page);
5555                if (copy_to_user(dst, kaddr + offset, cur)) {
5556                        ret = -EFAULT;
5557                        break;
5558                }
5559
5560                dst += cur;
5561                len -= cur;
5562                offset = 0;
5563                i++;
5564        }
5565
5566        return ret;
5567}
5568
5569/*
5570 * Return 0 if the item is found within a page.
5571 * Return 1 if the item spans two pages.
5572 * Return -EINVAL otherwise.
5573 */
5574int map_private_extent_buffer(const struct extent_buffer *eb,
5575                              unsigned long start, unsigned long min_len,
5576                              char **map, unsigned long *map_start,
5577                              unsigned long *map_len)
5578{
5579        size_t offset;
5580        char *kaddr;
5581        struct page *p;
5582        size_t start_offset = offset_in_page(eb->start);
5583        unsigned long i = (start_offset + start) >> PAGE_SHIFT;
5584        unsigned long end_i = (start_offset + start + min_len - 1) >>
5585                PAGE_SHIFT;
5586
5587        if (start + min_len > eb->len) {
5588                WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
5589                       eb->start, eb->len, start, min_len);
5590                return -EINVAL;
5591        }
5592
5593        if (i != end_i)
5594                return 1;
5595
5596        if (i == 0) {
5597                offset = start_offset;
5598                *map_start = 0;
5599        } else {
5600                offset = 0;
5601                *map_start = ((u64)i << PAGE_SHIFT) - start_offset;
5602        }
5603
5604        p = eb->pages[i];
5605        kaddr = page_address(p);
5606        *map = kaddr + offset;
5607        *map_len = PAGE_SIZE - offset;
5608        return 0;
5609}
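
/*
 * Example usage (a minimal sketch, modelled on how low-level item accessors
 * use it; eb, offset, size, dst, kaddr and the map_* locals belong to the
 * caller): try to map the range directly and fall back to a copy when it
 * spans two pages:
 *
 *        err = map_private_extent_buffer(eb, offset, size, &kaddr,
 *                                        &map_start, &map_len);
 *        if (!err)
 *                memcpy(dst, kaddr + offset - map_start, size);
 *        else if (err == 1)
 *                read_extent_buffer(eb, dst, offset, size);
 *        else
 *                return err;
 */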
5610
5611int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
5612                         unsigned long start, unsigned long len)
5613{
5614        size_t cur;
5615        size_t offset;
5616        struct page *page;
5617        char *kaddr;
5618        char *ptr = (char *)ptrv;
5619        size_t start_offset = offset_in_page(eb->start);
5620        unsigned long i = (start_offset + start) >> PAGE_SHIFT;
5621        int ret = 0;
5622
5623        WARN_ON(start > eb->len);
5624        WARN_ON(start + len > eb->start + eb->len);
5625
5626        offset = offset_in_page(start_offset + start);
5627
5628        while (len > 0) {
5629                page = eb->pages[i];
5630
5631                cur = min(len, (PAGE_SIZE - offset));
5632
5633                kaddr = page_address(page);
5634                ret = memcmp(ptr, kaddr + offset, cur);
5635                if (ret)
5636                        break;
5637
5638                ptr += cur;
5639                len -= cur;
5640                offset = 0;
5641                i++;
5642        }
5643        return ret;
5644}
5645
5646void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
5647                const void *srcv)
5648{
5649        char *kaddr;
5650
5651        WARN_ON(!PageUptodate(eb->pages[0]));
5652        kaddr = page_address(eb->pages[0]);
5653        memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
5654                        BTRFS_FSID_SIZE);
5655}
5656
5657void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv)
5658{
5659        char *kaddr;
5660
5661        WARN_ON(!PageUptodate(eb->pages[0]));
5662        kaddr = page_address(eb->pages[0]);
5663        memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
5664                        BTRFS_FSID_SIZE);
5665}
5666
5667void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
5668                         unsigned long start, unsigned long len)
5669{
5670        size_t cur;
5671        size_t offset;
5672        struct page *page;
5673        char *kaddr;
5674        char *src = (char *)srcv;
5675        size_t start_offset = offset_in_page(eb->start);
5676        unsigned long i = (start_offset + start) >> PAGE_SHIFT;
5677
5678        WARN_ON(start > eb->len);
5679        WARN_ON(start + len > eb->len);
5680
5681        offset = offset_in_page(start_offset + start);
5682
5683        while (len > 0) {
5684                page = eb->pages[i];
5685                WARN_ON(!PageUptodate(page));
5686
5687                cur = min(len, PAGE_SIZE - offset);
5688                kaddr = page_address(page);
5689                memcpy(kaddr + offset, src, cur);
5690
5691                src += cur;
5692                len -= cur;
5693                offset = 0;
5694                i++;
5695        }
5696}
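
/*
 * Example (editor's sketch, not part of the upstream file): the usual pattern
 * is to copy new bytes in with write_extent_buffer() and then mark the buffer
 * dirty so the change is written back at commit time.  In practice btrfs sets
 * header fields through generated accessors such as btrfs_set_header_owner();
 * 'example_set_header_owner' is only an illustration.
 */
static void __maybe_unused example_set_header_owner(struct extent_buffer *eb,
                                                    u64 owner)
{
        __le64 raw = cpu_to_le64(owner);

        write_extent_buffer(eb, &raw, offsetof(struct btrfs_header, owner),
                            sizeof(raw));
        btrfs_mark_buffer_dirty(eb);
}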
5697
5698void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
5699                unsigned long len)
5700{
5701        size_t cur;
5702        size_t offset;
5703        struct page *page;
5704        char *kaddr;
5705        size_t start_offset = offset_in_page(eb->start);
5706        unsigned long i = (start_offset + start) >> PAGE_SHIFT;
5707
5708        WARN_ON(start > eb->len);
5709        WARN_ON(start + len > eb->len);
5710
5711        offset = offset_in_page(start_offset + start);
5712
5713        while (len > 0) {
5714                page = eb->pages[i];
5715                WARN_ON(!PageUptodate(page));
5716
5717                cur = min(len, PAGE_SIZE - offset);
5718                kaddr = page_address(page);
5719                memset(kaddr + offset, 0, cur);
5720
5721                len -= cur;
5722                offset = 0;
5723                i++;
5724        }
5725}
5726
5727void copy_extent_buffer_full(struct extent_buffer *dst,
5728                             struct extent_buffer *src)
5729{
5730        int i;
5731        int num_pages;
5732
5733        ASSERT(dst->len == src->len);
5734
5735        num_pages = num_extent_pages(dst);
5736        for (i = 0; i < num_pages; i++)
5737                copy_page(page_address(dst->pages[i]),
5738                                page_address(src->pages[i]));
5739}
5740
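/*
 * Copy @len bytes from @src at @src_offset into @dst at @dst_offset.  The two
 * ranges may be split across pages differently in each buffer, so the copy
 * proceeds one destination page at a time and uses read_extent_buffer() to
 * pull in the matching source bytes.
 */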
5741void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
5742                        unsigned long dst_offset, unsigned long src_offset,
5743                        unsigned long len)
5744{
5745        u64 dst_len = dst->len;
5746        size_t cur;
5747        size_t offset;
5748        struct page *page;
5749        char *kaddr;
5750        size_t start_offset = offset_in_page(dst->start);
5751        unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT;
5752
5753        WARN_ON(src->len != dst_len);
5754
5755        offset = offset_in_page(start_offset + dst_offset);
5756
5757        while (len > 0) {
5758                page = dst->pages[i];
5759                WARN_ON(!PageUptodate(page));
5760
5761                cur = min(len, (unsigned long)(PAGE_SIZE - offset));
5762
5763                kaddr = page_address(page);
5764                read_extent_buffer(src, kaddr + offset, src_offset, cur);
5765
5766                src_offset += cur;
5767                len -= cur;
5768                offset = 0;
5769                i++;
5770        }
5771}
5772
5773/**
5774 * eb_bitmap_offset() - calculate the page and offset of the byte containing
5775 *                      the given bit number
5776 * @eb: the extent buffer
5777 * @start: offset of the bitmap item in the extent buffer
5778 * @nr: bit number
5779 * @page_index: return index of the page in the extent buffer that contains
5780 *              the given bit number
5781 * @page_offset: return offset into the page given by page_index
5782 *
5783 * This helper hides the ugliness of finding the byte in an extent buffer
5784 * which contains a given bit.
5785 */
5786static inline void eb_bitmap_offset(struct extent_buffer *eb,
5787                                    unsigned long start, unsigned long nr,
5788                                    unsigned long *page_index,
5789                                    size_t *page_offset)
5790{
5791        size_t start_offset = offset_in_page(eb->start);
5792        size_t byte_offset = BIT_BYTE(nr);
5793        size_t offset;
5794
5795        /*
5796         * The byte we want is the offset of the extent buffer + the offset of
5797         * the bitmap item in the extent buffer + the offset of the byte in the
5798         * bitmap item.
5799         */
5800        offset = start_offset + start + byte_offset;
5801
5802        *page_index = offset >> PAGE_SHIFT;
5803        *page_offset = offset_in_page(offset);
5804}
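
/*
 * Worked example (editor's illustration, assuming 4 KiB pages): with
 * start_offset == 1024 and a bitmap item at start == 3000, bit nr == 800
 * lives in byte BIT_BYTE(800) == 100 of the item, so offset == 1024 + 3000 +
 * 100 == 4124, which yields *page_index == 1 and *page_offset == 28.
 */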
5805
5806/**
5807 * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
5808 * @eb: the extent buffer
5809 * @start: offset of the bitmap item in the extent buffer
5810 * @nr: bit number to test
5811 */
5812int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
5813                           unsigned long nr)
5814{
5815        u8 *kaddr;
5816        struct page *page;
5817        unsigned long i;
5818        size_t offset;
5819
5820        eb_bitmap_offset(eb, start, nr, &i, &offset);
5821        page = eb->pages[i];
5822        WARN_ON(!PageUptodate(page));
5823        kaddr = page_address(page);
5824        return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
5825}
5826
5827/**
5828 * extent_buffer_bitmap_set - set an area of a bitmap
5829 * @eb: the extent buffer
5830 * @start: offset of the bitmap item in the extent buffer
5831 * @pos: bit number of the first bit
5832 * @len: number of bits to set
5833 */
5834void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
5835                              unsigned long pos, unsigned long len)
5836{
5837        u8 *kaddr;
5838        struct page *page;
5839        unsigned long i;
5840        size_t offset;
5841        const unsigned int size = pos + len;
5842        int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
5843        u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
5844
5845        eb_bitmap_offset(eb, start, pos, &i, &offset);
5846        page = eb->pages[i];
5847        WARN_ON(!PageUptodate(page));
5848        kaddr = page_address(page);
5849
5850        while (len >= bits_to_set) {
5851                kaddr[offset] |= mask_to_set;
5852                len -= bits_to_set;
5853                bits_to_set = BITS_PER_BYTE;
5854                mask_to_set = ~0;
5855                if (++offset >= PAGE_SIZE && len > 0) {
5856                        offset = 0;
5857                        page = eb->pages[++i];
5858                        WARN_ON(!PageUptodate(page));
5859                        kaddr = page_address(page);
5860                }
5861        }
5862        if (len) {
5863                mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
5864                kaddr[offset] |= mask_to_set;
5865        }
5866}
5867
5868
5869/**
5870 * extent_buffer_bitmap_clear - clear an area of a bitmap
5871 * @eb: the extent buffer
5872 * @start: offset of the bitmap item in the extent buffer
5873 * @pos: bit number of the first bit
5874 * @len: number of bits to clear
5875 */
5876void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
5877                                unsigned long pos, unsigned long len)
5878{
5879        u8 *kaddr;
5880        struct page *page;
5881        unsigned long i;
5882        size_t offset;
5883        const unsigned int size = pos + len;
5884        int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
5885        u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
5886
5887        eb_bitmap_offset(eb, start, pos, &i, &offset);
5888        page = eb->pages[i];
5889        WARN_ON(!PageUptodate(page));
5890        kaddr = page_address(page);
5891
5892        while (len >= bits_to_clear) {
5893                kaddr[offset] &= ~mask_to_clear;
5894                len -= bits_to_clear;
5895                bits_to_clear = BITS_PER_BYTE;
5896                mask_to_clear = ~0;
5897                if (++offset >= PAGE_SIZE && len > 0) {
5898                        offset = 0;
5899                        page = eb->pages[++i];
5900                        WARN_ON(!PageUptodate(page));
5901                        kaddr = page_address(page);
5902                }
5903        }
5904        if (len) {
5905                mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
5906                kaddr[offset] &= ~mask_to_clear;
5907        }
5908}
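
/*
 * Example (editor's sketch, not part of the upstream file): the three bitmap
 * helpers above address bits relative to the byte offset of a bitmap item
 * stored in the extent buffer, which is how callers such as the free space
 * tree use them.  'example_bitmap_usage' and its arguments are hypothetical,
 * and a real caller would also mark the buffer dirty after modifying it.
 */
static void __maybe_unused example_bitmap_usage(struct extent_buffer *leaf,
                                                unsigned long bitmap_offset,
                                                unsigned long first_bit,
                                                unsigned long nbits)
{
        extent_buffer_bitmap_set(leaf, bitmap_offset, first_bit, nbits);
        if (extent_buffer_test_bit(leaf, bitmap_offset, first_bit))
                extent_buffer_bitmap_clear(leaf, bitmap_offset, first_bit,
                                           nbits);
}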
5909
5910static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
5911{
5912        unsigned long distance = (src > dst) ? src - dst : dst - src;
5913        return distance < len;
5914}
5915
5916static void copy_pages(struct page *dst_page, struct page *src_page,
5917                       unsigned long dst_off, unsigned long src_off,
5918                       unsigned long len)
5919{
5920        char *dst_kaddr = page_address(dst_page);
5921        char *src_kaddr;
5922        int must_memmove = 0;
5923
5924        if (dst_page != src_page) {
5925                src_kaddr = page_address(src_page);
5926        } else {
5927                src_kaddr = dst_kaddr;
5928                if (areas_overlap(src_off, dst_off, len))
5929                        must_memmove = 1;
5930        }
5931
5932        if (must_memmove)
5933                memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
5934        else
5935                memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
5936}
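
/*
 * Editor's note: the overlap check only matters when source and destination
 * fall on the same page.  For example, src_off == 100, dst_off == 150 and
 * len == 80 give a distance of 50 < 80, so the ranges overlap and copy_pages()
 * must use memmove(); with len == 40 the plain memcpy() path is safe.
 */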
5937
5938void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5939                           unsigned long src_offset, unsigned long len)
5940{
5941        struct btrfs_fs_info *fs_info = dst->fs_info;
5942        size_t cur;
5943        size_t dst_off_in_page;
5944        size_t src_off_in_page;
5945        size_t start_offset = offset_in_page(dst->start);
5946        unsigned long dst_i;
5947        unsigned long src_i;
5948
5949        if (src_offset + len > dst->len) {
5950                btrfs_err(fs_info,
5951                        "memcpy bogus src_offset %lu move len %lu dst len %lu",
5952                         src_offset, len, dst->len);
5953                BUG();
5954        }
5955        if (dst_offset + len > dst->len) {
5956                btrfs_err(fs_info,
5957                        "memcpy bogus dst_offset %lu move len %lu dst len %lu",
5958                         dst_offset, len, dst->len);
5959                BUG();
5960        }
5961
5962        while (len > 0) {
5963                dst_off_in_page = offset_in_page(start_offset + dst_offset);
5964                src_off_in_page = offset_in_page(start_offset + src_offset);
5965
5966                dst_i = (start_offset + dst_offset) >> PAGE_SHIFT;
5967                src_i = (start_offset + src_offset) >> PAGE_SHIFT;
5968
5969                cur = min(len, (unsigned long)(PAGE_SIZE -
5970                                               src_off_in_page));
5971                cur = min_t(unsigned long, cur,
5972                        (unsigned long)(PAGE_SIZE - dst_off_in_page));
5973
5974                copy_pages(dst->pages[dst_i], dst->pages[src_i],
5975                           dst_off_in_page, src_off_in_page, cur);
5976
5977                src_offset += cur;
5978                dst_offset += cur;
5979                len -= cur;
5980        }
5981}
5982
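/*
 * Move @len bytes within @dst from @src_offset to @dst_offset, allowing the
 * two ranges to overlap.  When the destination starts below the source this
 * degenerates to memcpy_extent_buffer(); otherwise the copy walks backwards
 * from the end of both ranges so that overlapping bytes are not clobbered
 * before they have been copied.
 */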
5983void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5984                           unsigned long src_offset, unsigned long len)
5985{
5986        struct btrfs_fs_info *fs_info = dst->fs_info;
5987        size_t cur;
5988        size_t dst_off_in_page;
5989        size_t src_off_in_page;
5990        unsigned long dst_end = dst_offset + len - 1;
5991        unsigned long src_end = src_offset + len - 1;
5992        size_t start_offset = offset_in_page(dst->start);
5993        unsigned long dst_i;
5994        unsigned long src_i;
5995
5996        if (src_offset + len > dst->len) {
5997                btrfs_err(fs_info,
5998                          "memmove bogus src_offset %lu move len %lu dst len %lu",
5999                          src_offset, len, dst->len);
6000                BUG();
6001        }
6002        if (dst_offset + len > dst->len) {
6003                btrfs_err(fs_info,
6004                          "memmove bogus dst_offset %lu move len %lu dst len %lu",
6005                          dst_offset, len, dst->len);
6006                BUG();
6007        }
6008        if (dst_offset < src_offset) {
6009                memcpy_extent_buffer(dst, dst_offset, src_offset, len);
6010                return;
6011        }
6012        while (len > 0) {
6013                dst_i = (start_offset + dst_end) >> PAGE_SHIFT;
6014                src_i = (start_offset + src_end) >> PAGE_SHIFT;
6015
6016                dst_off_in_page = offset_in_page(start_offset + dst_end);
6017                src_off_in_page = offset_in_page(start_offset + src_end);
6018
6019                cur = min_t(unsigned long, len, src_off_in_page + 1);
6020                cur = min(cur, dst_off_in_page + 1);
6021                copy_pages(dst->pages[dst_i], dst->pages[src_i],
6022                           dst_off_in_page - cur + 1,
6023                           src_off_in_page - cur + 1, cur);
6024
6025                dst_end -= cur;
6026                src_end -= cur;
6027                len -= cur;
6028        }
6029}
6030
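/*
 * Called when the page cache wants to release @page (the btrfs releasepage
 * path for btree pages).  Returns 1 if the page can be released, i.e. no
 * extent buffer is attached or the attached one only held its tree reference
 * and has now been dropped; returns 0 if the buffer is still referenced or
 * under IO.
 */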
6031int try_release_extent_buffer(struct page *page)
6032{
6033        struct extent_buffer *eb;
6034
6035        /*
6036         * We need to make sure nobody is attaching this page to an eb right
6037         * now.
6038         */
6039        spin_lock(&page->mapping->private_lock);
6040        if (!PagePrivate(page)) {
6041                spin_unlock(&page->mapping->private_lock);
6042                return 1;
6043        }
6044
6045        eb = (struct extent_buffer *)page->private;
6046        BUG_ON(!eb);
6047
6048        /*
6049         * This is a bit awkward but should be OK: we need to make sure that
6050         * the eb doesn't disappear out from under us while we're looking at
6051         * this page.
6052         */
6053        spin_lock(&eb->refs_lock);
6054        if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
6055                spin_unlock(&eb->refs_lock);
6056                spin_unlock(&page->mapping->private_lock);
6057                return 0;
6058        }
6059        spin_unlock(&page->mapping->private_lock);
6060
6061        /*
6062         * If tree ref isn't set then we know the ref on this eb is a real ref,
6063         * so just return, this page will likely be freed soon anyway.
6064         */
6065        if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
6066                spin_unlock(&eb->refs_lock);
6067                return 0;
6068        }
6069
6070        return release_extent_buffer(eb);
6071}
6072