/* linux/fs/btrfs/ctree.c */
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
   4 */
   5
   6#include <linux/sched.h>
   7#include <linux/slab.h>
   8#include <linux/rbtree.h>
   9#include <linux/mm.h>
  10#include "ctree.h"
  11#include "disk-io.h"
  12#include "transaction.h"
  13#include "print-tree.h"
  14#include "locking.h"
  15#include "volumes.h"
  16#include "qgroup.h"
  17
  18static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
  19                      *root, struct btrfs_path *path, int level);
  20static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
  21                      const struct btrfs_key *ins_key, struct btrfs_path *path,
  22                      int data_size, int extend);
  23static int push_node_left(struct btrfs_trans_handle *trans,
  24                          struct btrfs_fs_info *fs_info,
  25                          struct extent_buffer *dst,
  26                          struct extent_buffer *src, int empty);
  27static int balance_node_right(struct btrfs_trans_handle *trans,
  28                              struct btrfs_fs_info *fs_info,
  29                              struct extent_buffer *dst_buf,
  30                              struct extent_buffer *src_buf);
  31static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
  32                    int level, int slot);
  33
  34struct btrfs_path *btrfs_alloc_path(void)
  35{
  36        return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
  37}
  38
  39/*
  40 * set all locked nodes in the path to blocking locks.  This should
  41 * be done before scheduling
  42 */
  43noinline void btrfs_set_path_blocking(struct btrfs_path *p)
  44{
  45        int i;
  46        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
  47                if (!p->nodes[i] || !p->locks[i])
  48                        continue;
  49                /*
  50                 * If we currently have a spinning reader or writer lock this
  51                 * will bump the count of blocking holders and drop the
  52                 * spinlock.
  53                 */
  54                if (p->locks[i] == BTRFS_READ_LOCK) {
  55                        btrfs_set_lock_blocking_read(p->nodes[i]);
  56                        p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
  57                } else if (p->locks[i] == BTRFS_WRITE_LOCK) {
  58                        btrfs_set_lock_blocking_write(p->nodes[i]);
  59                        p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
  60                }
  61        }
  62}
  63
  64/* this also releases the path */
  65void btrfs_free_path(struct btrfs_path *p)
  66{
  67        if (!p)
  68                return;
  69        btrfs_release_path(p);
  70        kmem_cache_free(btrfs_path_cachep, p);
  71}
  72
  73/*
  74 * path release drops references on the extent buffers in the path
  75 * and it drops any locks held by this path
  76 *
  77 * It is safe to call this on paths that no locks or extent buffers held.
  78 */
  79noinline void btrfs_release_path(struct btrfs_path *p)
  80{
  81        int i;
  82
  83        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
  84                p->slots[i] = 0;
  85                if (!p->nodes[i])
  86                        continue;
  87                if (p->locks[i]) {
  88                        btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
  89                        p->locks[i] = 0;
  90                }
  91                free_extent_buffer(p->nodes[i]);
  92                p->nodes[i] = NULL;
  93        }
  94}
  95
/*
 * Safely get a reference on the root node of a tree.  A lock is not
 * taken, so a concurrent writer may put a different node at the root
 * of the tree.  See btrfs_lock_root_node for the looping required.
 *
 * The extent buffer returned by this has a reference taken, so it
 * won't disappear.  It may stop being the root of the tree at any
 * time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        while (1) {
                rcu_read_lock();
                eb = rcu_dereference(root->node);

                /*
                 * RCU really hurts here, we could free up the root node
                 * because it was COWed but we may not get the new root
                 * node yet so do the inc_not_zero dance and if it doesn't
                 * work then synchronize_rcu and try again.
                 */
                if (atomic_inc_not_zero(&eb->refs)) {
                        rcu_read_unlock();
                        break;
                }
                rcu_read_unlock();
                synchronize_rcu();
        }
        return eb;
}
 129
 130/* loop around taking references on and locking the root node of the
 131 * tree until you end up with a lock on the root.  A locked buffer
 132 * is returned, with a reference held.
 133 */
 134struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 135{
 136        struct extent_buffer *eb;
 137
 138        while (1) {
 139                eb = btrfs_root_node(root);
 140                btrfs_tree_lock(eb);
 141                if (eb == root->node)
 142                        break;
 143                btrfs_tree_unlock(eb);
 144                free_extent_buffer(eb);
 145        }
 146        return eb;
 147}
 148
 149/* loop around taking references on and locking the root node of the
 150 * tree until you end up with a lock on the root.  A locked buffer
 151 * is returned, with a reference held.
 152 */
 153struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
 154{
 155        struct extent_buffer *eb;
 156
 157        while (1) {
 158                eb = btrfs_root_node(root);
 159                btrfs_tree_read_lock(eb);
 160                if (eb == root->node)
 161                        break;
 162                btrfs_tree_read_unlock(eb);
 163                free_extent_buffer(eb);
 164        }
 165        return eb;
 166}
 167
 168/* cowonly root (everything not a reference counted cow subvolume), just get
 169 * put onto a simple dirty list.  transaction.c walks this to make sure they
 170 * get properly updated on disk.
 171 */
 172static void add_root_to_dirty_list(struct btrfs_root *root)
 173{
 174        struct btrfs_fs_info *fs_info = root->fs_info;
 175
 176        if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
 177            !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
 178                return;
 179
 180        spin_lock(&fs_info->trans_lock);
 181        if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
 182                /* Want the extent tree to be the last on the list */
 183                if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
 184                        list_move_tail(&root->dirty_list,
 185                                       &fs_info->dirty_cowonly_roots);
 186                else
 187                        list_move(&root->dirty_list,
 188                                  &fs_info->dirty_cowonly_roots);
 189        }
 190        spin_unlock(&fs_info->trans_lock);
 191}
 192
 193/*
 194 * used by snapshot creation to make a copy of a root for a tree with
 195 * a given objectid.  The buffer with the new root node is returned in
 196 * cow_ret, and this func returns zero on success or a negative error code.
 197 */
 198int btrfs_copy_root(struct btrfs_trans_handle *trans,
 199                      struct btrfs_root *root,
 200                      struct extent_buffer *buf,
 201                      struct extent_buffer **cow_ret, u64 new_root_objectid)
 202{
 203        struct btrfs_fs_info *fs_info = root->fs_info;
 204        struct extent_buffer *cow;
 205        int ret = 0;
 206        int level;
 207        struct btrfs_disk_key disk_key;
 208
 209        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 210                trans->transid != fs_info->running_transaction->transid);
 211        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 212                trans->transid != root->last_trans);
 213
 214        level = btrfs_header_level(buf);
 215        if (level == 0)
 216                btrfs_item_key(buf, &disk_key, 0);
 217        else
 218                btrfs_node_key(buf, &disk_key, 0);
 219
 220        cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
 221                        &disk_key, level, buf->start, 0);
 222        if (IS_ERR(cow))
 223                return PTR_ERR(cow);
 224
 225        copy_extent_buffer_full(cow, buf);
 226        btrfs_set_header_bytenr(cow, cow->start);
 227        btrfs_set_header_generation(cow, trans->transid);
 228        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
 229        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
 230                                     BTRFS_HEADER_FLAG_RELOC);
 231        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
 232                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
 233        else
 234                btrfs_set_header_owner(cow, new_root_objectid);
 235
 236        write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
 237
 238        WARN_ON(btrfs_header_generation(buf) > trans->transid);
 239        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
 240                ret = btrfs_inc_ref(trans, root, cow, 1);
 241        else
 242                ret = btrfs_inc_ref(trans, root, cow, 0);
 243
 244        if (ret)
 245                return ret;
 246
 247        btrfs_mark_buffer_dirty(cow);
 248        *cow_ret = cow;
 249        return 0;
 250}
 251
/* Types of modification recorded in the tree mod log. */
enum mod_log_op {
        MOD_LOG_KEY_REPLACE,
        MOD_LOG_KEY_ADD,
        MOD_LOG_KEY_REMOVE,
        /* remove logged while an extent buffer is being freed */
        MOD_LOG_KEY_REMOVE_WHILE_FREEING,
        /* remove logged for slots overwritten by a MOVE_KEYS operation */
        MOD_LOG_KEY_REMOVE_WHILE_MOVING,
        MOD_LOG_MOVE_KEYS,
        MOD_LOG_ROOT_REPLACE,
};
 261
/* Description of the old root for MOD_LOG_ROOT_REPLACE records. */
struct tree_mod_root {
        u64 logical;    /* logical address of the old root node */
        u8 level;       /* level of the old root node */
};
 266
/*
 * One recorded modification in the tree mod log, kept in an rb tree
 * ordered by (logical, seq).
 */
struct tree_mod_elem {
        struct rb_node node;
        u64 logical;            /* block (or new root) the record refers to */
        u64 seq;                /* tree mod log sequence of this record */
        enum mod_log_op op;

        /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
        int slot;

        /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
        u64 generation;

        /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
        struct btrfs_disk_key key;
        u64 blockptr;

        /* this is used for op == MOD_LOG_MOVE_KEYS */
        struct {
                int dst_slot;
                int nr_items;
        } move;

        /* this is used for op == MOD_LOG_ROOT_REPLACE */
        struct tree_mod_root old_root;
};
 292
 293/*
 294 * Pull a new tree mod seq number for our operation.
 295 */
 296static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
 297{
 298        return atomic64_inc_return(&fs_info->tree_mod_seq);
 299}
 300
 301/*
 302 * This adds a new blocker to the tree mod log's blocker list if the @elem
 303 * passed does not already have a sequence number set. So when a caller expects
 304 * to record tree modifications, it should ensure to set elem->seq to zero
 305 * before calling btrfs_get_tree_mod_seq.
 306 * Returns a fresh, unused tree log modification sequence number, even if no new
 307 * blocker was added.
 308 */
 309u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
 310                           struct seq_list *elem)
 311{
 312        write_lock(&fs_info->tree_mod_log_lock);
 313        spin_lock(&fs_info->tree_mod_seq_lock);
 314        if (!elem->seq) {
 315                elem->seq = btrfs_inc_tree_mod_seq(fs_info);
 316                list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
 317        }
 318        spin_unlock(&fs_info->tree_mod_seq_lock);
 319        write_unlock(&fs_info->tree_mod_log_lock);
 320
 321        return elem->seq;
 322}
 323
 324void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
 325                            struct seq_list *elem)
 326{
 327        struct rb_root *tm_root;
 328        struct rb_node *node;
 329        struct rb_node *next;
 330        struct seq_list *cur_elem;
 331        struct tree_mod_elem *tm;
 332        u64 min_seq = (u64)-1;
 333        u64 seq_putting = elem->seq;
 334
 335        if (!seq_putting)
 336                return;
 337
 338        spin_lock(&fs_info->tree_mod_seq_lock);
 339        list_del(&elem->list);
 340        elem->seq = 0;
 341
 342        list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
 343                if (cur_elem->seq < min_seq) {
 344                        if (seq_putting > cur_elem->seq) {
 345                                /*
 346                                 * blocker with lower sequence number exists, we
 347                                 * cannot remove anything from the log
 348                                 */
 349                                spin_unlock(&fs_info->tree_mod_seq_lock);
 350                                return;
 351                        }
 352                        min_seq = cur_elem->seq;
 353                }
 354        }
 355        spin_unlock(&fs_info->tree_mod_seq_lock);
 356
 357        /*
 358         * anything that's lower than the lowest existing (read: blocked)
 359         * sequence number can be removed from the tree.
 360         */
 361        write_lock(&fs_info->tree_mod_log_lock);
 362        tm_root = &fs_info->tree_mod_log;
 363        for (node = rb_first(tm_root); node; node = next) {
 364                next = rb_next(node);
 365                tm = rb_entry(node, struct tree_mod_elem, node);
 366                if (tm->seq > min_seq)
 367                        continue;
 368                rb_erase(node, tm_root);
 369                kfree(tm);
 370        }
 371        write_unlock(&fs_info->tree_mod_log_lock);
 372}
 373
 374/*
 375 * key order of the log:
 376 *       node/leaf start address -> sequence
 377 *
 378 * The 'start address' is the logical address of the *new* root node
 379 * for root replace operations, or the logical address of the affected
 380 * block for all other operations.
 381 *
 382 * Note: must be called with write lock for fs_info::tree_mod_log_lock.
 383 */
 384static noinline int
 385__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
 386{
 387        struct rb_root *tm_root;
 388        struct rb_node **new;
 389        struct rb_node *parent = NULL;
 390        struct tree_mod_elem *cur;
 391
 392        tm->seq = btrfs_inc_tree_mod_seq(fs_info);
 393
 394        tm_root = &fs_info->tree_mod_log;
 395        new = &tm_root->rb_node;
 396        while (*new) {
 397                cur = rb_entry(*new, struct tree_mod_elem, node);
 398                parent = *new;
 399                if (cur->logical < tm->logical)
 400                        new = &((*new)->rb_left);
 401                else if (cur->logical > tm->logical)
 402                        new = &((*new)->rb_right);
 403                else if (cur->seq < tm->seq)
 404                        new = &((*new)->rb_left);
 405                else if (cur->seq > tm->seq)
 406                        new = &((*new)->rb_right);
 407                else
 408                        return -EEXIST;
 409        }
 410
 411        rb_link_node(&tm->node, parent, new);
 412        rb_insert_color(&tm->node, tm_root);
 413        return 0;
 414}
 415
 416/*
 417 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 418 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 419 * this until all tree mod log insertions are recorded in the rb tree and then
 420 * write unlock fs_info::tree_mod_log_lock.
 421 */
 422static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
 423                                    struct extent_buffer *eb) {
 424        smp_mb();
 425        if (list_empty(&(fs_info)->tree_mod_seq_list))
 426                return 1;
 427        if (eb && btrfs_header_level(eb) == 0)
 428                return 1;
 429
 430        write_lock(&fs_info->tree_mod_log_lock);
 431        if (list_empty(&(fs_info)->tree_mod_seq_list)) {
 432                write_unlock(&fs_info->tree_mod_log_lock);
 433                return 1;
 434        }
 435
 436        return 0;
 437}
 438
 439/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
 440static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
 441                                    struct extent_buffer *eb)
 442{
 443        smp_mb();
 444        if (list_empty(&(fs_info)->tree_mod_seq_list))
 445                return 0;
 446        if (eb && btrfs_header_level(eb) == 0)
 447                return 0;
 448
 449        return 1;
 450}
 451
 452static struct tree_mod_elem *
 453alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
 454                    enum mod_log_op op, gfp_t flags)
 455{
 456        struct tree_mod_elem *tm;
 457
 458        tm = kzalloc(sizeof(*tm), flags);
 459        if (!tm)
 460                return NULL;
 461
 462        tm->logical = eb->start;
 463        if (op != MOD_LOG_KEY_ADD) {
 464                btrfs_node_key(eb, &tm->key, slot);
 465                tm->blockptr = btrfs_node_blockptr(eb, slot);
 466        }
 467        tm->op = op;
 468        tm->slot = slot;
 469        tm->generation = btrfs_node_ptr_generation(eb, slot);
 470        RB_CLEAR_NODE(&tm->node);
 471
 472        return tm;
 473}
 474
 475static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
 476                enum mod_log_op op, gfp_t flags)
 477{
 478        struct tree_mod_elem *tm;
 479        int ret;
 480
 481        if (!tree_mod_need_log(eb->fs_info, eb))
 482                return 0;
 483
 484        tm = alloc_tree_mod_elem(eb, slot, op, flags);
 485        if (!tm)
 486                return -ENOMEM;
 487
 488        if (tree_mod_dont_log(eb->fs_info, eb)) {
 489                kfree(tm);
 490                return 0;
 491        }
 492
 493        ret = __tree_mod_log_insert(eb->fs_info, tm);
 494        write_unlock(&eb->fs_info->tree_mod_log_lock);
 495        if (ret)
 496                kfree(tm);
 497
 498        return ret;
 499}
 500
/*
 * Record the move of @nr_items node pointers from @src_slot to
 * @dst_slot within @eb in the tree mod log.  When the move overwrites
 * slots (dst_slot < src_slot), an individual remove operation is also
 * logged for each overwritten slot.  Returns 0 on success or when
 * logging is not needed, negative errno on failure.
 */
static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
                int dst_slot, int src_slot, int nr_items)
{
        struct tree_mod_elem *tm = NULL;
        struct tree_mod_elem **tm_list = NULL;
        int ret = 0;
        int i;
        int locked = 0; /* nonzero once tree_mod_log_lock is write-held */

        if (!tree_mod_need_log(eb->fs_info, eb))
                return 0;

        /* Allocate everything before taking the log lock. */
        tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
        if (!tm_list)
                return -ENOMEM;

        tm = kzalloc(sizeof(*tm), GFP_NOFS);
        if (!tm) {
                ret = -ENOMEM;
                goto free_tms;
        }

        tm->logical = eb->start;
        tm->slot = src_slot;
        tm->move.dst_slot = dst_slot;
        tm->move.nr_items = nr_items;
        tm->op = MOD_LOG_MOVE_KEYS;

        /* Only the slots actually overwritten by the move need an element. */
        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
                tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
                    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
                if (!tm_list[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
        }

        if (tree_mod_dont_log(eb->fs_info, eb))
                goto free_tms;
        locked = 1;

        /*
         * When we override something during the move, we log these removals.
         * This can only happen when we move towards the beginning of the
         * buffer, i.e. dst_slot < src_slot.
         */
        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
                ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
                if (ret)
                        goto free_tms;
        }

        ret = __tree_mod_log_insert(eb->fs_info, tm);
        if (ret)
                goto free_tms;
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        kfree(tm_list);

        return 0;
free_tms:
        /* Undo any rb-tree insertions done so far and free everything. */
        for (i = 0; i < nr_items; i++) {
                if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
                        rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
                kfree(tm_list[i]);
        }
        if (locked)
                write_unlock(&eb->fs_info->tree_mod_log_lock);
        kfree(tm_list);
        kfree(tm);

        return ret;
}
 573
 574static inline int
 575__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
 576                       struct tree_mod_elem **tm_list,
 577                       int nritems)
 578{
 579        int i, j;
 580        int ret;
 581
 582        for (i = nritems - 1; i >= 0; i--) {
 583                ret = __tree_mod_log_insert(fs_info, tm_list[i]);
 584                if (ret) {
 585                        for (j = nritems - 1; j > i; j--)
 586                                rb_erase(&tm_list[j]->node,
 587                                         &fs_info->tree_mod_log);
 588                        return ret;
 589                }
 590        }
 591
 592        return 0;
 593}
 594
/*
 * Record the replacement of tree root @old_root by @new_root in the
 * tree mod log.  When @log_removal is set and the old root is not a
 * leaf, a REMOVE_WHILE_FREEING operation is logged for every pointer
 * in the old root first.  Returns 0 on success or when logging is not
 * needed, negative errno on failure.
 */
static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
                         struct extent_buffer *new_root, int log_removal)
{
        struct btrfs_fs_info *fs_info = old_root->fs_info;
        struct tree_mod_elem *tm = NULL;
        struct tree_mod_elem **tm_list = NULL;
        int nritems = 0;
        int ret = 0;
        int i;

        if (!tree_mod_need_log(fs_info, NULL))
                return 0;

        /* Allocate all elements before taking the log lock. */
        if (log_removal && btrfs_header_level(old_root) > 0) {
                nritems = btrfs_header_nritems(old_root);
                tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
                                  GFP_NOFS);
                if (!tm_list) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
                for (i = 0; i < nritems; i++) {
                        tm_list[i] = alloc_tree_mod_elem(old_root, i,
                            MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
                        if (!tm_list[i]) {
                                ret = -ENOMEM;
                                goto free_tms;
                        }
                }
        }

        tm = kzalloc(sizeof(*tm), GFP_NOFS);
        if (!tm) {
                ret = -ENOMEM;
                goto free_tms;
        }

        /* Root-replace records are keyed by the *new* root's address. */
        tm->logical = new_root->start;
        tm->old_root.logical = old_root->start;
        tm->old_root.level = btrfs_header_level(old_root);
        tm->generation = btrfs_header_generation(old_root);
        tm->op = MOD_LOG_ROOT_REPLACE;

        if (tree_mod_dont_log(fs_info, NULL))
                goto free_tms;

        if (tm_list)
                ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
        if (!ret)
                ret = __tree_mod_log_insert(fs_info, tm);

        write_unlock(&fs_info->tree_mod_log_lock);
        if (ret)
                goto free_tms;
        kfree(tm_list);

        return ret;

free_tms:
        if (tm_list) {
                for (i = 0; i < nritems; i++)
                        kfree(tm_list[i]);
                kfree(tm_list);
        }
        kfree(tm);

        return ret;
}
 663
 664static struct tree_mod_elem *
 665__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
 666                      int smallest)
 667{
 668        struct rb_root *tm_root;
 669        struct rb_node *node;
 670        struct tree_mod_elem *cur = NULL;
 671        struct tree_mod_elem *found = NULL;
 672
 673        read_lock(&fs_info->tree_mod_log_lock);
 674        tm_root = &fs_info->tree_mod_log;
 675        node = tm_root->rb_node;
 676        while (node) {
 677                cur = rb_entry(node, struct tree_mod_elem, node);
 678                if (cur->logical < start) {
 679                        node = node->rb_left;
 680                } else if (cur->logical > start) {
 681                        node = node->rb_right;
 682                } else if (cur->seq < min_seq) {
 683                        node = node->rb_left;
 684                } else if (!smallest) {
 685                        /* we want the node with the highest seq */
 686                        if (found)
 687                                BUG_ON(found->seq > cur->seq);
 688                        found = cur;
 689                        node = node->rb_left;
 690                } else if (cur->seq > min_seq) {
 691                        /* we want the node with the smallest seq */
 692                        if (found)
 693                                BUG_ON(found->seq < cur->seq);
 694                        found = cur;
 695                        node = node->rb_right;
 696                } else {
 697                        found = cur;
 698                        break;
 699                }
 700        }
 701        read_unlock(&fs_info->tree_mod_log_lock);
 702
 703        return found;
 704}
 705
/*
 * This returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item).  Any element with a
 * time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
                           u64 min_seq)
{
        return __tree_mod_log_search(fs_info, start, min_seq, 1);
}
 717
/*
 * This returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item).  Any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
        return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
 728
/*
 * Record the copy of @nr_items node pointers from @src (starting at
 * @src_offset) into @dst (starting at @dst_offset): a KEY_REMOVE is
 * logged for each source slot and a KEY_ADD for each destination
 * slot.  Returns 0 on success or when logging is not needed, negative
 * errno on failure.
 */
static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
                     struct extent_buffer *src, unsigned long dst_offset,
                     unsigned long src_offset, int nr_items)
{
        int ret = 0;
        struct tree_mod_elem **tm_list = NULL;
        struct tree_mod_elem **tm_list_add, **tm_list_rem;
        int i;
        int locked = 0; /* nonzero once tree_mod_log_lock is write-held */

        if (!tree_mod_need_log(fs_info, NULL))
                return 0;

        /* Copies between two leaves are not logged. */
        if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
                return 0;

        /* One add plus one remove element per copied pointer. */
        tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
                          GFP_NOFS);
        if (!tm_list)
                return -ENOMEM;

        tm_list_add = tm_list;
        tm_list_rem = tm_list + nr_items;
        for (i = 0; i < nr_items; i++) {
                tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
                    MOD_LOG_KEY_REMOVE, GFP_NOFS);
                if (!tm_list_rem[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }

                tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
                    MOD_LOG_KEY_ADD, GFP_NOFS);
                if (!tm_list_add[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
        }

        if (tree_mod_dont_log(fs_info, NULL))
                goto free_tms;
        locked = 1;

        for (i = 0; i < nr_items; i++) {
                ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
                if (ret)
                        goto free_tms;
                ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
                if (ret)
                        goto free_tms;
        }

        write_unlock(&fs_info->tree_mod_log_lock);
        kfree(tm_list);

        return 0;

free_tms:
        /* Undo any rb-tree insertions done so far and free everything. */
        for (i = 0; i < nr_items * 2; i++) {
                if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
                        rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
                kfree(tm_list[i]);
        }
        if (locked)
                write_unlock(&fs_info->tree_mod_log_lock);
        kfree(tm_list);

        return ret;
}
 799
/*
 * Record the freeing of node @eb in the tree mod log by logging a
 * KEY_REMOVE_WHILE_FREEING for every pointer it holds.  Leaves are
 * never logged.  Returns 0 on success or when logging is not needed,
 * negative errno on failure.
 */
static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
        struct tree_mod_elem **tm_list = NULL;
        int nritems = 0;
        int i;
        int ret = 0;

        if (btrfs_header_level(eb) == 0)
                return 0;

        if (!tree_mod_need_log(eb->fs_info, NULL))
                return 0;

        /* Allocate one element per pointer before taking the log lock. */
        nritems = btrfs_header_nritems(eb);
        tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
        if (!tm_list)
                return -ENOMEM;

        for (i = 0; i < nritems; i++) {
                tm_list[i] = alloc_tree_mod_elem(eb, i,
                    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
                if (!tm_list[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
        }

        if (tree_mod_dont_log(eb->fs_info, eb))
                goto free_tms;

        /* On failure this already erased the elements it had inserted. */
        ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        if (ret)
                goto free_tms;
        kfree(tm_list);

        return 0;

free_tms:
        for (i = 0; i < nritems; i++)
                kfree(tm_list[i]);
        kfree(tm_list);

        return ret;
}
 845
 846/*
 847 * check if the tree block can be shared by multiple trees
 848 */
 849int btrfs_block_can_be_shared(struct btrfs_root *root,
 850                              struct extent_buffer *buf)
 851{
 852        /*
 853         * Tree blocks not in reference counted trees and tree roots
 854         * are never shared. If a block was allocated after the last
 855         * snapshot and the block was not allocated by tree relocation,
 856         * we know the block is not shared.
 857         */
 858        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 859            buf != root->node && buf != root->commit_root &&
 860            (btrfs_header_generation(buf) <=
 861             btrfs_root_last_snapshot(&root->root_item) ||
 862             btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
 863                return 1;
 864
 865        return 0;
 866}
 867
/*
 * Adjust the extent back references when @buf is COWed into @cow.
 *
 * Sets *last_ref to 1 when the COW drops the last reference to @buf, so
 * the caller knows the old block can be freed.
 *
 * Returns 0 on success or a negative errno on failure (-EROFS when the
 * extent item unexpectedly reports zero references, which indicates
 * corruption).
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root,
                                       struct extent_buffer *buf,
                                       struct extent_buffer *cow,
                                       int *last_ref)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 refs;
        u64 owner;
        u64 flags;
        u64 new_flags = 0;
        int ret;

        /*
         * Backrefs update rules:
         *
         * Always use full backrefs for extent pointers in tree block
         * allocated by tree relocation.
         *
         * If a shared tree block is no longer referenced by its owner
         * tree (btrfs_header_owner(buf) == root->root_key.objectid),
         * use full backrefs for extent pointers in tree block.
         *
         * If a tree block is been relocating
         * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
         * use full backrefs for extent pointers in tree block.
         * The reason for this is some operations (such as drop tree)
         * are only allowed for blocks use full backrefs.
         */

        if (btrfs_block_can_be_shared(root, buf)) {
                /* Possibly shared: read the real refcount and flags. */
                ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
                                               btrfs_header_level(buf), 1,
                                               &refs, &flags);
                if (ret)
                        return ret;
                if (refs == 0) {
                        /* A live block must have refs > 0: corruption. */
                        ret = -EROFS;
                        btrfs_handle_fs_error(fs_info, ret, NULL);
                        return ret;
                }
        } else {
                /* Not shareable: exactly one reference, flags implied. */
                refs = 1;
                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                        flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
                else
                        flags = 0;
        }

        owner = btrfs_header_owner(buf);
        /* Relocation tree blocks must always carry full backrefs. */
        BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
               !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

        if (refs > 1) {
                if ((owner == root->root_key.objectid ||
                     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
                    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
                        /* Convert the shared block to use full backrefs. */
                        ret = btrfs_inc_ref(trans, root, buf, 1);
                        if (ret)
                                return ret;

                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID) {
                                ret = btrfs_dec_ref(trans, root, buf, 0);
                                if (ret)
                                        return ret;
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                                if (ret)
                                        return ret;
                        }
                        new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
                } else {

                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                        else
                                ret = btrfs_inc_ref(trans, root, cow, 0);
                        if (ret)
                                return ret;
                }
                if (new_flags != 0) {
                        int level = btrfs_header_level(buf);

                        /* Persist the FULL_BACKREF flag in the extent item. */
                        ret = btrfs_set_disk_extent_flags(trans, fs_info,
                                                          buf->start,
                                                          buf->len,
                                                          new_flags, level, 0);
                        if (ret)
                                return ret;
                }
        } else {
                /*
                 * Last reference: transfer the references from buf to cow
                 * and tell the caller the old block can be freed.
                 */
                if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                        else
                                ret = btrfs_inc_ref(trans, root, cow, 0);
                        if (ret)
                                return ret;
                        ret = btrfs_dec_ref(trans, root, buf, 1);
                        if (ret)
                                return ret;
                }
                clean_tree_block(fs_info, buf);
                *last_ref = 1;
        }
        return 0;
}
 978
 979static struct extent_buffer *alloc_tree_block_no_bg_flush(
 980                                          struct btrfs_trans_handle *trans,
 981                                          struct btrfs_root *root,
 982                                          u64 parent_start,
 983                                          const struct btrfs_disk_key *disk_key,
 984                                          int level,
 985                                          u64 hint,
 986                                          u64 empty_size)
 987{
 988        struct btrfs_fs_info *fs_info = root->fs_info;
 989        struct extent_buffer *ret;
 990
 991        /*
 992         * If we are COWing a node/leaf from the extent, chunk, device or free
 993         * space trees, make sure that we do not finish block group creation of
 994         * pending block groups. We do this to avoid a deadlock.
 995         * COWing can result in allocation of a new chunk, and flushing pending
 996         * block groups (btrfs_create_pending_block_groups()) can be triggered
 997         * when finishing allocation of a new chunk. Creation of a pending block
 998         * group modifies the extent, chunk, device and free space trees,
 999         * therefore we could deadlock with ourselves since we are holding a
1000         * lock on an extent buffer that btrfs_create_pending_block_groups() may
1001         * try to COW later.
1002         * For similar reasons, we also need to delay flushing pending block
1003         * groups when splitting a leaf or node, from one of those trees, since
1004         * we are holding a write lock on it and its parent or when inserting a
1005         * new root node for one of those trees.
1006         */
1007        if (root == fs_info->extent_root ||
1008            root == fs_info->chunk_root ||
1009            root == fs_info->dev_root ||
1010            root == fs_info->free_space_root)
1011                trans->can_flush_pending_bgs = false;
1012
1013        ret = btrfs_alloc_tree_block(trans, root, parent_start,
1014                                     root->root_key.objectid, disk_key, level,
1015                                     hint, empty_size);
1016        trans->can_flush_pending_bgs = true;
1017
1018        return ret;
1019}
1020
1021/*
1022 * does the dirty work in cow of a single block.  The parent block (if
1023 * supplied) is updated to point to the new cow copy.  The new buffer is marked
1024 * dirty and returned locked.  If you modify the block it needs to be marked
1025 * dirty again.
1026 *
1027 * search_start -- an allocation hint for the new block
1028 *
1029 * empty_size -- a hint that you plan on doing more cow.  This is the size in
1030 * bytes the allocator should try to find free next to the block it returns.
1031 * This is just a hint and may be ignored by the allocator.
1032 */
1033static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1034                             struct btrfs_root *root,
1035                             struct extent_buffer *buf,
1036                             struct extent_buffer *parent, int parent_slot,
1037                             struct extent_buffer **cow_ret,
1038                             u64 search_start, u64 empty_size)
1039{
1040        struct btrfs_fs_info *fs_info = root->fs_info;
1041        struct btrfs_disk_key disk_key;
1042        struct extent_buffer *cow;
1043        int level, ret;
1044        int last_ref = 0;
1045        int unlock_orig = 0;
1046        u64 parent_start = 0;
1047
1048        if (*cow_ret == buf)
1049                unlock_orig = 1;
1050
1051        btrfs_assert_tree_locked(buf);
1052
1053        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
1054                trans->transid != fs_info->running_transaction->transid);
1055        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
1056                trans->transid != root->last_trans);
1057
1058        level = btrfs_header_level(buf);
1059
1060        if (level == 0)
1061                btrfs_item_key(buf, &disk_key, 0);
1062        else
1063                btrfs_node_key(buf, &disk_key, 0);
1064
1065        if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
1066                parent_start = parent->start;
1067
1068        cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
1069                                           level, search_start, empty_size);
1070        if (IS_ERR(cow))
1071                return PTR_ERR(cow);
1072
1073        /* cow is set to blocking by btrfs_init_new_buffer */
1074
1075        copy_extent_buffer_full(cow, buf);
1076        btrfs_set_header_bytenr(cow, cow->start);
1077        btrfs_set_header_generation(cow, trans->transid);
1078        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1079        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
1080                                     BTRFS_HEADER_FLAG_RELOC);
1081        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1082                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
1083        else
1084                btrfs_set_header_owner(cow, root->root_key.objectid);
1085
1086        write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
1087
1088        ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1089        if (ret) {
1090                btrfs_abort_transaction(trans, ret);
1091                return ret;
1092        }
1093
1094        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
1095                ret = btrfs_reloc_cow_block(trans, root, buf, cow);
1096                if (ret) {
1097                        btrfs_abort_transaction(trans, ret);
1098                        return ret;
1099                }
1100        }
1101
1102        if (buf == root->node) {
1103                WARN_ON(parent && parent != buf);
1104                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1105                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1106                        parent_start = buf->start;
1107
1108                extent_buffer_get(cow);
1109                ret = tree_mod_log_insert_root(root->node, cow, 1);
1110                BUG_ON(ret < 0);
1111                rcu_assign_pointer(root->node, cow);
1112
1113                btrfs_free_tree_block(trans, root, buf, parent_start,
1114                                      last_ref);
1115                free_extent_buffer(buf);
1116                add_root_to_dirty_list(root);
1117        } else {
1118                WARN_ON(trans->transid != btrfs_header_generation(parent));
1119                tree_mod_log_insert_key(parent, parent_slot,
1120                                        MOD_LOG_KEY_REPLACE, GFP_NOFS);
1121                btrfs_set_node_blockptr(parent, parent_slot,
1122                                        cow->start);
1123                btrfs_set_node_ptr_generation(parent, parent_slot,
1124                                              trans->transid);
1125                btrfs_mark_buffer_dirty(parent);
1126                if (last_ref) {
1127                        ret = tree_mod_log_free_eb(buf);
1128                        if (ret) {
1129                                btrfs_abort_transaction(trans, ret);
1130                                return ret;
1131                        }
1132                }
1133                btrfs_free_tree_block(trans, root, buf, parent_start,
1134                                      last_ref);
1135        }
1136        if (unlock_orig)
1137                btrfs_tree_unlock(buf);
1138        free_extent_buffer_stale(buf);
1139        btrfs_mark_buffer_dirty(cow);
1140        *cow_ret = cow;
1141        return 0;
1142}
1143
/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 *
 * Returns the tree mod log element to start rewinding from, or NULL when
 * time_seq is zero or no operation was ever logged for the root.
 */
static struct tree_mod_elem *__tree_mod_log_oldest_root(
                struct extent_buffer *eb_root, u64 time_seq)
{
        struct tree_mod_elem *tm;
        struct tree_mod_elem *found = NULL;
        u64 root_logical = eb_root->start;
        int looped = 0; /* set once we followed at least one root replacement */

        if (!time_seq)
                return NULL;

        /*
         * the very last operation that's logged for a root is the
         * replacement operation (if it is replaced at all). this has
         * the logical address of the *new* root, making it the very
         * first operation that's logged for this root.
         */
        while (1) {
                tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
                                                time_seq);
                if (!looped && !tm)
                        return NULL;
                /*
                 * if there are no tree operation for the oldest root, we simply
                 * return it. this should only happen if that (old) root is at
                 * level 0.
                 */
                if (!tm)
                        break;

                /*
                 * if there's an operation that's not a root replacement, we
                 * found the oldest version of our root. normally, we'll find a
                 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
                 */
                if (tm->op != MOD_LOG_ROOT_REPLACE)
                        break;

                /* Step one root replacement further back and search again. */
                found = tm;
                root_logical = tm->old_root.logical;
                looped = 1;
        }

        /* if there's no old root to return, return what we found instead */
        if (!found)
                found = tm;

        return found;
}
1197
/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 *
 * Walks the mod log entries for eb (under tree_mod_log_lock) and undoes
 * each logged operation in reverse, updating the item count as it goes.
 * The rewound item count is written back into eb's header at the end.
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
                      u64 time_seq, struct tree_mod_elem *first_tm)
{
        u32 n;                  /* running number of items while rewinding */
        struct rb_node *next;
        struct tree_mod_elem *tm = first_tm;
        unsigned long o_dst;
        unsigned long o_src;
        unsigned long p_size = sizeof(struct btrfs_key_ptr);

        n = btrfs_header_nritems(eb);
        /* Hold the lock to keep the mod log rbtree stable while iterating. */
        read_lock(&fs_info->tree_mod_log_lock);
        while (tm && tm->seq >= time_seq) {
                /*
                 * all the operations are recorded with the operator used for
                 * the modification. as we're going backwards, we do the
                 * opposite of each operation here.
                 */
                switch (tm->op) {
                case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
                        BUG_ON(tm->slot < n);
                        /* Fallthrough */
                case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
                case MOD_LOG_KEY_REMOVE:
                        /* Undo a removal: restore the key pointer and grow n. */
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        n++;
                        break;
                case MOD_LOG_KEY_REPLACE:
                        /* Undo a replace: put the old key pointer back. */
                        BUG_ON(tm->slot >= n);
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        break;
                case MOD_LOG_KEY_ADD:
                        /* if a move operation is needed it's in the log */
                        n--;
                        break;
                case MOD_LOG_MOVE_KEYS:
                        /* Undo a move by moving the items back. */
                        o_dst = btrfs_node_key_ptr_offset(tm->slot);
                        o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
                        memmove_extent_buffer(eb, o_dst, o_src,
                                              tm->move.nr_items * p_size);
                        break;
                case MOD_LOG_ROOT_REPLACE:
                        /*
                         * this operation is special. for roots, this must be
                         * handled explicitly before rewinding.
                         * for non-roots, this operation may exist if the node
                         * was a root: root A -> child B; then A gets empty and
                         * B is promoted to the new root. in the mod log, we'll
                         * have a root-replace operation for B, a tree block
                         * that is no root. we simply ignore that operation.
                         */
                        break;
                }
                next = rb_next(&tm->node);
                if (!next)
                        break;
                tm = rb_entry(next, struct tree_mod_elem, node);
                /* Stop once the log entries belong to a different block. */
                if (tm->logical != first_tm->logical)
                        break;
        }
        read_unlock(&fs_info->tree_mod_log_lock);
        btrfs_set_header_nritems(eb, n);
}
1273
1274/*
1275 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
1276 * is returned. If rewind operations happen, a fresh buffer is returned. The
1277 * returned buffer is always read-locked. If the returned buffer is not the
1278 * input buffer, the lock on the input buffer is released and the input buffer
1279 * is freed (its refcount is decremented).
1280 */
1281static struct extent_buffer *
1282tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1283                    struct extent_buffer *eb, u64 time_seq)
1284{
1285        struct extent_buffer *eb_rewin;
1286        struct tree_mod_elem *tm;
1287
1288        if (!time_seq)
1289                return eb;
1290
1291        if (btrfs_header_level(eb) == 0)
1292                return eb;
1293
1294        tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1295        if (!tm)
1296                return eb;
1297
1298        btrfs_set_path_blocking(path);
1299        btrfs_set_lock_blocking_read(eb);
1300
1301        if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1302                BUG_ON(tm->slot != 0);
1303                eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
1304                if (!eb_rewin) {
1305                        btrfs_tree_read_unlock_blocking(eb);
1306                        free_extent_buffer(eb);
1307                        return NULL;
1308                }
1309                btrfs_set_header_bytenr(eb_rewin, eb->start);
1310                btrfs_set_header_backref_rev(eb_rewin,
1311                                             btrfs_header_backref_rev(eb));
1312                btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1313                btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1314        } else {
1315                eb_rewin = btrfs_clone_extent_buffer(eb);
1316                if (!eb_rewin) {
1317                        btrfs_tree_read_unlock_blocking(eb);
1318                        free_extent_buffer(eb);
1319                        return NULL;
1320                }
1321        }
1322
1323        btrfs_tree_read_unlock_blocking(eb);
1324        free_extent_buffer(eb);
1325
1326        btrfs_tree_read_lock(eb_rewin);
1327        __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1328        WARN_ON(btrfs_header_nritems(eb_rewin) >
1329                BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1330
1331        return eb_rewin;
1332}
1333
/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->root_node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct tree_mod_elem *tm;
        struct extent_buffer *eb = NULL;
        struct extent_buffer *eb_root;
        struct extent_buffer *old;
        struct tree_mod_root *old_root = NULL;
        u64 old_generation = 0;
        u64 logical;
        int level;

        eb_root = btrfs_read_lock_root_node(root);
        tm = __tree_mod_log_oldest_root(eb_root, time_seq);
        if (!tm)
                /* No logged changes: the current root is already correct. */
                return eb_root;

        if (tm->op == MOD_LOG_ROOT_REPLACE) {
                /* The root was replaced: rewind from the old root block. */
                old_root = &tm->old_root;
                old_generation = tm->generation;
                logical = old_root->logical;
                level = old_root->level;
        } else {
                /* The root block itself was only modified in place. */
                logical = eb_root->start;
                level = btrfs_header_level(eb_root);
        }

        tm = tree_mod_log_search(fs_info, logical, time_seq);
        if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                /* Old root still exists on disk: read and clone it. */
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
                old = read_tree_block(fs_info, logical, 0, level, NULL);
                if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
                        if (!IS_ERR(old))
                                free_extent_buffer(old);
                        btrfs_warn(fs_info,
                                   "failed to read tree block %llu from get_old_root",
                                   logical);
                } else {
                        eb = btrfs_clone_extent_buffer(old);
                        free_extent_buffer(old);
                }
        } else if (old_root) {
                /* Old root was freed: rebuild it on a dummy buffer. */
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
                eb = alloc_dummy_extent_buffer(fs_info, logical);
        } else {
                /* Rewind on a private clone of the current root. */
                btrfs_set_lock_blocking_read(eb_root);
                eb = btrfs_clone_extent_buffer(eb_root);
                btrfs_tree_read_unlock_blocking(eb_root);
                free_extent_buffer(eb_root);
        }

        if (!eb)
                return NULL;
        btrfs_tree_read_lock(eb);
        if (old_root) {
                /* Restore the old root's identity on the fresh buffer. */
                btrfs_set_header_bytenr(eb, eb->start);
                btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
                btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
                btrfs_set_header_level(eb, old_root->level);
                btrfs_set_header_generation(eb, old_generation);
        }
        if (tm)
                __tree_mod_log_rewind(fs_info, eb, time_seq, tm);
        else
                WARN_ON(btrfs_header_level(eb) != 0);
        WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

        return eb;
}
1413
1414int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1415{
1416        struct tree_mod_elem *tm;
1417        int level;
1418        struct extent_buffer *eb_root = btrfs_root_node(root);
1419
1420        tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1421        if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1422                level = tm->old_root.level;
1423        } else {
1424                level = btrfs_header_level(eb_root);
1425        }
1426        free_extent_buffer(eb_root);
1427
1428        return level;
1429}
1430
1431static inline int should_cow_block(struct btrfs_trans_handle *trans,
1432                                   struct btrfs_root *root,
1433                                   struct extent_buffer *buf)
1434{
1435        if (btrfs_is_testing(root->fs_info))
1436                return 0;
1437
1438        /* Ensure we can see the FORCE_COW bit */
1439        smp_mb__before_atomic();
1440
1441        /*
1442         * We do not need to cow a block if
1443         * 1) this block is not created or changed in this transaction;
1444         * 2) this block does not belong to TREE_RELOC tree;
1445         * 3) the root is not forced COW.
1446         *
1447         * What is forced COW:
1448         *    when we create snapshot during committing the transaction,
1449         *    after we've finished copying src root, we must COW the shared
1450         *    block to ensure the metadata consistency.
1451         */
1452        if (btrfs_header_generation(buf) == trans->transid &&
1453            !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1454            !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1455              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1456            !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1457                return 0;
1458        return 1;
1459}
1460
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 *
 * On success *cow_ret points at the (possibly unchanged) buffer to modify.
 * Returns 0 on success or a negative errno from __btrfs_cow_block.
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
                    struct btrfs_root *root, struct extent_buffer *buf,
                    struct extent_buffer *parent, int parent_slot,
                    struct extent_buffer **cow_ret)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 search_start;
        int ret;

        /* Sanity checks: COWing a dying root or a stale transaction is a bug. */
        if (test_bit(BTRFS_ROOT_DELETING, &root->state))
                btrfs_err(fs_info,
                        "COW'ing blocks on a fs root that's being dropped");

        if (trans->transaction != fs_info->running_transaction)
                WARN(1, KERN_CRIT "trans %llu running %llu\n",
                       trans->transid,
                       fs_info->running_transaction->transid);

        if (trans->transid != fs_info->generation)
                WARN(1, KERN_CRIT "trans %llu running %llu\n",
                       trans->transid, fs_info->generation);

        if (!should_cow_block(trans, root, buf)) {
                /* Block was already COWed in this transaction: reuse it. */
                trans->dirty = true;
                *cow_ret = buf;
                return 0;
        }

        /* Hint the allocator towards the 1GiB-aligned area of the old block. */
        search_start = buf->start & ~((u64)SZ_1G - 1);

        if (parent)
                btrfs_set_lock_blocking_write(parent);
        btrfs_set_lock_blocking_write(buf);

        /*
         * Before CoWing this block for later modification, check if it's
         * the subtree root and do the delayed subtree trace if needed.
         *
         * Also We don't care about the error, as it's handled internally.
         */
        btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
        ret = __btrfs_cow_block(trans, root, buf, parent,
                                 parent_slot, cow_ret, search_start, 0);

        trace_btrfs_cow_block(root, buf, *cow_ret);

        return ret;
}
1514
1515/*
1516 * helper function for defrag to decide if two blocks pointed to by a
1517 * node are actually close by
1518 */
1519static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1520{
1521        if (blocknr < other && other - (blocknr + blocksize) < 32768)
1522                return 1;
1523        if (blocknr > other && blocknr - (other + blocksize) < 32768)
1524                return 1;
1525        return 0;
1526}
1527
1528/*
1529 * compare two keys in a memcmp fashion
1530 */
1531static int comp_keys(const struct btrfs_disk_key *disk,
1532                     const struct btrfs_key *k2)
1533{
1534        struct btrfs_key k1;
1535
1536        btrfs_disk_key_to_cpu(&k1, disk);
1537
1538        return btrfs_comp_cpu_keys(&k1, k2);
1539}
1540
1541/*
1542 * same as comp_keys only with two btrfs_key's
1543 */
1544int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1545{
1546        if (k1->objectid > k2->objectid)
1547                return 1;
1548        if (k1->objectid < k2->objectid)
1549                return -1;
1550        if (k1->type > k2->type)
1551                return 1;
1552        if (k1->type < k2->type)
1553                return -1;
1554        if (k1->offset > k2->offset)
1555                return 1;
1556        if (k1->offset < k2->offset)
1557                return -1;
1558        return 0;
1559}
1560
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 *
 * @trans:      running transaction; must match fs_info's running
 *              transaction and generation (WARN_ON'd below)
 * @root:       tree the node belongs to
 * @parent:     node whose children get relocated
 * @start_slot: first slot in @parent to consider
 * @last_ret:   in/out free-space search hint; updated to the start of the
 *              last block we relocated
 * @progress:   children whose key sorts before this are skipped
 *
 * Returns 0 on success or a negative errno.
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	/* with fewer than two children there is nothing to reorder */
	if (parent_nritems <= 1)
		return 0;

	btrfs_set_lock_blocking_write(parent);

	for (i = start_slot; i <= end_slot; i++) {
		struct btrfs_key first_key;
		int close = 1;

		/* skip children that sort before the defrag progress point */
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		btrfs_node_key_to_cpu(parent, &first_key, i);
		if (last_block == 0)
			last_block = blocknr;

		/*
		 * leave the child alone when it is already close on disk
		 * to either of its neighbors
		 */
		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		/* get the child, preferring the in-memory cache */
		cur = find_extent_buffer(fs_info, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(fs_info, blocknr, gen,
						      parent_level - 1,
						      &first_key);
				if (IS_ERR(cur)) {
					return PTR_ERR(cur);
				} else if (!extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen,
						parent_level - 1,&first_key);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking_write(cur);
		/*
		 * COW the child near search_start; the hint window is capped
		 * by how many children remain after this slot
		 */
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
1677
1678/*
1679 * search for key in the extent_buffer.  The items start at offset p,
1680 * and they are item_size apart.  There are 'max' items in p.
1681 *
1682 * the slot in the array is returned via slot, and it points to
1683 * the place where you would insert key if it is not found in
1684 * the array.
1685 *
1686 * slot may point to max if the key is bigger than all of the keys
1687 */
1688static noinline int generic_bin_search(struct extent_buffer *eb,
1689                                       unsigned long p, int item_size,
1690                                       const struct btrfs_key *key,
1691                                       int max, int *slot)
1692{
1693        int low = 0;
1694        int high = max;
1695        int mid;
1696        int ret;
1697        struct btrfs_disk_key *tmp = NULL;
1698        struct btrfs_disk_key unaligned;
1699        unsigned long offset;
1700        char *kaddr = NULL;
1701        unsigned long map_start = 0;
1702        unsigned long map_len = 0;
1703        int err;
1704
1705        if (low > high) {
1706                btrfs_err(eb->fs_info,
1707                 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1708                          __func__, low, high, eb->start,
1709                          btrfs_header_owner(eb), btrfs_header_level(eb));
1710                return -EINVAL;
1711        }
1712
1713        while (low < high) {
1714                mid = (low + high) / 2;
1715                offset = p + mid * item_size;
1716
1717                if (!kaddr || offset < map_start ||
1718                    (offset + sizeof(struct btrfs_disk_key)) >
1719                    map_start + map_len) {
1720
1721                        err = map_private_extent_buffer(eb, offset,
1722                                                sizeof(struct btrfs_disk_key),
1723                                                &kaddr, &map_start, &map_len);
1724
1725                        if (!err) {
1726                                tmp = (struct btrfs_disk_key *)(kaddr + offset -
1727                                                        map_start);
1728                        } else if (err == 1) {
1729                                read_extent_buffer(eb, &unaligned,
1730                                                   offset, sizeof(unaligned));
1731                                tmp = &unaligned;
1732                        } else {
1733                                return err;
1734                        }
1735
1736                } else {
1737                        tmp = (struct btrfs_disk_key *)(kaddr + offset -
1738                                                        map_start);
1739                }
1740                ret = comp_keys(tmp, key);
1741
1742                if (ret < 0)
1743                        low = mid + 1;
1744                else if (ret > 0)
1745                        high = mid;
1746                else {
1747                        *slot = mid;
1748                        return 0;
1749                }
1750        }
1751        *slot = low;
1752        return 1;
1753}
1754
1755/*
1756 * simple bin_search frontend that does the right thing for
1757 * leaves vs nodes
1758 */
1759int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1760                     int level, int *slot)
1761{
1762        if (level == 0)
1763                return generic_bin_search(eb,
1764                                          offsetof(struct btrfs_leaf, items),
1765                                          sizeof(struct btrfs_item),
1766                                          key, btrfs_header_nritems(eb),
1767                                          slot);
1768        else
1769                return generic_bin_search(eb,
1770                                          offsetof(struct btrfs_node, ptrs),
1771                                          sizeof(struct btrfs_key_ptr),
1772                                          key, btrfs_header_nritems(eb),
1773                                          slot);
1774}
1775
1776static void root_add_used(struct btrfs_root *root, u32 size)
1777{
1778        spin_lock(&root->accounting_lock);
1779        btrfs_set_root_used(&root->root_item,
1780                            btrfs_root_used(&root->root_item) + size);
1781        spin_unlock(&root->accounting_lock);
1782}
1783
1784static void root_sub_used(struct btrfs_root *root, u32 size)
1785{
1786        spin_lock(&root->accounting_lock);
1787        btrfs_set_root_used(&root->root_item,
1788                            btrfs_root_used(&root->root_item) - size);
1789        spin_unlock(&root->accounting_lock);
1790}
1791
1792/* given a node and slot number, this reads the blocks it points to.  The
1793 * extent buffer is returned with a reference taken (but unlocked).
1794 */
1795static noinline struct extent_buffer *
1796read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
1797               int slot)
1798{
1799        int level = btrfs_header_level(parent);
1800        struct extent_buffer *eb;
1801        struct btrfs_key first_key;
1802
1803        if (slot < 0 || slot >= btrfs_header_nritems(parent))
1804                return ERR_PTR(-ENOENT);
1805
1806        BUG_ON(level == 0);
1807
1808        btrfs_node_key_to_cpu(parent, &first_key, slot);
1809        eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
1810                             btrfs_node_ptr_generation(parent, slot),
1811                             level - 1, &first_key);
1812        if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1813                free_extent_buffer(eb);
1814                eb = ERR_PTR(-EIO);
1815        }
1816
1817        return eb;
1818}
1819
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave an node completely empty later on.
 *
 * On return the path at @level may have been switched to the left sibling
 * if items were shifted there.  Returns 0 on success or a negative errno;
 * note the "enospc" label is the common exit for all errors, not just
 * -ENOSPC.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	/* the caller must hold a write lock on the node being balanced */
	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	/* remembered so we can sanity check the path at the end */
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = read_node_slot(fs_info, mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking_write(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		/* record the root swap for tree mod log readers */
		ret = tree_mod_log_insert_root(root->node, child, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		clean_tree_block(fs_info, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		/* the old root-pointer reference keeps mid alive here */
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	/* node is more than a quarter full: balanced enough, do nothing */
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	/* a sibling that fails to read is simply treated as absent */
	left = read_node_slot(fs_info, parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking_write(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	right = read_node_slot(fs_info, parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking_write(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		/* re-express orig_slot relative to left before the push */
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, fs_info, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, fs_info, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			/* right is now empty: unlink it and free the block */
			clean_tree_block(fs_info, right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			/* right shrank: refresh its first key in the parent */
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			/* invariant violated: treat as corruption */
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}
		wret = balance_node_right(trans, fs_info, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			/* nothing fit going right, push mid into left instead */
			wret = push_node_left(trans, fs_info, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		/* mid emptied out: unlink and free it like right above */
		clean_tree_block(fs_info, mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		ret = tree_mod_log_insert_key(parent, pslot,
				MOD_LOG_KEY_REPLACE, GFP_NOFS);
		BUG_ON(ret < 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			/* the original slot migrated into the left sibling */
			extent_buffer_get(left);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	/* common exit: release any sibling locks/references still held */
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
2048
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 *
 * Returns 0 when room was made (and the path updated to the node that
 * now holds the target slot), 1 when nothing could be pushed (caller
 * must split), or a negative errno on error.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/* at the root there are no siblings to push to */
	if (!parent)
		return 1;

	/* a sibling that fails to read is simply treated as absent */
	left = read_node_slot(fs_info, parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking_write(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* left has no room either */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, fs_info,
						      left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* pushed left: fix up parent key and the path */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				/* the target slot now lives in left */
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = read_node_slot(fs_info, parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking_write(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* right has no room either */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, fs_info,
							  right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* pushed right: fix up parent key and the path */
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				/* the target slot now lives in right */
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}
2191
2192/*
2193 * readahead one full node of leaves, finding things that are close
2194 * to the block in 'slot', and triggering ra on them.
2195 */
2196static void reada_for_search(struct btrfs_fs_info *fs_info,
2197                             struct btrfs_path *path,
2198                             int level, int slot, u64 objectid)
2199{
2200        struct extent_buffer *node;
2201        struct btrfs_disk_key disk_key;
2202        u32 nritems;
2203        u64 search;
2204        u64 target;
2205        u64 nread = 0;
2206        struct extent_buffer *eb;
2207        u32 nr;
2208        u32 blocksize;
2209        u32 nscan = 0;
2210
2211        if (level != 1)
2212                return;
2213
2214        if (!path->nodes[level])
2215                return;
2216
2217        node = path->nodes[level];
2218
2219        search = btrfs_node_blockptr(node, slot);
2220        blocksize = fs_info->nodesize;
2221        eb = find_extent_buffer(fs_info, search);
2222        if (eb) {
2223                free_extent_buffer(eb);
2224                return;
2225        }
2226
2227        target = search;
2228
2229        nritems = btrfs_header_nritems(node);
2230        nr = slot;
2231
2232        while (1) {
2233                if (path->reada == READA_BACK) {
2234                        if (nr == 0)
2235                                break;
2236                        nr--;
2237                } else if (path->reada == READA_FORWARD) {
2238                        nr++;
2239                        if (nr >= nritems)
2240                                break;
2241                }
2242                if (path->reada == READA_BACK && objectid) {
2243                        btrfs_node_key(node, &disk_key, nr);
2244                        if (btrfs_disk_key_objectid(&disk_key) != objectid)
2245                                break;
2246                }
2247                search = btrfs_node_blockptr(node, nr);
2248                if ((search <= target && target - search <= 65536) ||
2249                    (search > target && search - target <= 65536)) {
2250                        readahead_tree_block(fs_info, search);
2251                        nread += blocksize;
2252                }
2253                nscan++;
2254                if ((nread > 65536 || nscan > 32))
2255                        break;
2256        }
2257}
2258
2259static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2260                                       struct btrfs_path *path, int level)
2261{
2262        int slot;
2263        int nritems;
2264        struct extent_buffer *parent;
2265        struct extent_buffer *eb;
2266        u64 gen;
2267        u64 block1 = 0;
2268        u64 block2 = 0;
2269
2270        parent = path->nodes[level + 1];
2271        if (!parent)
2272                return;
2273
2274        nritems = btrfs_header_nritems(parent);
2275        slot = path->slots[level + 1];
2276
2277        if (slot > 0) {
2278                block1 = btrfs_node_blockptr(parent, slot - 1);
2279                gen = btrfs_node_ptr_generation(parent, slot - 1);
2280                eb = find_extent_buffer(fs_info, block1);
2281                /*
2282                 * if we get -eagain from btrfs_buffer_uptodate, we
2283                 * don't want to return eagain here.  That will loop
2284                 * forever
2285                 */
2286                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2287                        block1 = 0;
2288                free_extent_buffer(eb);
2289        }
2290        if (slot + 1 < nritems) {
2291                block2 = btrfs_node_blockptr(parent, slot + 1);
2292                gen = btrfs_node_ptr_generation(parent, slot + 1);
2293                eb = find_extent_buffer(fs_info, block2);
2294                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2295                        block2 = 0;
2296                free_extent_buffer(eb);
2297        }
2298
2299        if (block1)
2300                readahead_tree_block(fs_info, block1);
2301        if (block2)
2302                readahead_tree_block(fs_info, block2);
2303}
2304
2305
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	/* levels at or below skip_level keep their locks */
	int skip_level = level;
	/* once set, slot-0/keep_locks no longer raise skip_level */
	int no_skips = 0;
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		/* slot 0 may need a key update in the parent: keep the lock */
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		/* keep_locks: also hold the lock on last-slot positions */
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			/* lower the caller's write-lock watermark as we go */
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
2361
2362/*
2363 * This releases any locks held in the path starting at level and
2364 * going all the way up to the root.
2365 *
2366 * btrfs_search_slot will keep the lock held on higher nodes in a few
2367 * corner cases, such as COW of the block at slot zero in the node.  This
2368 * ignores those rules, and it should only be called when there are no
2369 * more updates to be done higher up in the tree.
2370 */
2371noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2372{
2373        int i;
2374
2375        if (path->keep_locks)
2376                return;
2377
2378        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2379                if (!path->nodes[i])
2380                        continue;
2381                if (!path->locks[i])
2382                        continue;
2383                btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2384                path->locks[i] = 0;
2385        }
2386}
2387
2388/*
2389 * helper function for btrfs_search_slot.  The goal is to find a block
2390 * in cache without setting the path to blocking.  If we find the block
2391 * we return zero and the path is unchanged.
2392 *
2393 * If we can't find the block, we set the path blocking and do some
2394 * reada.  -EAGAIN is returned and the search must be repeated.
2395 */
2396static int
2397read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2398                      struct extent_buffer **eb_ret, int level, int slot,
2399                      const struct btrfs_key *key)
2400{
2401        struct btrfs_fs_info *fs_info = root->fs_info;
2402        u64 blocknr;
2403        u64 gen;
2404        struct extent_buffer *b = *eb_ret;
2405        struct extent_buffer *tmp;
2406        struct btrfs_key first_key;
2407        int ret;
2408        int parent_level;
2409
2410        blocknr = btrfs_node_blockptr(b, slot);
2411        gen = btrfs_node_ptr_generation(b, slot);
2412        parent_level = btrfs_header_level(b);
2413        btrfs_node_key_to_cpu(b, &first_key, slot);
2414
2415        tmp = find_extent_buffer(fs_info, blocknr);
2416        if (tmp) {
2417                /* first we do an atomic uptodate check */
2418                if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2419                        *eb_ret = tmp;
2420                        return 0;
2421                }
2422
2423                /* the pages were up to date, but we failed
2424                 * the generation number check.  Do a full
2425                 * read for the generation number that is correct.
2426                 * We must do this without dropping locks so
2427                 * we can trust our generation number
2428                 */
2429                btrfs_set_path_blocking(p);
2430
2431                /* now we're allowed to do a blocking uptodate check */
2432                ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
2433                if (!ret) {
2434                        *eb_ret = tmp;
2435                        return 0;
2436                }
2437                free_extent_buffer(tmp);
2438                btrfs_release_path(p);
2439                return -EIO;
2440        }
2441
2442        /*
2443         * reduce lock contention at high levels
2444         * of the btree by dropping locks before
2445         * we read.  Don't release the lock on the current
2446         * level because we need to walk this node to figure
2447         * out which blocks to read.
2448         */
2449        btrfs_unlock_up_safe(p, level + 1);
2450        btrfs_set_path_blocking(p);
2451
2452        if (p->reada != READA_NONE)
2453                reada_for_search(fs_info, p, level, slot, key->objectid);
2454
2455        ret = -EAGAIN;
2456        tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
2457                              &first_key);
2458        if (!IS_ERR(tmp)) {
2459                /*
2460                 * If the read above didn't mark this buffer up to date,
2461                 * it will never end up being up to date.  Set ret to EIO now
2462                 * and give up so that our caller doesn't loop forever
2463                 * on our EAGAINs.
2464                 */
2465                if (!extent_buffer_uptodate(tmp))
2466                        ret = -EIO;
2467                free_extent_buffer(tmp);
2468        } else {
2469                ret = PTR_ERR(tmp);
2470        }
2471
2472        btrfs_release_path(p);
2473        return ret;
2474}
2475
2476/*
2477 * helper function for btrfs_search_slot.  This does all of the checks
2478 * for node-level blocks and does any balancing required based on
2479 * the ins_len.
2480 *
2481 * If no extra work was required, zero is returned.  If we had to
2482 * drop the path, -EAGAIN is returned and btrfs_search_slot must
2483 * start over
2484 */
2485static int
2486setup_nodes_for_search(struct btrfs_trans_handle *trans,
2487                       struct btrfs_root *root, struct btrfs_path *p,
2488                       struct extent_buffer *b, int level, int ins_len,
2489                       int *write_lock_level)
2490{
2491        struct btrfs_fs_info *fs_info = root->fs_info;
2492        int ret;
2493
2494        if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2495            BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2496                int sret;
2497
2498                if (*write_lock_level < level + 1) {
2499                        *write_lock_level = level + 1;
2500                        btrfs_release_path(p);
2501                        goto again;
2502                }
2503
2504                btrfs_set_path_blocking(p);
2505                reada_for_balance(fs_info, p, level);
2506                sret = split_node(trans, root, p, level);
2507
2508                BUG_ON(sret > 0);
2509                if (sret) {
2510                        ret = sret;
2511                        goto done;
2512                }
2513                b = p->nodes[level];
2514        } else if (ins_len < 0 && btrfs_header_nritems(b) <
2515                   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2516                int sret;
2517
2518                if (*write_lock_level < level + 1) {
2519                        *write_lock_level = level + 1;
2520                        btrfs_release_path(p);
2521                        goto again;
2522                }
2523
2524                btrfs_set_path_blocking(p);
2525                reada_for_balance(fs_info, p, level);
2526                sret = balance_level(trans, root, p, level);
2527
2528                if (sret) {
2529                        ret = sret;
2530                        goto done;
2531                }
2532                b = p->nodes[level];
2533                if (!b) {
2534                        btrfs_release_path(p);
2535                        goto again;
2536                }
2537                BUG_ON(btrfs_header_nritems(b) == 1);
2538        }
2539        return 0;
2540
2541again:
2542        ret = -EAGAIN;
2543done:
2544        return ret;
2545}
2546
/*
 * Locate @key within node @b, storing the slot in *slot.
 *
 * While an earlier level compared equal (*prev_cmp == 0), every lower
 * node is known to hold the key in its first slot, so the binary
 * search can be skipped entirely.  Otherwise fall back to
 * btrfs_bin_search() and record its verdict in *prev_cmp.
 */
static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
                      int level, int *prev_cmp, int *slot)
{
        if (*prev_cmp == 0) {
                /* An ancestor matched exactly: the key must be at slot 0. */
                *slot = 0;
                return 0;
        }

        *prev_cmp = btrfs_bin_search(b, key, level, slot);
        return *prev_cmp;
}
2559
2560int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2561                u64 iobjectid, u64 ioff, u8 key_type,
2562                struct btrfs_key *found_key)
2563{
2564        int ret;
2565        struct btrfs_key key;
2566        struct extent_buffer *eb;
2567
2568        ASSERT(path);
2569        ASSERT(found_key);
2570
2571        key.type = key_type;
2572        key.objectid = iobjectid;
2573        key.offset = ioff;
2574
2575        ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2576        if (ret < 0)
2577                return ret;
2578
2579        eb = path->nodes[0];
2580        if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2581                ret = btrfs_next_leaf(fs_root, path);
2582                if (ret)
2583                        return ret;
2584                eb = path->nodes[0];
2585        }
2586
2587        btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2588        if (found_key->type != key.type ||
2589                        found_key->objectid != key.objectid)
2590                return 1;
2591
2592        return 0;
2593}
2594
/*
 * Get the root node of @root for a search, locked appropriately for the
 * requested @write_lock_level.
 *
 * For commit-root searches no locking is done (the caller must hold or
 * clone under commit_root_sem); otherwise a read lock is preferred and
 * only traded for a write lock when the root's level is at or below
 * @write_lock_level.  On return p->nodes[level] (and, unless locking is
 * skipped, p->locks[level]) are set.  Returns ERR_PTR(-ENOMEM) if the
 * commit root clone fails; callers must drop the returned reference.
 */
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
                                                        struct btrfs_path *p,
                                                        int write_lock_level)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *b;
        int root_lock;
        int level = 0;

        /* We try very hard to do read locks on the root */
        root_lock = BTRFS_READ_LOCK;

        if (p->search_commit_root) {
                /*
                 * The commit roots are read only so we always do read locks,
                 * and we always must hold the commit_root_sem when doing
                 * searches on them, the only exception is send where we don't
                 * want to block transaction commits for a long time, so
                 * we need to clone the commit root in order to avoid races
                 * with transaction commits that create a snapshot of one of
                 * the roots used by a send operation.
                 */
                if (p->need_commit_sem) {
                        down_read(&fs_info->commit_root_sem);
                        b = btrfs_clone_extent_buffer(root->commit_root);
                        up_read(&fs_info->commit_root_sem);
                        if (!b)
                                return ERR_PTR(-ENOMEM);

                } else {
                        b = root->commit_root;
                        extent_buffer_get(b);
                }
                level = btrfs_header_level(b);
                /*
                 * Ensure that all callers have set skip_locking when
                 * p->search_commit_root = 1.
                 */
                ASSERT(p->skip_locking == 1);

                goto out;
        }

        if (p->skip_locking) {
                b = btrfs_root_node(root);
                level = btrfs_header_level(b);
                goto out;
        }

        /*
         * If the level is set to maximum, we can skip trying to get the read
         * lock.
         */
        if (write_lock_level < BTRFS_MAX_LEVEL) {
                /*
                 * We don't know the level of the root node until we actually
                 * have it read locked
                 */
                b = btrfs_read_lock_root_node(root);
                level = btrfs_header_level(b);
                if (level > write_lock_level)
                        goto out;

                /* Whoops, must trade for write lock */
                btrfs_tree_read_unlock(b);
                free_extent_buffer(b);
        }

        b = btrfs_lock_root_node(root);
        root_lock = BTRFS_WRITE_LOCK;

        /* The level might have changed, check again */
        level = btrfs_header_level(b);

out:
        p->nodes[level] = b;
        if (!p->skip_locking)
                p->locks[level] = root_lock;
        /*
         * Callers are responsible for dropping b's references.
         */
        return b;
}
2678
2679
2680/*
2681 * btrfs_search_slot - look for a key in a tree and perform necessary
2682 * modifications to preserve tree invariants.
2683 *
2684 * @trans:      Handle of transaction, used when modifying the tree
2685 * @p:          Holds all btree nodes along the search path
2686 * @root:       The root node of the tree
2687 * @key:        The key we are looking for
2688 * @ins_len:    Indicates purpose of search, for inserts it is 1, for
2689 *              deletions it's -1. 0 for plain searches
2690 * @cow:        boolean should CoW operations be performed. Must always be 1
2691 *              when modifying the tree.
2692 *
2693 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2694 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2695 *
2696 * If @key is found, 0 is returned and you can find the item in the leaf level
2697 * of the path (level 0)
2698 *
2699 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2700 * points to the slot where it should be inserted
2701 *
2702 * If an error is encountered while searching the tree a negative error number
2703 * is returned
2704 */
2705int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2706                      const struct btrfs_key *key, struct btrfs_path *p,
2707                      int ins_len, int cow)
2708{
2709        struct btrfs_fs_info *fs_info = root->fs_info;
2710        struct extent_buffer *b;
2711        int slot;
2712        int ret;
2713        int err;
2714        int level;
2715        int lowest_unlock = 1;
2716        /* everything at write_lock_level or lower must be write locked */
2717        int write_lock_level = 0;
2718        u8 lowest_level = 0;
2719        int min_write_lock_level;
2720        int prev_cmp;
2721
2722        lowest_level = p->lowest_level;
2723        WARN_ON(lowest_level && ins_len > 0);
2724        WARN_ON(p->nodes[0] != NULL);
2725        BUG_ON(!cow && ins_len);
2726
2727        if (ins_len < 0) {
2728                lowest_unlock = 2;
2729
2730                /* when we are removing items, we might have to go up to level
2731                 * two as we update tree pointers  Make sure we keep write
2732                 * for those levels as well
2733                 */
2734                write_lock_level = 2;
2735        } else if (ins_len > 0) {
2736                /*
2737                 * for inserting items, make sure we have a write lock on
2738                 * level 1 so we can update keys
2739                 */
2740                write_lock_level = 1;
2741        }
2742
2743        if (!cow)
2744                write_lock_level = -1;
2745
2746        if (cow && (p->keep_locks || p->lowest_level))
2747                write_lock_level = BTRFS_MAX_LEVEL;
2748
2749        min_write_lock_level = write_lock_level;
2750
2751again:
2752        prev_cmp = -1;
2753        b = btrfs_search_slot_get_root(root, p, write_lock_level);
2754        if (IS_ERR(b)) {
2755                ret = PTR_ERR(b);
2756                goto done;
2757        }
2758
2759        while (b) {
2760                level = btrfs_header_level(b);
2761
2762                /*
2763                 * setup the path here so we can release it under lock
2764                 * contention with the cow code
2765                 */
2766                if (cow) {
2767                        bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2768
2769                        /*
2770                         * if we don't really need to cow this block
2771                         * then we don't want to set the path blocking,
2772                         * so we test it here
2773                         */
2774                        if (!should_cow_block(trans, root, b)) {
2775                                trans->dirty = true;
2776                                goto cow_done;
2777                        }
2778
2779                        /*
2780                         * must have write locks on this node and the
2781                         * parent
2782                         */
2783                        if (level > write_lock_level ||
2784                            (level + 1 > write_lock_level &&
2785                            level + 1 < BTRFS_MAX_LEVEL &&
2786                            p->nodes[level + 1])) {
2787                                write_lock_level = level + 1;
2788                                btrfs_release_path(p);
2789                                goto again;
2790                        }
2791
2792                        btrfs_set_path_blocking(p);
2793                        if (last_level)
2794                                err = btrfs_cow_block(trans, root, b, NULL, 0,
2795                                                      &b);
2796                        else
2797                                err = btrfs_cow_block(trans, root, b,
2798                                                      p->nodes[level + 1],
2799                                                      p->slots[level + 1], &b);
2800                        if (err) {
2801                                ret = err;
2802                                goto done;
2803                        }
2804                }
2805cow_done:
2806                p->nodes[level] = b;
2807                /*
2808                 * Leave path with blocking locks to avoid massive
2809                 * lock context switch, this is made on purpose.
2810                 */
2811
2812                /*
2813                 * we have a lock on b and as long as we aren't changing
2814                 * the tree, there is no way to for the items in b to change.
2815                 * It is safe to drop the lock on our parent before we
2816                 * go through the expensive btree search on b.
2817                 *
2818                 * If we're inserting or deleting (ins_len != 0), then we might
2819                 * be changing slot zero, which may require changing the parent.
2820                 * So, we can't drop the lock until after we know which slot
2821                 * we're operating on.
2822                 */
2823                if (!ins_len && !p->keep_locks) {
2824                        int u = level + 1;
2825
2826                        if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2827                                btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2828                                p->locks[u] = 0;
2829                        }
2830                }
2831
2832                ret = key_search(b, key, level, &prev_cmp, &slot);
2833                if (ret < 0)
2834                        goto done;
2835
2836                if (level != 0) {
2837                        int dec = 0;
2838                        if (ret && slot > 0) {
2839                                dec = 1;
2840                                slot -= 1;
2841                        }
2842                        p->slots[level] = slot;
2843                        err = setup_nodes_for_search(trans, root, p, b, level,
2844                                             ins_len, &write_lock_level);
2845                        if (err == -EAGAIN)
2846                                goto again;
2847                        if (err) {
2848                                ret = err;
2849                                goto done;
2850                        }
2851                        b = p->nodes[level];
2852                        slot = p->slots[level];
2853
2854                        /*
2855                         * slot 0 is special, if we change the key
2856                         * we have to update the parent pointer
2857                         * which means we must have a write lock
2858                         * on the parent
2859                         */
2860                        if (slot == 0 && ins_len &&
2861                            write_lock_level < level + 1) {
2862                                write_lock_level = level + 1;
2863                                btrfs_release_path(p);
2864                                goto again;
2865                        }
2866
2867                        unlock_up(p, level, lowest_unlock,
2868                                  min_write_lock_level, &write_lock_level);
2869
2870                        if (level == lowest_level) {
2871                                if (dec)
2872                                        p->slots[level]++;
2873                                goto done;
2874                        }
2875
2876                        err = read_block_for_search(root, p, &b, level,
2877                                                    slot, key);
2878                        if (err == -EAGAIN)
2879                                goto again;
2880                        if (err) {
2881                                ret = err;
2882                                goto done;
2883                        }
2884
2885                        if (!p->skip_locking) {
2886                                level = btrfs_header_level(b);
2887                                if (level <= write_lock_level) {
2888                                        err = btrfs_try_tree_write_lock(b);
2889                                        if (!err) {
2890                                                btrfs_set_path_blocking(p);
2891                                                btrfs_tree_lock(b);
2892                                        }
2893                                        p->locks[level] = BTRFS_WRITE_LOCK;
2894                                } else {
2895                                        err = btrfs_tree_read_lock_atomic(b);
2896                                        if (!err) {
2897                                                btrfs_set_path_blocking(p);
2898                                                btrfs_tree_read_lock(b);
2899                                        }
2900                                        p->locks[level] = BTRFS_READ_LOCK;
2901                                }
2902                                p->nodes[level] = b;
2903                        }
2904                } else {
2905                        p->slots[level] = slot;
2906                        if (ins_len > 0 &&
2907                            btrfs_leaf_free_space(fs_info, b) < ins_len) {
2908                                if (write_lock_level < 1) {
2909                                        write_lock_level = 1;
2910                                        btrfs_release_path(p);
2911                                        goto again;
2912                                }
2913
2914                                btrfs_set_path_blocking(p);
2915                                err = split_leaf(trans, root, key,
2916                                                 p, ins_len, ret == 0);
2917
2918                                BUG_ON(err > 0);
2919                                if (err) {
2920                                        ret = err;
2921                                        goto done;
2922                                }
2923                        }
2924                        if (!p->search_for_split)
2925                                unlock_up(p, level, lowest_unlock,
2926                                          min_write_lock_level, NULL);
2927                        goto done;
2928                }
2929        }
2930        ret = 1;
2931done:
2932        /*
2933         * we don't really know what they plan on doing with the path
2934         * from here on, so for now just mark it as blocking
2935         */
2936        if (!p->leave_spinning)
2937                btrfs_set_path_blocking(p);
2938        if (ret < 0 && !p->skip_release_on_error)
2939                btrfs_release_path(p);
2940        return ret;
2941}
2942
2943/*
2944 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2945 * current state of the tree together with the operations recorded in the tree
2946 * modification log to search for the key in a previous version of this tree, as
2947 * denoted by the time_seq parameter.
2948 *
2949 * Naturally, there is no support for insert, delete or cow operations.
2950 *
2951 * The resulting path and return value will be set up as if we called
2952 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2953 */
2954int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2955                          struct btrfs_path *p, u64 time_seq)
2956{
2957        struct btrfs_fs_info *fs_info = root->fs_info;
2958        struct extent_buffer *b;
2959        int slot;
2960        int ret;
2961        int err;
2962        int level;
2963        int lowest_unlock = 1;
2964        u8 lowest_level = 0;
2965        int prev_cmp = -1;
2966
2967        lowest_level = p->lowest_level;
2968        WARN_ON(p->nodes[0] != NULL);
2969
2970        if (p->search_commit_root) {
2971                BUG_ON(time_seq);
2972                return btrfs_search_slot(NULL, root, key, p, 0, 0);
2973        }
2974
2975again:
2976        b = get_old_root(root, time_seq);
2977        if (!b) {
2978                ret = -EIO;
2979                goto done;
2980        }
2981        level = btrfs_header_level(b);
2982        p->locks[level] = BTRFS_READ_LOCK;
2983
2984        while (b) {
2985                level = btrfs_header_level(b);
2986                p->nodes[level] = b;
2987
2988                /*
2989                 * we have a lock on b and as long as we aren't changing
2990                 * the tree, there is no way to for the items in b to change.
2991                 * It is safe to drop the lock on our parent before we
2992                 * go through the expensive btree search on b.
2993                 */
2994                btrfs_unlock_up_safe(p, level + 1);
2995
2996                /*
2997                 * Since we can unwind ebs we want to do a real search every
2998                 * time.
2999                 */
3000                prev_cmp = -1;
3001                ret = key_search(b, key, level, &prev_cmp, &slot);
3002                if (ret < 0)
3003                        goto done;
3004
3005                if (level != 0) {
3006                        int dec = 0;
3007                        if (ret && slot > 0) {
3008                                dec = 1;
3009                                slot -= 1;
3010                        }
3011                        p->slots[level] = slot;
3012                        unlock_up(p, level, lowest_unlock, 0, NULL);
3013
3014                        if (level == lowest_level) {
3015                                if (dec)
3016                                        p->slots[level]++;
3017                                goto done;
3018                        }
3019
3020                        err = read_block_for_search(root, p, &b, level,
3021                                                    slot, key);
3022                        if (err == -EAGAIN)
3023                                goto again;
3024                        if (err) {
3025                                ret = err;
3026                                goto done;
3027                        }
3028
3029                        level = btrfs_header_level(b);
3030                        err = btrfs_tree_read_lock_atomic(b);
3031                        if (!err) {
3032                                btrfs_set_path_blocking(p);
3033                                btrfs_tree_read_lock(b);
3034                        }
3035                        b = tree_mod_log_rewind(fs_info, p, b, time_seq);
3036                        if (!b) {
3037                                ret = -ENOMEM;
3038                                goto done;
3039                        }
3040                        p->locks[level] = BTRFS_READ_LOCK;
3041                        p->nodes[level] = b;
3042                } else {
3043                        p->slots[level] = slot;
3044                        unlock_up(p, level, lowest_unlock, 0, NULL);
3045                        goto done;
3046                }
3047        }
3048        ret = 1;
3049done:
3050        if (!p->leave_spinning)
3051                btrfs_set_path_blocking(p);
3052        if (ret < 0)
3053                btrfs_release_path(p);
3054
3055        return ret;
3056}
3057
3058/*
3059 * helper to use instead of search slot if no exact match is needed but
3060 * instead the next or previous item should be returned.
3061 * When find_higher is true, the next higher item is returned, the next lower
3062 * otherwise.
3063 * When return_any and find_higher are both true, and no higher item is found,
3064 * return the next lower instead.
3065 * When return_any is true and find_higher is false, and no lower item is found,
3066 * return the next higher instead.
3067 * It returns 0 if any item is found, 1 if none is found (tree empty), and
3068 * < 0 on error
3069 */
3070int btrfs_search_slot_for_read(struct btrfs_root *root,
3071                               const struct btrfs_key *key,
3072                               struct btrfs_path *p, int find_higher,
3073                               int return_any)
3074{
3075        int ret;
3076        struct extent_buffer *leaf;
3077
3078again:
3079        ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3080        if (ret <= 0)
3081                return ret;
3082        /*
3083         * a return value of 1 means the path is at the position where the
3084         * item should be inserted. Normally this is the next bigger item,
3085         * but in case the previous item is the last in a leaf, path points
3086         * to the first free slot in the previous leaf, i.e. at an invalid
3087         * item.
3088         */
3089        leaf = p->nodes[0];
3090
3091        if (find_higher) {
3092                if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3093                        ret = btrfs_next_leaf(root, p);
3094                        if (ret <= 0)
3095                                return ret;
3096                        if (!return_any)
3097                                return 1;
3098                        /*
3099                         * no higher item found, return the next
3100                         * lower instead
3101                         */
3102                        return_any = 0;
3103                        find_higher = 0;
3104                        btrfs_release_path(p);
3105                        goto again;
3106                }
3107        } else {
3108                if (p->slots[0] == 0) {
3109                        ret = btrfs_prev_leaf(root, p);
3110                        if (ret < 0)
3111                                return ret;
3112                        if (!ret) {
3113                                leaf = p->nodes[0];
3114                                if (p->slots[0] == btrfs_header_nritems(leaf))
3115                                        p->slots[0]--;
3116                                return 0;
3117                        }
3118                        if (!return_any)
3119                                return 1;
3120                        /*
3121                         * no lower item found, return the next
3122                         * higher instead
3123                         */
3124                        return_any = 0;
3125                        find_higher = 1;
3126                        btrfs_release_path(p);
3127                        goto again;
3128                } else {
3129                        --p->slots[0];
3130                }
3131        }
3132        return 0;
3133}
3134
3135/*
3136 * adjust the pointers going up the tree, starting at level
3137 * making sure the right key of each node is points to 'key'.
3138 * This is used after shifting pointers to the left, so it stops
3139 * fixing up pointers when a given leaf/node is not in slot 0 of the
3140 * higher levels
3141 *
3142 */
3143static void fixup_low_keys(struct btrfs_path *path,
3144                           struct btrfs_disk_key *key, int level)
3145{
3146        int i;
3147        struct extent_buffer *t;
3148        int ret;
3149
3150        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3151                int tslot = path->slots[i];
3152
3153                if (!path->nodes[i])
3154                        break;
3155                t = path->nodes[i];
3156                ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3157                                GFP_ATOMIC);
3158                BUG_ON(ret < 0);
3159                btrfs_set_node_key(t, key, tslot);
3160                btrfs_mark_buffer_dirty(path->nodes[i]);
3161                if (tslot != 0)
3162                        break;
3163        }
3164}
3165
3166/*
3167 * update item key.
3168 *
3169 * This function isn't completely safe. It's the caller's responsibility
3170 * that the new key won't break the order
3171 */
3172void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3173                             struct btrfs_path *path,
3174                             const struct btrfs_key *new_key)
3175{
3176        struct btrfs_disk_key disk_key;
3177        struct extent_buffer *eb;
3178        int slot;
3179
3180        eb = path->nodes[0];
3181        slot = path->slots[0];
3182        if (slot > 0) {
3183                btrfs_item_key(eb, &disk_key, slot - 1);
3184                BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3185        }
3186        if (slot < btrfs_header_nritems(eb) - 1) {
3187                btrfs_item_key(eb, &disk_key, slot + 1);
3188                BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3189        }
3190
3191        btrfs_cpu_key_to_disk(&disk_key, new_key);
3192        btrfs_set_item_key(eb, &disk_key, slot);
3193        btrfs_mark_buffer_dirty(eb);
3194        if (slot == 0)
3195                fixup_low_keys(path, &disk_key, 1);
3196}
3197
3198/*
3199 * try to push data from one node into the next node left in the
3200 * tree.
3201 *
3202 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3203 * error, and > 0 if there was no room in the left hand block.
3204 */
static int push_node_left(struct btrfs_trans_handle *trans,
                          struct btrfs_fs_info *fs_info,
                          struct extent_buffer *dst,
                          struct extent_buffer *src, int empty)
{
        int push_items = 0;
        int src_nritems;
        int dst_nritems;
        int ret = 0;

        src_nritems = btrfs_header_nritems(src);
        dst_nritems = btrfs_header_nritems(dst);
        /* free pointer slots in the destination node */
        push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
        /* both nodes must already be COWed in the current transaction */
        WARN_ON(btrfs_header_generation(src) != trans->transid);
        WARN_ON(btrfs_header_generation(dst) != trans->transid);

        /* not emptying: don't bother balancing a nearly empty source */
        if (!empty && src_nritems <= 8)
                return 1;

        if (push_items <= 0)
                return 1;

        if (empty) {
                push_items = min(src_nritems, push_items);
                if (push_items < src_nritems) {
                        /* leave at least 8 pointers in the node if
                         * we aren't going to empty it
                         */
                        if (src_nritems - push_items < 8) {
                                if (push_items <= 8)
                                        return 1;
                                push_items -= 8;
                        }
                }
        } else
                push_items = min(src_nritems - 8, push_items);

        /* record the pointer moves for tree mod log readers before copying */
        ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0,
                                   push_items);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                return ret;
        }
        /* append the first push_items pointers of src to the end of dst */
        copy_extent_buffer(dst, src,
                           btrfs_node_key_ptr_offset(dst_nritems),
                           btrfs_node_key_ptr_offset(0),
                           push_items * sizeof(struct btrfs_key_ptr));

        if (push_items < src_nritems) {
                /*
                 * Don't call tree_mod_log_insert_move here, key removal was
                 * already fully logged by tree_mod_log_eb_copy above.
                 */
                memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
                                      btrfs_node_key_ptr_offset(push_items),
                                      (src_nritems - push_items) *
                                      sizeof(struct btrfs_key_ptr));
        }
        btrfs_set_header_nritems(src, src_nritems - push_items);
        btrfs_set_header_nritems(dst, dst_nritems + push_items);
        btrfs_mark_buffer_dirty(src);
        btrfs_mark_buffer_dirty(dst);

        return ret;
}
3270
3271/*
3272 * try to push data from one node into the next node right in the
3273 * tree.
3274 *
3275 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3276 * error, and > 0 if there was no room in the right hand block.
3277 *
3278 * this will  only push up to 1/2 the contents of the left node over
3279 */
static int balance_node_right(struct btrfs_trans_handle *trans,
                              struct btrfs_fs_info *fs_info,
                              struct extent_buffer *dst,
                              struct extent_buffer *src)
{
        int push_items = 0;
        int max_push;
        int src_nritems;
        int dst_nritems;
        int ret = 0;

        /* both nodes must already be COWed in the current transaction */
        WARN_ON(btrfs_header_generation(src) != trans->transid);
        WARN_ON(btrfs_header_generation(dst) != trans->transid);

        src_nritems = btrfs_header_nritems(src);
        dst_nritems = btrfs_header_nritems(dst);
        /* free pointer slots in the destination node */
        push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
        if (push_items <= 0)
                return 1;

        /* too small to be worth balancing */
        if (src_nritems < 4)
                return 1;

        /* push at most half of the source node (plus one) */
        max_push = src_nritems / 2 + 1;
        /* don't try to empty the node */
        if (max_push >= src_nritems)
                return 1;

        if (max_push < push_items)
                push_items = max_push;

        /* make room at the front of dst; log the move first */
        ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
        BUG_ON(ret < 0);
        memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
                                      btrfs_node_key_ptr_offset(0),
                                      (dst_nritems) *
                                      sizeof(struct btrfs_key_ptr));

        /* log the copy of src's tail into dst before doing it */
        ret = tree_mod_log_eb_copy(fs_info, dst, src, 0,
                                   src_nritems - push_items, push_items);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                return ret;
        }
        /* copy the last push_items pointers of src to the front of dst */
        copy_extent_buffer(dst, src,
                           btrfs_node_key_ptr_offset(0),
                           btrfs_node_key_ptr_offset(src_nritems - push_items),
                           push_items * sizeof(struct btrfs_key_ptr));

        btrfs_set_header_nritems(src, src_nritems - push_items);
        btrfs_set_header_nritems(dst, dst_nritems + push_items);

        btrfs_mark_buffer_dirty(src);
        btrfs_mark_buffer_dirty(dst);

        return ret;
}
3337
3338/*
3339 * helper function to insert a new root level in the tree.
3340 * A new node is allocated, and a single item is inserted to
3341 * point to the existing root
3342 *
3343 * returns zero on success or < 0 on failure.
3344 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct btrfs_path *path, int level)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 lower_gen;
        struct extent_buffer *lower;
        struct extent_buffer *c;
        struct extent_buffer *old;
        struct btrfs_disk_key lower_key;
        int ret;

        /* only called when the current root sits right below 'level' */
        BUG_ON(path->nodes[level]);
        BUG_ON(path->nodes[level-1] != root->node);

        /* the new root's single key is the first key of the old root */
        lower = path->nodes[level-1];
        if (level == 1)
                btrfs_item_key(lower, &lower_key, 0);
        else
                btrfs_node_key(lower, &lower_key, 0);

        c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
                                         root->node->start, 0);
        if (IS_ERR(c))
                return PTR_ERR(c);

        root_add_used(root, fs_info->nodesize);

        /* the new root has exactly one pointer: the old root */
        btrfs_set_header_nritems(c, 1);
        btrfs_set_node_key(c, &lower_key, 0);
        btrfs_set_node_blockptr(c, 0, lower->start);
        lower_gen = btrfs_header_generation(lower);
        WARN_ON(lower_gen != trans->transid);

        btrfs_set_node_ptr_generation(c, 0, lower_gen);

        btrfs_mark_buffer_dirty(c);

        /* log the root switch before publishing the new root pointer */
        old = root->node;
        ret = tree_mod_log_insert_root(root->node, c, 0);
        BUG_ON(ret < 0);
        rcu_assign_pointer(root->node, c);

        /* the super has an extra ref to root->node */
        free_extent_buffer(old);

        add_root_to_dirty_list(root);
        /* take a ref for the path's reference to the new root */
        extent_buffer_get(c);
        path->nodes[level] = c;
        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
        path->slots[level] = 0;
        return 0;
}
3398
3399/*
3400 * worker function to insert a single pointer in a node.
3401 * the node should have enough room for the pointer already
3402 *
3403 * slot and level indicate where you want the key to go, and
3404 * blocknr is the block the key points to.
3405 */
static void insert_ptr(struct btrfs_trans_handle *trans,
                       struct btrfs_fs_info *fs_info, struct btrfs_path *path,
                       struct btrfs_disk_key *key, u64 bytenr,
                       int slot, int level)
{
        struct extent_buffer *lower;
        int nritems;
        int ret;

        BUG_ON(!path->nodes[level]);
        btrfs_assert_tree_locked(path->nodes[level]);
        lower = path->nodes[level];
        nritems = btrfs_header_nritems(lower);
        BUG_ON(slot > nritems);
        /* caller must guarantee the node has room for one more pointer */
        BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
        if (slot != nritems) {
                /* shift the tail right by one; log the move for level > 0 */
                if (level) {
                        ret = tree_mod_log_insert_move(lower, slot + 1, slot,
                                        nritems - slot);
                        BUG_ON(ret < 0);
                }
                memmove_extent_buffer(lower,
                              btrfs_node_key_ptr_offset(slot + 1),
                              btrfs_node_key_ptr_offset(slot),
                              (nritems - slot) * sizeof(struct btrfs_key_ptr));
        }
        if (level) {
                ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
                                GFP_NOFS);
                BUG_ON(ret < 0);
        }
        btrfs_set_node_key(lower, key, slot);
        btrfs_set_node_blockptr(lower, slot, bytenr);
        WARN_ON(trans->transid == 0);
        btrfs_set_node_ptr_generation(lower, slot, trans->transid);
        btrfs_set_header_nritems(lower, nritems + 1);
        btrfs_mark_buffer_dirty(lower);
}
3444
3445/*
3446 * split the node at the specified level in path in two.
3447 * The path is corrected to point to the appropriate node after the split
3448 *
3449 * Before splitting this tries to make some room in the node by pushing
3450 * left and right, if either one works, it returns right away.
3451 *
3452 * returns 0 on success and < 0 on failure
3453 */
static noinline int split_node(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct btrfs_path *path, int level)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *c;
        struct extent_buffer *split;
        struct btrfs_disk_key disk_key;
        int mid;
        int ret;
        u32 c_nritems;

        c = path->nodes[level];
        WARN_ON(btrfs_header_generation(c) != trans->transid);
        if (c == root->node) {
                /*
                 * trying to split the root, lets make a new one
                 *
                 * tree mod log: We don't log_removal old root in
                 * insert_new_root, because that root buffer will be kept as a
                 * normal node. We are going to log removal of half of the
                 * elements below with tree_mod_log_eb_copy. We're holding a
                 * tree lock on the buffer, which is why we cannot race with
                 * other tree_mod_log users.
                 */
                ret = insert_new_root(trans, root, path, level + 1);
                if (ret)
                        return ret;
        } else {
                /* try to avoid the split by pushing into the neighbors */
                ret = push_nodes_for_insert(trans, root, path, level);
                c = path->nodes[level];
                if (!ret && btrfs_header_nritems(c) <
                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
                        return 0;
                if (ret < 0)
                        return ret;
        }

        /* split at the midpoint; the new block starts at key 'mid' */
        c_nritems = btrfs_header_nritems(c);
        mid = (c_nritems + 1) / 2;
        btrfs_node_key(c, &disk_key, mid);

        split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
                                             c->start, 0);
        if (IS_ERR(split))
                return PTR_ERR(split);

        root_add_used(root, fs_info->nodesize);
        ASSERT(btrfs_header_level(c) == level);

        /* log the move of the upper half before copying it out */
        ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                return ret;
        }
        copy_extent_buffer(split, c,
                           btrfs_node_key_ptr_offset(0),
                           btrfs_node_key_ptr_offset(mid),
                           (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
        btrfs_set_header_nritems(split, c_nritems - mid);
        btrfs_set_header_nritems(c, mid);
        ret = 0;

        btrfs_mark_buffer_dirty(c);
        btrfs_mark_buffer_dirty(split);

        /* link the new block into the parent, right after 'c' */
        insert_ptr(trans, fs_info, path, &disk_key, split->start,
                   path->slots[level + 1] + 1, level + 1);

        /*
         * Fix the path: keep (locked) whichever half now contains our slot
         * and drop the lock and reference on the other one.
         */
        if (path->slots[level] >= mid) {
                path->slots[level] -= mid;
                btrfs_tree_unlock(c);
                free_extent_buffer(c);
                path->nodes[level] = split;
                path->slots[level + 1] += 1;
        } else {
                btrfs_tree_unlock(split);
                free_extent_buffer(split);
        }
        return ret;
}
3535
3536/*
3537 * how many bytes are required to store the items in a leaf.  start
3538 * and nr indicate which items in the leaf to check.  This totals up the
3539 * space used both by the item structs and the item data
3540 */
3541static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3542{
3543        struct btrfs_item *start_item;
3544        struct btrfs_item *end_item;
3545        struct btrfs_map_token token;
3546        int data_len;
3547        int nritems = btrfs_header_nritems(l);
3548        int end = min(nritems, start + nr) - 1;
3549
3550        if (!nr)
3551                return 0;
3552        btrfs_init_map_token(&token);
3553        start_item = btrfs_item_nr(start);
3554        end_item = btrfs_item_nr(end);
3555        data_len = btrfs_token_item_offset(l, start_item, &token) +
3556                btrfs_token_item_size(l, start_item, &token);
3557        data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3558        data_len += sizeof(struct btrfs_item) * nr;
3559        WARN_ON(data_len < 0);
3560        return data_len;
3561}
3562
3563/*
3564 * The space between the end of the leaf items and
3565 * the start of the leaf data.  IOW, how much room
3566 * the leaf has left for both items and data
3567 */
3568noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
3569                                   struct extent_buffer *leaf)
3570{
3571        int nritems = btrfs_header_nritems(leaf);
3572        int ret;
3573
3574        ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3575        if (ret < 0) {
3576                btrfs_crit(fs_info,
3577                           "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3578                           ret,
3579                           (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3580                           leaf_space_used(leaf, 0, nritems), nritems);
3581        }
3582        return ret;
3583}
3584
3585/*
3586 * min slot controls the lowest index we're willing to push to the
3587 * right.  We'll push up to and including min_slot, but no lower
3588 */
static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
                                      struct btrfs_path *path,
                                      int data_size, int empty,
                                      struct extent_buffer *right,
                                      int free_space, u32 left_nritems,
                                      u32 min_slot)
{
        struct extent_buffer *left = path->nodes[0];
        struct extent_buffer *upper = path->nodes[1];
        struct btrfs_map_token token;
        struct btrfs_disk_key disk_key;
        int slot;
        u32 i;
        int push_space = 0;
        int push_items = 0;
        struct btrfs_item *item;
        u32 nr;
        u32 right_nritems;
        u32 data_end;
        u32 this_item_size;

        btrfs_init_map_token(&token);

        /* when not emptying, never push below max(1, min_slot) */
        if (empty)
                nr = 0;
        else
                nr = max_t(u32, 1, min_slot);

        /* reserve room for the pending insert if it lands past the leaf end */
        if (path->slots[0] >= left_nritems)
                push_space += data_size;

        slot = path->slots[1];
        /* walk from the last item leftwards, counting what fits in 'right' */
        i = left_nritems - 1;
        while (i >= nr) {
                item = btrfs_item_nr(i);

                if (!empty && push_items > 0) {
                        /* don't push the slot the caller is working on away */
                        if (path->slots[0] > i)
                                break;
                        if (path->slots[0] == i) {
                                int space = btrfs_leaf_free_space(fs_info, left);
                                if (space + push_space * 2 > free_space)
                                        break;
                        }
                }

                if (path->slots[0] == i)
                        push_space += data_size;

                this_item_size = btrfs_item_size(left, item);
                if (this_item_size + sizeof(*item) + push_space > free_space)
                        break;

                push_items++;
                push_space += this_item_size + sizeof(*item);
                if (i == 0)
                        break;
                i--;
        }

        if (push_items == 0)
                goto out_unlock;

        WARN_ON(!empty && push_items == left_nritems);

        /* push left to right */
        right_nritems = btrfs_header_nritems(right);

        /* total data bytes being moved */
        push_space = btrfs_item_end_nr(left, left_nritems - push_items);
        push_space -= leaf_data_end(fs_info, left);

        /* make room in the right data area */
        data_end = leaf_data_end(fs_info, right);
        memmove_extent_buffer(right,
                              BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
                              BTRFS_LEAF_DATA_OFFSET + data_end,
                              BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

        /* copy from the left data area */
        copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
                     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
                     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
                     push_space);

        /* shift right's item headers to make room at the front */
        memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
                              btrfs_item_nr_offset(0),
                              right_nritems * sizeof(struct btrfs_item));

        /* copy the items from left to right */
        copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
                   btrfs_item_nr_offset(left_nritems - push_items),
                   push_items * sizeof(struct btrfs_item));

        /* update the item pointers */
        right_nritems += push_items;
        btrfs_set_header_nritems(right, right_nritems);
        push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
        for (i = 0; i < right_nritems; i++) {
                item = btrfs_item_nr(i);
                push_space -= btrfs_token_item_size(right, item, &token);
                btrfs_set_token_item_offset(right, item, push_space, &token);
        }

        left_nritems -= push_items;
        btrfs_set_header_nritems(left, left_nritems);

        if (left_nritems)
                btrfs_mark_buffer_dirty(left);
        else
                clean_tree_block(fs_info, left);

        btrfs_mark_buffer_dirty(right);

        /* right's first key changed; update the parent pointer key */
        btrfs_item_key(right, &disk_key, 0);
        btrfs_set_node_key(upper, &disk_key, slot + 1);
        btrfs_mark_buffer_dirty(upper);

        /* then fixup the leaf pointer in the path */
        if (path->slots[0] >= left_nritems) {
                path->slots[0] -= left_nritems;
                if (btrfs_header_nritems(path->nodes[0]) == 0)
                        clean_tree_block(fs_info, path->nodes[0]);
                btrfs_tree_unlock(path->nodes[0]);
                free_extent_buffer(path->nodes[0]);
                path->nodes[0] = right;
                path->slots[1] += 1;
        } else {
                btrfs_tree_unlock(right);
                free_extent_buffer(right);
        }
        return 0;

out_unlock:
        btrfs_tree_unlock(right);
        free_extent_buffer(right);
        return 1;
}
3726
3727/*
3728 * push some data in the path leaf to the right, trying to free up at
3729 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3730 *
3731 * returns 1 if the push failed because the other node didn't have enough
3732 * room, 0 if everything worked out and < 0 if there were major errors.
3733 *
3734 * this will push starting from min_slot to the end of the leaf.  It won't
3735 * push any slot lower than min_slot
3736 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
                           *root, struct btrfs_path *path,
                           int min_data_size, int data_size,
                           int empty, u32 min_slot)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *left = path->nodes[0];
        struct extent_buffer *right;
        struct extent_buffer *upper;
        int slot;
        int free_space;
        u32 left_nritems;
        int ret;

        /* no parent means our leaf is the root: nowhere to push */
        if (!path->nodes[1])
                return 1;

        slot = path->slots[1];
        upper = path->nodes[1];
        /* no right sibling in the parent */
        if (slot >= btrfs_header_nritems(upper) - 1)
                return 1;

        btrfs_assert_tree_locked(path->nodes[1]);

        right = read_node_slot(fs_info, upper, slot + 1);
        /*
         * slot + 1 is not valid or we fail to read the right node,
         * no big deal, just return.
         */
        if (IS_ERR(right))
                return 1;

        btrfs_tree_lock(right);
        btrfs_set_lock_blocking_write(right);

        /* quick check before paying for the cow */
        free_space = btrfs_leaf_free_space(fs_info, right);
        if (free_space < data_size)
                goto out_unlock;

        /* cow and double check */
        ret = btrfs_cow_block(trans, root, right, upper,
                              slot + 1, &right);
        if (ret)
                goto out_unlock;

        free_space = btrfs_leaf_free_space(fs_info, right);
        if (free_space < data_size)
                goto out_unlock;

        left_nritems = btrfs_header_nritems(left);
        if (left_nritems == 0)
                goto out_unlock;

        if (path->slots[0] == left_nritems && !empty) {
                /* Key greater than all keys in the leaf, right neighbor has
                 * enough room for it and we're not emptying our leaf to delete
                 * it, therefore use right neighbor to insert the new item and
                 * no need to touch/dirty our left leaf. */
                btrfs_tree_unlock(left);
                free_extent_buffer(left);
                path->nodes[0] = right;
                path->slots[0] = 0;
                path->slots[1]++;
                return 0;
        }

        /* __push_leaf_right consumes our lock/ref on 'right' on all paths */
        return __push_leaf_right(fs_info, path, min_data_size, empty,
                                right, free_space, left_nritems, min_slot);
out_unlock:
        btrfs_tree_unlock(right);
        free_extent_buffer(right);
        return 1;
}
3810
3811/*
3812 * push some data in the path leaf to the left, trying to free up at
3813 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3814 *
3815 * max_slot can put a limit on how far into the leaf we'll push items.  The
3816 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3817 * items
3818 */
static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
                                     struct btrfs_path *path, int data_size,
                                     int empty, struct extent_buffer *left,
                                     int free_space, u32 right_nritems,
                                     u32 max_slot)
{
        struct btrfs_disk_key disk_key;
        struct extent_buffer *right = path->nodes[0];
        int i;
        int push_space = 0;
        int push_items = 0;
        struct btrfs_item *item;
        u32 old_left_nritems;
        u32 nr;
        int ret = 0;
        u32 this_item_size;
        u32 old_left_item_size;
        struct btrfs_map_token token;

        btrfs_init_map_token(&token);

        /* when not emptying, keep at least the last item in 'right' */
        if (empty)
                nr = min(right_nritems, max_slot);
        else
                nr = min(right_nritems - 1, max_slot);

        /* walk from the first item rightwards, counting what fits in 'left' */
        for (i = 0; i < nr; i++) {
                item = btrfs_item_nr(i);

                if (!empty && push_items > 0) {
                        /* don't push the slot the caller is working on away */
                        if (path->slots[0] < i)
                                break;
                        if (path->slots[0] == i) {
                                int space = btrfs_leaf_free_space(fs_info, right);
                                if (space + push_space * 2 > free_space)
                                        break;
                        }
                }

                if (path->slots[0] == i)
                        push_space += data_size;

                this_item_size = btrfs_item_size(right, item);
                if (this_item_size + sizeof(*item) + push_space > free_space)
                        break;

                push_items++;
                push_space += this_item_size + sizeof(*item);
        }

        if (push_items == 0) {
                ret = 1;
                goto out;
        }
        WARN_ON(!empty && push_items == btrfs_header_nritems(right));

        /* push data from right to left */
        copy_extent_buffer(left, right,
                           btrfs_item_nr_offset(btrfs_header_nritems(left)),
                           btrfs_item_nr_offset(0),
                           push_items * sizeof(struct btrfs_item));

        /* total data bytes being moved */
        push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
                     btrfs_item_offset_nr(right, push_items - 1);

        copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
                     leaf_data_end(fs_info, left) - push_space,
                     BTRFS_LEAF_DATA_OFFSET +
                     btrfs_item_offset_nr(right, push_items - 1),
                     push_space);
        old_left_nritems = btrfs_header_nritems(left);
        BUG_ON(old_left_nritems <= 0);

        /* rebase the copied items' offsets into left's data area */
        old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
        for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
                u32 ioff;

                item = btrfs_item_nr(i);

                ioff = btrfs_token_item_offset(left, item, &token);
                btrfs_set_token_item_offset(left, item,
                      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
                      &token);
        }
        btrfs_set_header_nritems(left, old_left_nritems + push_items);

        /* fixup right node */
        if (push_items > right_nritems)
                WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
                       right_nritems);

        if (push_items < right_nritems) {
                /* slide right's remaining data and item headers to the front */
                push_space = btrfs_item_offset_nr(right, push_items - 1) -
                                                  leaf_data_end(fs_info, right);
                memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
                                      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
                                      BTRFS_LEAF_DATA_OFFSET +
                                      leaf_data_end(fs_info, right), push_space);

                memmove_extent_buffer(right, btrfs_item_nr_offset(0),
                              btrfs_item_nr_offset(push_items),
                             (btrfs_header_nritems(right) - push_items) *
                             sizeof(struct btrfs_item));
        }
        right_nritems -= push_items;
        btrfs_set_header_nritems(right, right_nritems);
        push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
        for (i = 0; i < right_nritems; i++) {
                item = btrfs_item_nr(i);

                push_space = push_space - btrfs_token_item_size(right,
                                                                item, &token);
                btrfs_set_token_item_offset(right, item, push_space, &token);
        }

        btrfs_mark_buffer_dirty(left);
        if (right_nritems)
                btrfs_mark_buffer_dirty(right);
        else
                clean_tree_block(fs_info, right);

        /* right's first key changed; propagate it up the tree */
        btrfs_item_key(right, &disk_key, 0);
        fixup_low_keys(path, &disk_key, 1);

        /* then fixup the leaf pointer in the path */
        if (path->slots[0] < push_items) {
                path->slots[0] += old_left_nritems;
                btrfs_tree_unlock(path->nodes[0]);
                free_extent_buffer(path->nodes[0]);
                path->nodes[0] = left;
                path->slots[1] -= 1;
        } else {
                btrfs_tree_unlock(left);
                free_extent_buffer(left);
                path->slots[0] -= push_items;
        }
        BUG_ON(path->slots[0] < 0);
        return ret;
out:
        btrfs_tree_unlock(left);
        free_extent_buffer(left);
        return ret;
}
3962
3963/*
3964 * push some data in the path leaf to the left, trying to free up at
3965 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3966 *
3967 * max_slot can put a limit on how far into the leaf we'll push items.  The
3968 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3969 * items
3970 */
3971static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3972                          *root, struct btrfs_path *path, int min_data_size,
3973                          int data_size, int empty, u32 max_slot)
3974{
3975        struct btrfs_fs_info *fs_info = root->fs_info;
3976        struct extent_buffer *right = path->nodes[0];
3977        struct extent_buffer *left;
3978        int slot;
3979        int free_space;
3980        u32 right_nritems;
3981        int ret = 0;
3982
3983        slot = path->slots[1];
3984        if (slot == 0)
3985                return 1;
3986        if (!path->nodes[1])
3987                return 1;
3988
3989        right_nritems = btrfs_header_nritems(right);
3990        if (right_nritems == 0)
3991                return 1;
3992
3993        btrfs_assert_tree_locked(path->nodes[1]);
3994
3995        left = read_node_slot(fs_info, path->nodes[1], slot - 1);
3996        /*
3997         * slot - 1 is not valid or we fail to read the left node,
3998         * no big deal, just return.
3999         */
4000        if (IS_ERR(left))
4001                return 1;
4002
4003        btrfs_tree_lock(left);
4004        btrfs_set_lock_blocking_write(left);
4005
4006        free_space = btrfs_leaf_free_space(fs_info, left);
4007        if (free_space < data_size) {
4008                ret = 1;
4009                goto out;
4010        }
4011
4012        /* cow and double check */
4013        ret = btrfs_cow_block(trans, root, left,
4014                              path->nodes[1], slot - 1, &left);
4015        if (ret) {
4016                /* we hit -ENOSPC, but it isn't fatal here */
4017                if (ret == -ENOSPC)
4018                        ret = 1;
4019                goto out;
4020        }
4021
4022        free_space = btrfs_leaf_free_space(fs_info, left);
4023        if (free_space < data_size) {
4024                ret = 1;
4025                goto out;
4026        }
4027
4028        return __push_leaf_left(fs_info, path, min_data_size,
4029                               empty, left, free_space, right_nritems,
4030                               max_slot);
4031out:
4032        btrfs_tree_unlock(left);
4033        free_extent_buffer(left);
4034        return ret;
4035}
4036
4037/*
4038 * split the path's leaf in two, making sure there is at least data_size
4039 * available for the resulting leaf level of the path.
4040 */
4041static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4042                                    struct btrfs_fs_info *fs_info,
4043                                    struct btrfs_path *path,
4044                                    struct extent_buffer *l,
4045                                    struct extent_buffer *right,
4046                                    int slot, int mid, int nritems)
4047{
4048        int data_copy_size;
4049        int rt_data_off;
4050        int i;
4051        struct btrfs_disk_key disk_key;
4052        struct btrfs_map_token token;
4053
4054        btrfs_init_map_token(&token);
4055
4056        nritems = nritems - mid;
4057        btrfs_set_header_nritems(right, nritems);
4058        data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l);
4059
4060        copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4061                           btrfs_item_nr_offset(mid),
4062                           nritems * sizeof(struct btrfs_item));
4063
4064        copy_extent_buffer(right, l,
4065                     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
4066                     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
4067                     leaf_data_end(fs_info, l), data_copy_size);
4068
4069        rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
4070
4071        for (i = 0; i < nritems; i++) {
4072                struct btrfs_item *item = btrfs_item_nr(i);
4073                u32 ioff;
4074
4075                ioff = btrfs_token_item_offset(right, item, &token);
4076                btrfs_set_token_item_offset(right, item,
4077                                            ioff + rt_data_off, &token);
4078        }
4079
4080        btrfs_set_header_nritems(l, mid);
4081        btrfs_item_key(right, &disk_key, 0);
4082        insert_ptr(trans, fs_info, path, &disk_key, right->start,
4083                   path->slots[1] + 1, 1);
4084
4085        btrfs_mark_buffer_dirty(right);
4086        btrfs_mark_buffer_dirty(l);
4087        BUG_ON(path->slots[0] != slot);
4088
4089        if (mid <= slot) {
4090                btrfs_tree_unlock(path->nodes[0]);
4091                free_extent_buffer(path->nodes[0]);
4092                path->nodes[0] = right;
4093                path->slots[0] -= mid;
4094                path->slots[1] += 1;
4095        } else {
4096                btrfs_tree_unlock(right);
4097                free_extent_buffer(right);
4098        }
4099
4100        BUG_ON(path->slots[0] < 0);
4101}
4102
4103/*
4104 * double splits happen when we need to insert a big item in the middle
4105 * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4106 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4107 *          A                 B                 C
4108 *
4109 * We avoid this by trying to push the items on either side of our target
4110 * into the adjacent leaves.  If all goes well we can avoid the double split
4111 * completely.
4112 */
4113static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4114                                          struct btrfs_root *root,
4115                                          struct btrfs_path *path,
4116                                          int data_size)
4117{
4118        struct btrfs_fs_info *fs_info = root->fs_info;
4119        int ret;
4120        int progress = 0;
4121        int slot;
4122        u32 nritems;
4123        int space_needed = data_size;
4124
4125        slot = path->slots[0];
4126        if (slot < btrfs_header_nritems(path->nodes[0]))
4127                space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4128
4129        /*
4130         * try to push all the items after our slot into the
4131         * right leaf
4132         */
4133        ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4134        if (ret < 0)
4135                return ret;
4136
4137        if (ret == 0)
4138                progress++;
4139
4140        nritems = btrfs_header_nritems(path->nodes[0]);
4141        /*
4142         * our goal is to get our slot at the start or end of a leaf.  If
4143         * we've done so we're done
4144         */
4145        if (path->slots[0] == 0 || path->slots[0] == nritems)
4146                return 0;
4147
4148        if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4149                return 0;
4150
4151        /* try to push all the items before our slot into the next leaf */
4152        slot = path->slots[0];
4153        space_needed = data_size;
4154        if (slot > 0)
4155                space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4156        ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4157        if (ret < 0)
4158                return ret;
4159
4160        if (ret == 0)
4161                progress++;
4162
4163        if (progress)
4164                return 0;
4165        return 1;
4166}
4167
4168/*
4169 * split the path's leaf in two, making sure there is at least data_size
4170 * available for the resulting leaf level of the path.
4171 *
4172 * returns 0 if all went well and < 0 on failure.
4173 */
4174static noinline int split_leaf(struct btrfs_trans_handle *trans,
4175                               struct btrfs_root *root,
4176                               const struct btrfs_key *ins_key,
4177                               struct btrfs_path *path, int data_size,
4178                               int extend)
4179{
4180        struct btrfs_disk_key disk_key;
4181        struct extent_buffer *l;
4182        u32 nritems;
4183        int mid;
4184        int slot;
4185        struct extent_buffer *right;
4186        struct btrfs_fs_info *fs_info = root->fs_info;
4187        int ret = 0;
4188        int wret;
4189        int split;
4190        int num_doubles = 0;
4191        int tried_avoid_double = 0;
4192
4193        l = path->nodes[0];
4194        slot = path->slots[0];
4195        if (extend && data_size + btrfs_item_size_nr(l, slot) +
4196            sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
4197                return -EOVERFLOW;
4198
4199        /* first try to make some room by pushing left and right */
4200        if (data_size && path->nodes[1]) {
4201                int space_needed = data_size;
4202
4203                if (slot < btrfs_header_nritems(l))
4204                        space_needed -= btrfs_leaf_free_space(fs_info, l);
4205
4206                wret = push_leaf_right(trans, root, path, space_needed,
4207                                       space_needed, 0, 0);
4208                if (wret < 0)
4209                        return wret;
4210                if (wret) {
4211                        space_needed = data_size;
4212                        if (slot > 0)
4213                                space_needed -= btrfs_leaf_free_space(fs_info,
4214                                                                      l);
4215                        wret = push_leaf_left(trans, root, path, space_needed,
4216                                              space_needed, 0, (u32)-1);
4217                        if (wret < 0)
4218                                return wret;
4219                }
4220                l = path->nodes[0];
4221
4222                /* did the pushes work? */
4223                if (btrfs_leaf_free_space(fs_info, l) >= data_size)
4224                        return 0;
4225        }
4226
4227        if (!path->nodes[1]) {
4228                ret = insert_new_root(trans, root, path, 1);
4229                if (ret)
4230                        return ret;
4231        }
4232again:
4233        split = 1;
4234        l = path->nodes[0];
4235        slot = path->slots[0];
4236        nritems = btrfs_header_nritems(l);
4237        mid = (nritems + 1) / 2;
4238
4239        if (mid <= slot) {
4240                if (nritems == 1 ||
4241                    leaf_space_used(l, mid, nritems - mid) + data_size >
4242                        BTRFS_LEAF_DATA_SIZE(fs_info)) {
4243                        if (slot >= nritems) {
4244                                split = 0;
4245                        } else {
4246                                mid = slot;
4247                                if (mid != nritems &&
4248                                    leaf_space_used(l, mid, nritems - mid) +
4249                                    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4250                                        if (data_size && !tried_avoid_double)
4251                                                goto push_for_double;
4252                                        split = 2;
4253                                }
4254                        }
4255                }
4256        } else {
4257                if (leaf_space_used(l, 0, mid) + data_size >
4258                        BTRFS_LEAF_DATA_SIZE(fs_info)) {
4259                        if (!extend && data_size && slot == 0) {
4260                                split = 0;
4261                        } else if ((extend || !data_size) && slot == 0) {
4262                                mid = 1;
4263                        } else {
4264                                mid = slot;
4265                                if (mid != nritems &&
4266                                    leaf_space_used(l, mid, nritems - mid) +
4267                                    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4268                                        if (data_size && !tried_avoid_double)
4269                                                goto push_for_double;
4270                                        split = 2;
4271                                }
4272                        }
4273                }
4274        }
4275
4276        if (split == 0)
4277                btrfs_cpu_key_to_disk(&disk_key, ins_key);
4278        else
4279                btrfs_item_key(l, &disk_key, mid);
4280
4281        right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
4282                                             l->start, 0);
4283        if (IS_ERR(right))
4284                return PTR_ERR(right);
4285
4286        root_add_used(root, fs_info->nodesize);
4287
4288        if (split == 0) {
4289                if (mid <= slot) {
4290                        btrfs_set_header_nritems(right, 0);
4291                        insert_ptr(trans, fs_info, path, &disk_key,
4292                                   right->start, path->slots[1] + 1, 1);
4293                        btrfs_tree_unlock(path->nodes[0]);
4294                        free_extent_buffer(path->nodes[0]);
4295                        path->nodes[0] = right;
4296                        path->slots[0] = 0;
4297                        path->slots[1] += 1;
4298                } else {
4299                        btrfs_set_header_nritems(right, 0);
4300                        insert_ptr(trans, fs_info, path, &disk_key,
4301                                   right->start, path->slots[1], 1);
4302                        btrfs_tree_unlock(path->nodes[0]);
4303                        free_extent_buffer(path->nodes[0]);
4304                        path->nodes[0] = right;
4305                        path->slots[0] = 0;
4306                        if (path->slots[1] == 0)
4307                                fixup_low_keys(path, &disk_key, 1);
4308                }
4309                /*
4310                 * We create a new leaf 'right' for the required ins_len and
4311                 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
4312                 * the content of ins_len to 'right'.
4313                 */
4314                return ret;
4315        }
4316
4317        copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems);
4318
4319        if (split == 2) {
4320                BUG_ON(num_doubles != 0);
4321                num_doubles++;
4322                goto again;
4323        }
4324
4325        return 0;
4326
4327push_for_double:
4328        push_for_double_split(trans, root, path, data_size);
4329        tried_avoid_double = 1;
4330        if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4331                return 0;
4332        goto again;
4333}
4334
/*
 * Make sure the leaf at path->nodes[0] has at least 'ins_len' bytes of
 * free space, re-searching the tree and splitting the leaf if necessary.
 *
 * Only used for EXTENT_DATA and EXTENT_CSUM items.  Returns 0 on success,
 * -EAGAIN if the item changed (moved, resized, or its leaf gained room)
 * during the re-search so the caller must retry from scratch, or another
 * negative errno on error.
 */
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path, int ins_len)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	u64 extent_len = 0;
	u32 item_size;
	int ret;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	/* only these item types are ever split in place */
	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
	       key.type != BTRFS_EXTENT_CSUM_KEY);

	if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len)
		return 0;

	/* remember size (and extent length) so we can detect changes below */
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
	}
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->search_for_split = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	path->search_for_split = 0;
	/* ret > 0 means the exact key is gone; the caller must retry */
	if (ret > 0)
		ret = -EAGAIN;
	if (ret < 0)
		goto err;

	ret = -EAGAIN;
	leaf = path->nodes[0];
	/* if our item isn't there, return now */
	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
		goto err;

	/* the leaf has changed, it now has room.  return now */
	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	/* extend == 1: we are making room to grow the existing item */
	btrfs_set_path_blocking(path);
	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}
4402
/*
 * Worker for btrfs_split_item(): split the item at path->slots[0] in two
 * at 'split_offset' bytes from its start.  The tail becomes a new item
 * keyed 'new_key', placed immediately after the original in the same leaf.
 *
 * The caller (via setup_leaf_for_split) guarantees the leaf has room for
 * one more item header.  Returns 0 or -ENOMEM.
 */
static noinline int split_item(struct btrfs_fs_info *fs_info,
			       struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* copy the whole item out so both halves can be rewritten below */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	/* the new item goes directly after the original */
	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(slot);

	/* tail half: keeps the original data offset, holds the tail bytes */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	/* head half: its data moves up by the size of the tail */
	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0);
	kfree(buf);
	return 0;
}
4472
4473/*
4474 * This function splits a single item into two items,
4475 * giving 'new_key' to the new item and splitting the
4476 * old one at split_offset (from the start of the item).
4477 *
4478 * The path may be released by this operation.  After
4479 * the split, the path is pointing to the old item.  The
4480 * new item is going to be in the same node as the old one.
4481 *
4482 * Note, the item being split must be smaller enough to live alone on
4483 * a tree block with room for one extra struct btrfs_item
4484 *
4485 * This allows us to split the item in place, keeping a lock on the
4486 * leaf the entire time.
4487 */
4488int btrfs_split_item(struct btrfs_trans_handle *trans,
4489                     struct btrfs_root *root,
4490                     struct btrfs_path *path,
4491                     const struct btrfs_key *new_key,
4492                     unsigned long split_offset)
4493{
4494        int ret;
4495        ret = setup_leaf_for_split(trans, root, path,
4496                                   sizeof(struct btrfs_item));
4497        if (ret)
4498                return ret;
4499
4500        ret = split_item(root->fs_info, path, new_key, split_offset);
4501        return ret;
4502}
4503
4504/*
4505 * This function duplicate a item, giving 'new_key' to the new item.
4506 * It guarantees both items live in the same tree leaf and the new item
4507 * is contiguous with the original item.
4508 *
4509 * This allows us to split file extent in place, keeping a lock on the
4510 * leaf the entire time.
4511 */
4512int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4513                         struct btrfs_root *root,
4514                         struct btrfs_path *path,
4515                         const struct btrfs_key *new_key)
4516{
4517        struct extent_buffer *leaf;
4518        int ret;
4519        u32 item_size;
4520
4521        leaf = path->nodes[0];
4522        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4523        ret = setup_leaf_for_split(trans, root, path,
4524                                   item_size + sizeof(struct btrfs_item));
4525        if (ret)
4526                return ret;
4527
4528        path->slots[0]++;
4529        setup_items_for_insert(root, path, new_key, &item_size,
4530                               item_size, item_size +
4531                               sizeof(struct btrfs_item), 1);
4532        leaf = path->nodes[0];
4533        memcpy_extent_buffer(leaf,
4534                             btrfs_item_ptr_offset(leaf, path->slots[0]),
4535                             btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4536                             item_size);
4537        return 0;
4538}
4539
4540/*
4541 * make the item pointed to by the path smaller.  new_size indicates
4542 * how small to make it, and from_end tells us if we just chop bytes
4543 * off the end of the item or if we shift the item to chop bytes off
4544 * the front.
4545 */
4546void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
4547                         struct btrfs_path *path, u32 new_size, int from_end)
4548{
4549        int slot;
4550        struct extent_buffer *leaf;
4551        struct btrfs_item *item;
4552        u32 nritems;
4553        unsigned int data_end;
4554        unsigned int old_data_start;
4555        unsigned int old_size;
4556        unsigned int size_diff;
4557        int i;
4558        struct btrfs_map_token token;
4559
4560        btrfs_init_map_token(&token);
4561
4562        leaf = path->nodes[0];
4563        slot = path->slots[0];
4564
4565        old_size = btrfs_item_size_nr(leaf, slot);
4566        if (old_size == new_size)
4567                return;
4568
4569        nritems = btrfs_header_nritems(leaf);
4570        data_end = leaf_data_end(fs_info, leaf);
4571
4572        old_data_start = btrfs_item_offset_nr(leaf, slot);
4573
4574        size_diff = old_size - new_size;
4575
4576        BUG_ON(slot < 0);
4577        BUG_ON(slot >= nritems);
4578
4579        /*
4580         * item0..itemN ... dataN.offset..dataN.size .. data0.size
4581         */
4582        /* first correct the data pointers */
4583        for (i = slot; i < nritems; i++) {
4584                u32 ioff;
4585                item = btrfs_item_nr(i);
4586
4587                ioff = btrfs_token_item_offset(leaf, item, &token);
4588                btrfs_set_token_item_offset(leaf, item,
4589                                            ioff + size_diff, &token);
4590        }
4591
4592        /* shift the data */
4593        if (from_end) {
4594                memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4595                              data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4596                              data_end, old_data_start + new_size - data_end);
4597        } else {
4598                struct btrfs_disk_key disk_key;
4599                u64 offset;
4600
4601                btrfs_item_key(leaf, &disk_key, slot);
4602
4603                if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4604                        unsigned long ptr;
4605                        struct btrfs_file_extent_item *fi;
4606
4607                        fi = btrfs_item_ptr(leaf, slot,
4608                                            struct btrfs_file_extent_item);
4609                        fi = (struct btrfs_file_extent_item *)(
4610                             (unsigned long)fi - size_diff);
4611
4612                        if (btrfs_file_extent_type(leaf, fi) ==
4613                            BTRFS_FILE_EXTENT_INLINE) {
4614                                ptr = btrfs_item_ptr_offset(leaf, slot);
4615                                memmove_extent_buffer(leaf, ptr,
4616                                      (unsigned long)fi,
4617                                      BTRFS_FILE_EXTENT_INLINE_DATA_START);
4618                        }
4619                }
4620
4621                memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4622                              data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4623                              data_end, old_data_start - data_end);
4624
4625                offset = btrfs_disk_key_offset(&disk_key);
4626                btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4627                btrfs_set_item_key(leaf, &disk_key, slot);
4628                if (slot == 0)
4629                        fixup_low_keys(path, &disk_key, 1);
4630        }
4631
4632        item = btrfs_item_nr(slot);
4633        btrfs_set_item_size(leaf, item, new_size);
4634        btrfs_mark_buffer_dirty(leaf);
4635
4636        if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4637                btrfs_print_leaf(leaf);
4638                BUG();
4639        }
4640}
4641
4642/*
4643 * make the item pointed to by the path bigger, data_size is the added size.
4644 */
4645void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
4646                       u32 data_size)
4647{
4648        int slot;
4649        struct extent_buffer *leaf;
4650        struct btrfs_item *item;
4651        u32 nritems;
4652        unsigned int data_end;
4653        unsigned int old_data;
4654        unsigned int old_size;
4655        int i;
4656        struct btrfs_map_token token;
4657
4658        btrfs_init_map_token(&token);
4659
4660        leaf = path->nodes[0];
4661
4662        nritems = btrfs_header_nritems(leaf);
4663        data_end = leaf_data_end(fs_info, leaf);
4664
4665        if (btrfs_leaf_free_space(fs_info, leaf) < data_size) {
4666                btrfs_print_leaf(leaf);
4667                BUG();
4668        }
4669        slot = path->slots[0];
4670        old_data = btrfs_item_end_nr(leaf, slot);
4671
4672        BUG_ON(slot < 0);
4673        if (slot >= nritems) {
4674                btrfs_print_leaf(leaf);
4675                btrfs_crit(fs_info, "slot %d too large, nritems %d",
4676                           slot, nritems);
4677                BUG_ON(1);
4678        }
4679
4680        /*
4681         * item0..itemN ... dataN.offset..dataN.size .. data0.size
4682         */
4683        /* first correct the data pointers */
4684        for (i = slot; i < nritems; i++) {
4685                u32 ioff;
4686                item = btrfs_item_nr(i);
4687
4688                ioff = btrfs_token_item_offset(leaf, item, &token);
4689                btrfs_set_token_item_offset(leaf, item,
4690                                            ioff - data_size, &token);
4691        }
4692
4693        /* shift the data */
4694        memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4695                      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
4696                      data_end, old_data - data_end);
4697
4698        data_end = old_data;
4699        old_size = btrfs_item_size_nr(leaf, slot);
4700        item = btrfs_item_nr(slot);
4701        btrfs_set_item_size(leaf, item, old_size + data_size);
4702        btrfs_mark_buffer_dirty(leaf);
4703
4704        if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4705                btrfs_print_leaf(leaf);
4706                BUG();
4707        }
4708}
4709
4710/*
4711 * this is a helper for btrfs_insert_empty_items, the main goal here is
4712 * to save stack depth by doing the bulk of the work in a function
4713 * that doesn't call btrfs_search_slot
4714 */
4715void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4716                            const struct btrfs_key *cpu_key, u32 *data_size,
4717                            u32 total_data, u32 total_size, int nr)
4718{
4719        struct btrfs_fs_info *fs_info = root->fs_info;
4720        struct btrfs_item *item;
4721        int i;
4722        u32 nritems;
4723        unsigned int data_end;
4724        struct btrfs_disk_key disk_key;
4725        struct extent_buffer *leaf;
4726        int slot;
4727        struct btrfs_map_token token;
4728
4729        if (path->slots[0] == 0) {
4730                btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4731                fixup_low_keys(path, &disk_key, 1);
4732        }
4733        btrfs_unlock_up_safe(path, 1);
4734
4735        btrfs_init_map_token(&token);
4736
4737        leaf = path->nodes[0];
4738        slot = path->slots[0];
4739
4740        nritems = btrfs_header_nritems(leaf);
4741        data_end = leaf_data_end(fs_info, leaf);
4742
4743        if (btrfs_leaf_free_space(fs_info, leaf) < total_size) {
4744                btrfs_print_leaf(leaf);
4745                btrfs_crit(fs_info, "not enough freespace need %u have %d",
4746                           total_size, btrfs_leaf_free_space(fs_info, leaf));
4747                BUG();
4748        }
4749
4750        if (slot != nritems) {
4751                unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4752
4753                if (old_data < data_end) {
4754                        btrfs_print_leaf(leaf);
4755                        btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
4756                                   slot, old_data, data_end);
4757                        BUG_ON(1);
4758                }
4759                /*
4760                 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4761                 */
4762                /* first correct the data pointers */
4763                for (i = slot; i < nritems; i++) {
4764                        u32 ioff;
4765
4766                        item = btrfs_item_nr(i);
4767                        ioff = btrfs_token_item_offset(leaf, item, &token);
4768                        btrfs_set_token_item_offset(leaf, item,
4769                                                    ioff - total_data, &token);
4770                }
4771                /* shift the items */
4772                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4773                              btrfs_item_nr_offset(slot),
4774                              (nritems - slot) * sizeof(struct btrfs_item));
4775
4776                /* shift the data */
4777                memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4778                              data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
4779                              data_end, old_data - data_end);
4780                data_end = old_data;
4781        }
4782
4783        /* setup the item for the new data */
4784        for (i = 0; i < nr; i++) {
4785                btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4786                btrfs_set_item_key(leaf, &disk_key, slot + i);
4787                item = btrfs_item_nr(slot + i);
4788                btrfs_set_token_item_offset(leaf, item,
4789                                            data_end - data_size[i], &token);
4790                data_end -= data_size[i];
4791                btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4792        }
4793
4794        btrfs_set_header_nritems(leaf, nritems + nr);
4795        btrfs_mark_buffer_dirty(leaf);
4796
4797        if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4798                btrfs_print_leaf(leaf);
4799                BUG();
4800        }
4801}
4802
4803/*
4804 * Given a key and some data, insert items into the tree.
4805 * This does all the path init required, making room in the tree if needed.
4806 */
4807int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4808                            struct btrfs_root *root,
4809                            struct btrfs_path *path,
4810                            const struct btrfs_key *cpu_key, u32 *data_size,
4811                            int nr)
4812{
4813        int ret = 0;
4814        int slot;
4815        int i;
4816        u32 total_size = 0;
4817        u32 total_data = 0;
4818
4819        for (i = 0; i < nr; i++)
4820                total_data += data_size[i];
4821
4822        total_size = total_data + (nr * sizeof(struct btrfs_item));
4823        ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4824        if (ret == 0)
4825                return -EEXIST;
4826        if (ret < 0)
4827                return ret;
4828
4829        slot = path->slots[0];
4830        BUG_ON(slot < 0);
4831
4832        setup_items_for_insert(root, path, cpu_key, data_size,
4833                               total_data, total_size, nr);
4834        return 0;
4835}
4836
4837/*
4838 * Given a key and some data, insert an item into the tree.
4839 * This does all the path init required, making room in the tree if needed.
4840 */
4841int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4842                      const struct btrfs_key *cpu_key, void *data,
4843                      u32 data_size)
4844{
4845        int ret = 0;
4846        struct btrfs_path *path;
4847        struct extent_buffer *leaf;
4848        unsigned long ptr;
4849
4850        path = btrfs_alloc_path();
4851        if (!path)
4852                return -ENOMEM;
4853        ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4854        if (!ret) {
4855                leaf = path->nodes[0];
4856                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4857                write_extent_buffer(leaf, data, ptr, data_size);
4858                btrfs_mark_buffer_dirty(leaf);
4859        }
4860        btrfs_free_path(path);
4861        return ret;
4862}
4863
/*
 * Delete the key/block pointer at @slot from the node at @level of @path.
 *
 * The tree should have been previously balanced so the deletion does not
 * empty a node.  For levels above 0, the removal is recorded in the tree
 * mod log before the on-node data is changed.
 */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
                    int level, int slot)
{
        struct extent_buffer *parent = path->nodes[level];
        u32 nritems;
        int ret;

        nritems = btrfs_header_nritems(parent);
        if (slot != nritems - 1) {
                /*
                 * Deleting from the middle: log the move (nodes only), then
                 * shift the following key/ptr pairs down over the hole.
                 */
                if (level) {
                        ret = tree_mod_log_insert_move(parent, slot, slot + 1,
                                        nritems - slot - 1);
                        BUG_ON(ret < 0);
                }
                memmove_extent_buffer(parent,
                              btrfs_node_key_ptr_offset(slot),
                              btrfs_node_key_ptr_offset(slot + 1),
                              sizeof(struct btrfs_key_ptr) *
                              (nritems - slot - 1));
        } else if (level) {
                /* Deleting the last slot: nothing to move, just log it. */
                ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
                                GFP_NOFS);
                BUG_ON(ret < 0);
        }

        nritems--;
        btrfs_set_header_nritems(parent, nritems);
        if (nritems == 0 && parent == root->node) {
                /* only a level-1 root may be collapsed into a leaf here */
                BUG_ON(btrfs_header_level(root->node) != 1);
                /* just turn the root into a leaf and break */
                btrfs_set_header_level(root->node, 0);
        } else if (slot == 0) {
                /* the node's lowest key changed; propagate it to the parents */
                struct btrfs_disk_key disk_key;

                btrfs_node_key(parent, &disk_key, 0);
                fixup_low_keys(path, &disk_key, level + 1);
        }
        btrfs_mark_buffer_dirty(parent);
}
4909
/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent.  zero is returned if it all worked out, < 0 otherwise.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing.  path->nodes[1] must be locked.
 */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root,
                                    struct btrfs_path *path,
                                    struct extent_buffer *leaf)
{
        /* the leaf must belong to the current transaction to be deleted */
        WARN_ON(btrfs_header_generation(leaf) != trans->transid);
        del_ptr(root, path, 1, path->slots[1]);

        /*
         * btrfs_free_extent is expensive, we want to make sure we
         * aren't holding any locks when we call it
         */
        btrfs_unlock_up_safe(path, 0);

        root_sub_used(root, leaf->len);

        /*
         * Hold an extra ref across btrfs_free_tree_block so the buffer
         * stays valid, then drop it while marking the buffer stale.
         */
        extent_buffer_get(leaf);
        btrfs_free_tree_block(trans, root, leaf, 0, 1);
        free_extent_buffer_stale(leaf);
}
/*
 * Delete @nr items starting at @slot from the leaf at path->nodes[0].
 * If that empties the leaf, remove it from the tree; if it leaves the
 * leaf mostly empty, try to merge it into its neighbours.
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                    struct btrfs_path *path, int slot, int nr)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        u32 last_off;
        u32 dsize = 0;          /* total data bytes of the deleted items */
        int ret = 0;
        int wret;
        int i;
        u32 nritems;
        struct btrfs_map_token token;

        btrfs_init_map_token(&token);

        leaf = path->nodes[0];
        last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

        for (i = 0; i < nr; i++)
                dsize += btrfs_item_size_nr(leaf, slot + i);

        nritems = btrfs_header_nritems(leaf);

        /* Not deleting the tail: compact the remaining items and data. */
        if (slot + nr != nritems) {
                int data_end = leaf_data_end(fs_info, leaf);

                /* slide the data section up over the freed bytes */
                memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
                              data_end + dsize,
                              BTRFS_LEAF_DATA_OFFSET + data_end,
                              last_off - data_end);

                /* re-point the surviving items at their moved data */
                for (i = slot + nr; i < nritems; i++) {
                        u32 ioff;

                        item = btrfs_item_nr(i);
                        ioff = btrfs_token_item_offset(leaf, item, &token);
                        btrfs_set_token_item_offset(leaf, item,
                                                    ioff + dsize, &token);
                }

                /* close the gap in the item header array */
                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
                              btrfs_item_nr_offset(slot + nr),
                              sizeof(struct btrfs_item) *
                              (nritems - slot - nr));
        }
        btrfs_set_header_nritems(leaf, nritems - nr);
        nritems -= nr;

        /* delete the leaf if we've emptied it */
        if (nritems == 0) {
                if (leaf == root->node) {
                        /* empty root leaf stays; just reset its level */
                        btrfs_set_header_level(leaf, 0);
                } else {
                        btrfs_set_path_blocking(path);
                        clean_tree_block(fs_info, leaf);
                        btrfs_del_leaf(trans, root, path, leaf);
                }
        } else {
                int used = leaf_space_used(leaf, 0, nritems);
                if (slot == 0) {
                        /* lowest key changed; fix the copies in the parents */
                        struct btrfs_disk_key disk_key;

                        btrfs_item_key(leaf, &disk_key, 0);
                        fixup_low_keys(path, &disk_key, 1);
                }

                /* delete the leaf if it is mostly empty */
                if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
                        /* push_leaf_left fixes the path.
                         * make sure the path still points to our leaf
                         * for possible call to del_ptr below
                         */
                        slot = path->slots[1];
                        extent_buffer_get(leaf);

                        btrfs_set_path_blocking(path);
                        wret = push_leaf_left(trans, root, path, 1, 1,
                                              1, (u32)-1);
                        if (wret < 0 && wret != -ENOSPC)
                                ret = wret;

                        /* anything left? try pushing it to the right leaf */
                        if (path->nodes[0] == leaf &&
                            btrfs_header_nritems(leaf)) {
                                wret = push_leaf_right(trans, root, path, 1,
                                                       1, 1, 0);
                                if (wret < 0 && wret != -ENOSPC)
                                        ret = wret;
                        }

                        if (btrfs_header_nritems(leaf) == 0) {
                                /* fully drained by the pushes: drop the leaf */
                                path->slots[1] = slot;
                                btrfs_del_leaf(trans, root, path, leaf);
                                free_extent_buffer(leaf);
                                ret = 0;
                        } else {
                                /* if we're still in the path, make sure
                                 * we're dirty.  Otherwise, one of the
                                 * push_leaf functions must have already
                                 * dirtied this buffer
                                 */
                                if (path->nodes[0] == leaf)
                                        btrfs_mark_buffer_dirty(leaf);
                                free_extent_buffer(leaf);
                        }
                } else {
                        btrfs_mark_buffer_dirty(leaf);
                }
        }
        return ret;
}
5055
5056/*
5057 * search the tree again to find a leaf with lesser keys
5058 * returns 0 if it found something or 1 if there are no lesser leaves.
5059 * returns < 0 on io errors.
5060 *
5061 * This may release the path, and so you may lose any locks held at the
5062 * time you call it.
5063 */
5064int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5065{
5066        struct btrfs_key key;
5067        struct btrfs_disk_key found_key;
5068        int ret;
5069
5070        btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5071
5072        if (key.offset > 0) {
5073                key.offset--;
5074        } else if (key.type > 0) {
5075                key.type--;
5076                key.offset = (u64)-1;
5077        } else if (key.objectid > 0) {
5078                key.objectid--;
5079                key.type = (u8)-1;
5080                key.offset = (u64)-1;
5081        } else {
5082                return 1;
5083        }
5084
5085        btrfs_release_path(path);
5086        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5087        if (ret < 0)
5088                return ret;
5089        btrfs_item_key(path->nodes[0], &found_key, 0);
5090        ret = comp_keys(&found_key, &key);
5091        /*
5092         * We might have had an item with the previous key in the tree right
5093         * before we released our path. And after we released our path, that
5094         * item might have been pushed to the first slot (0) of the leaf we
5095         * were holding due to a tree balance. Alternatively, an item with the
5096         * previous key can exist as the only element of a leaf (big fat item).
5097         * Therefore account for these 2 cases, so that our callers (like
5098         * btrfs_previous_item) don't miss an existing item with a key matching
5099         * the previous key we computed above.
5100         */
5101        if (ret <= 0)
5102                return 0;
5103        return 1;
5104}
5105
/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
                         struct btrfs_path *path,
                         u64 min_trans)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *cur;
        struct btrfs_key found_key;
        int slot;
        int sret;
        u32 nritems;
        int level;
        int ret = 1;
        /* remember the caller's keep_locks so it can be restored on exit */
        int keep_locks = path->keep_locks;

        path->keep_locks = 1;
again:
        cur = btrfs_read_lock_root_node(root);
        level = btrfs_header_level(cur);
        WARN_ON(path->nodes[level]);
        path->nodes[level] = cur;
        path->locks[level] = BTRFS_READ_LOCK;

        /* nothing newer than min_trans anywhere below the root: give up */
        if (btrfs_header_generation(cur) < min_trans) {
                ret = 1;
                goto out;
        }
        while (1) {
                nritems = btrfs_header_nritems(cur);
                level = btrfs_header_level(cur);
                sret = btrfs_bin_search(cur, min_key, level, &slot);
                if (sret < 0) {
                        ret = sret;
                        goto out;
                }

                /* at the lowest level, we're done, setup the path and exit */
                if (level == path->lowest_level) {
                        if (slot >= nritems)
                                goto find_next_key;
                        ret = 0;
                        path->slots[level] = slot;
                        btrfs_item_key_to_cpu(cur, &found_key, slot);
                        goto out;
                }
                /* inexact match: bin_search points past the covering slot */
                if (sret && slot > 0)
                        slot--;
                /*
                 * check this node pointer against the min_trans parameters.
                 * If it is too old, skip to the next one.
                 */
                while (slot < nritems) {
                        u64 gen;

                        gen = btrfs_node_ptr_generation(cur, slot);
                        if (gen < min_trans) {
                                slot++;
                                continue;
                        }
                        break;
                }
find_next_key:
                /*
                 * we didn't find a candidate key in this node, walk forward
                 * and find another one
                 */
                if (slot >= nritems) {
                        path->slots[level] = slot;
                        btrfs_set_path_blocking(path);
                        sret = btrfs_find_next_key(root, path, min_key, level,
                                                  min_trans);
                        if (sret == 0) {
                                /* restart the whole walk from the new key */
                                btrfs_release_path(path);
                                goto again;
                        } else {
                                goto out;
                        }
                }
                /* save our key for returning back */
                btrfs_node_key_to_cpu(cur, &found_key, slot);
                path->slots[level] = slot;
                if (level == path->lowest_level) {
                        ret = 0;
                        goto out;
                }
                /* descend one level, locking the child before releasing parents */
                btrfs_set_path_blocking(path);
                cur = read_node_slot(fs_info, cur, slot);
                if (IS_ERR(cur)) {
                        ret = PTR_ERR(cur);
                        goto out;
                }

                btrfs_tree_read_lock(cur);

                path->locks[level - 1] = BTRFS_READ_LOCK;
                path->nodes[level - 1] = cur;
                unlock_up(path, level, 1, 0, NULL);
        }
out:
        path->keep_locks = keep_locks;
        if (ret == 0) {
                btrfs_unlock_up_safe(path, path->lowest_level + 1);
                btrfs_set_path_blocking(path);
                /* hand the found starting key back to the caller */
                memcpy(min_key, &found_key, sizeof(found_key));
        }
        return ret;
}
5231
5232static int tree_move_down(struct btrfs_fs_info *fs_info,
5233                           struct btrfs_path *path,
5234                           int *level)
5235{
5236        struct extent_buffer *eb;
5237
5238        BUG_ON(*level == 0);
5239        eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]);
5240        if (IS_ERR(eb))
5241                return PTR_ERR(eb);
5242
5243        path->nodes[*level - 1] = eb;
5244        path->slots[*level - 1] = 0;
5245        (*level)--;
5246        return 0;
5247}
5248
5249static int tree_move_next_or_upnext(struct btrfs_path *path,
5250                                    int *level, int root_level)
5251{
5252        int ret = 0;
5253        int nritems;
5254        nritems = btrfs_header_nritems(path->nodes[*level]);
5255
5256        path->slots[*level]++;
5257
5258        while (path->slots[*level] >= nritems) {
5259                if (*level == root_level)
5260                        return -1;
5261
5262                /* move upnext */
5263                path->slots[*level] = 0;
5264                free_extent_buffer(path->nodes[*level]);
5265                path->nodes[*level] = NULL;
5266                (*level)++;
5267                path->slots[*level]++;
5268
5269                nritems = btrfs_header_nritems(path->nodes[*level]);
5270                ret = 1;
5271        }
5272        return ret;
5273}
5274
5275/*
5276 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5277 * or down.
5278 */
5279static int tree_advance(struct btrfs_fs_info *fs_info,
5280                        struct btrfs_path *path,
5281                        int *level, int root_level,
5282                        int allow_down,
5283                        struct btrfs_key *key)
5284{
5285        int ret;
5286
5287        if (*level == 0 || !allow_down) {
5288                ret = tree_move_next_or_upnext(path, level, root_level);
5289        } else {
5290                ret = tree_move_down(fs_info, path, level);
5291        }
5292        if (ret >= 0) {
5293                if (*level == 0)
5294                        btrfs_item_key_to_cpu(path->nodes[*level], key,
5295                                        path->slots[*level]);
5296                else
5297                        btrfs_node_key_to_cpu(path->nodes[*level], key,
5298                                        path->slots[*level]);
5299        }
5300        return ret;
5301}
5302
5303static int tree_compare_item(struct btrfs_path *left_path,
5304                             struct btrfs_path *right_path,
5305                             char *tmp_buf)
5306{
5307        int cmp;
5308        int len1, len2;
5309        unsigned long off1, off2;
5310
5311        len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5312        len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5313        if (len1 != len2)
5314                return 1;
5315
5316        off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5317        off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5318                                right_path->slots[0]);
5319
5320        read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5321
5322        cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5323        if (cmp)
5324                return 1;
5325        return 0;
5326}
5327
5328#define ADVANCE 1
5329#define ADVANCE_ONLY_NEXT -1
5330
5331/*
5332 * This function compares two trees and calls the provided callback for
5333 * every changed/new/deleted item it finds.
5334 * If shared tree blocks are encountered, whole subtrees are skipped, making
5335 * the compare pretty fast on snapshotted subvolumes.
5336 *
5337 * This currently works on commit roots only. As commit roots are read only,
5338 * we don't do any locking. The commit roots are protected with transactions.
5339 * Transactions are ended and rejoined when a commit is tried in between.
5340 *
5341 * This function checks for modifications done to the trees while comparing.
5342 * If it detects a change, it aborts immediately.
5343 */
5344int btrfs_compare_trees(struct btrfs_root *left_root,
5345                        struct btrfs_root *right_root,
5346                        btrfs_changed_cb_t changed_cb, void *ctx)
5347{
5348        struct btrfs_fs_info *fs_info = left_root->fs_info;
5349        int ret;
5350        int cmp;
5351        struct btrfs_path *left_path = NULL;
5352        struct btrfs_path *right_path = NULL;
5353        struct btrfs_key left_key;
5354        struct btrfs_key right_key;
5355        char *tmp_buf = NULL;
5356        int left_root_level;
5357        int right_root_level;
5358        int left_level;
5359        int right_level;
5360        int left_end_reached;
5361        int right_end_reached;
5362        int advance_left;
5363        int advance_right;
5364        u64 left_blockptr;
5365        u64 right_blockptr;
5366        u64 left_gen;
5367        u64 right_gen;
5368
5369        left_path = btrfs_alloc_path();
5370        if (!left_path) {
5371                ret = -ENOMEM;
5372                goto out;
5373        }
5374        right_path = btrfs_alloc_path();
5375        if (!right_path) {
5376                ret = -ENOMEM;
5377                goto out;
5378        }
5379
5380        tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
5381        if (!tmp_buf) {
5382                ret = -ENOMEM;
5383                goto out;
5384        }
5385
5386        left_path->search_commit_root = 1;
5387        left_path->skip_locking = 1;
5388        right_path->search_commit_root = 1;
5389        right_path->skip_locking = 1;
5390
5391        /*
5392         * Strategy: Go to the first items of both trees. Then do
5393         *
5394         * If both trees are at level 0
5395         *   Compare keys of current items
5396         *     If left < right treat left item as new, advance left tree
5397         *       and repeat
5398         *     If left > right treat right item as deleted, advance right tree
5399         *       and repeat
5400         *     If left == right do deep compare of items, treat as changed if
5401         *       needed, advance both trees and repeat
5402         * If both trees are at the same level but not at level 0
5403         *   Compare keys of current nodes/leafs
5404         *     If left < right advance left tree and repeat
5405         *     If left > right advance right tree and repeat
5406         *     If left == right compare blockptrs of the next nodes/leafs
5407         *       If they match advance both trees but stay at the same level
5408         *         and repeat
5409         *       If they don't match advance both trees while allowing to go
5410         *         deeper and repeat
5411         * If tree levels are different
5412         *   Advance the tree that needs it and repeat
5413         *
5414         * Advancing a tree means:
5415         *   If we are at level 0, try to go to the next slot. If that's not
5416         *   possible, go one level up and repeat. Stop when we found a level
5417         *   where we could go to the next slot. We may at this point be on a
5418         *   node or a leaf.
5419         *
5420         *   If we are not at level 0 and not on shared tree blocks, go one
5421         *   level deeper.
5422         *
5423         *   If we are not at level 0 and on shared tree blocks, go one slot to
5424         *   the right if possible or go up and right.
5425         */
5426
5427        down_read(&fs_info->commit_root_sem);
5428        left_level = btrfs_header_level(left_root->commit_root);
5429        left_root_level = left_level;
5430        left_path->nodes[left_level] =
5431                        btrfs_clone_extent_buffer(left_root->commit_root);
5432        if (!left_path->nodes[left_level]) {
5433                up_read(&fs_info->commit_root_sem);
5434                ret = -ENOMEM;
5435                goto out;
5436        }
5437
5438        right_level = btrfs_header_level(right_root->commit_root);
5439        right_root_level = right_level;
5440        right_path->nodes[right_level] =
5441                        btrfs_clone_extent_buffer(right_root->commit_root);
5442        if (!right_path->nodes[right_level]) {
5443                up_read(&fs_info->commit_root_sem);
5444                ret = -ENOMEM;
5445                goto out;
5446        }
5447        up_read(&fs_info->commit_root_sem);
5448
5449        if (left_level == 0)
5450                btrfs_item_key_to_cpu(left_path->nodes[left_level],
5451                                &left_key, left_path->slots[left_level]);
5452        else
5453                btrfs_node_key_to_cpu(left_path->nodes[left_level],
5454                                &left_key, left_path->slots[left_level]);
5455        if (right_level == 0)
5456                btrfs_item_key_to_cpu(right_path->nodes[right_level],
5457                                &right_key, right_path->slots[right_level]);
5458        else
5459                btrfs_node_key_to_cpu(right_path->nodes[right_level],
5460                                &right_key, right_path->slots[right_level]);
5461
5462        left_end_reached = right_end_reached = 0;
5463        advance_left = advance_right = 0;
5464
5465        while (1) {
5466                if (advance_left && !left_end_reached) {
5467                        ret = tree_advance(fs_info, left_path, &left_level,
5468                                        left_root_level,
5469                                        advance_left != ADVANCE_ONLY_NEXT,
5470                                        &left_key);
5471                        if (ret == -1)
5472                                left_end_reached = ADVANCE;
5473                        else if (ret < 0)
5474                                goto out;
5475                        advance_left = 0;
5476                }
5477                if (advance_right && !right_end_reached) {
5478                        ret = tree_advance(fs_info, right_path, &right_level,
5479                                        right_root_level,
5480                                        advance_right != ADVANCE_ONLY_NEXT,
5481                                        &right_key);
5482                        if (ret == -1)
5483                                right_end_reached = ADVANCE;
5484                        else if (ret < 0)
5485                                goto out;
5486                        advance_right = 0;
5487                }
5488
5489                if (left_end_reached && right_end_reached) {
5490                        ret = 0;
5491                        goto out;
5492                } else if (left_end_reached) {
5493                        if (right_level == 0) {
5494                                ret = changed_cb(left_path, right_path,
5495                                                &right_key,
5496                                                BTRFS_COMPARE_TREE_DELETED,
5497                                                ctx);
5498                                if (ret < 0)
5499                                        goto out;
5500                        }
5501                        advance_right = ADVANCE;
5502                        continue;
5503                } else if (right_end_reached) {
5504                        if (left_level == 0) {
5505                                ret = changed_cb(left_path, right_path,
5506                                                &left_key,
5507                                                BTRFS_COMPARE_TREE_NEW,
5508                                                ctx);
5509                                if (ret < 0)
5510                                        goto out;
5511                        }
5512                        advance_left = ADVANCE;
5513                        continue;
5514                }
5515
5516                if (left_level == 0 && right_level == 0) {
5517                        cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5518                        if (cmp < 0) {
5519                                ret = changed_cb(left_path, right_path,
5520                                                &left_key,
5521                                                BTRFS_COMPARE_TREE_NEW,
5522                                                ctx);
5523                                if (ret < 0)
5524                                        goto out;
5525                                advance_left = ADVANCE;
5526                        } else if (cmp > 0) {
5527                                ret = changed_cb(left_path, right_path,
5528                                                &right_key,
5529                                                BTRFS_COMPARE_TREE_DELETED,
5530                                                ctx);
5531                                if (ret < 0)
5532                                        goto out;
5533                                advance_right = ADVANCE;
5534                        } else {
5535                                enum btrfs_compare_tree_result result;
5536
5537                                WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5538                                ret = tree_compare_item(left_path, right_path,
5539                                                        tmp_buf);
5540                                if (ret)
5541                                        result = BTRFS_COMPARE_TREE_CHANGED;
5542                                else
5543                                        result = BTRFS_COMPARE_TREE_SAME;
5544                                ret = changed_cb(left_path, right_path,
5545                                                 &left_key, result, ctx);
5546                                if (ret < 0)
5547                                        goto out;
5548                                advance_left = ADVANCE;
5549                                advance_right = ADVANCE;
5550                        }
5551                } else if (left_level == right_level) {
5552                        cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5553                        if (cmp < 0) {
5554                                advance_left = ADVANCE;
5555                        } else if (cmp > 0) {
5556                                advance_right = ADVANCE;
5557                        } else {
5558                                left_blockptr = btrfs_node_blockptr(
5559                                                left_path->nodes[left_level],
5560                                                left_path->slots[left_level]);
5561                                right_blockptr = btrfs_node_blockptr(
5562                                                right_path->nodes[right_level],
5563                                                right_path->slots[right_level]);
5564                                left_gen = btrfs_node_ptr_generation(
5565                                                left_path->nodes[left_level],
5566                                                left_path->slots[left_level]);
5567                                right_gen = btrfs_node_ptr_generation(
5568                                                right_path->nodes[right_level],
5569                                                right_path->slots[right_level]);
5570                                if (left_blockptr == right_blockptr &&
5571                                    left_gen == right_gen) {
5572                                        /*
5573                                         * As we're on a shared block, don't
5574                                         * allow to go deeper.
5575                                         */
5576                                        advance_left = ADVANCE_ONLY_NEXT;
5577                                        advance_right = ADVANCE_ONLY_NEXT;
5578                                } else {
5579                                        advance_left = ADVANCE;
5580                                        advance_right = ADVANCE;
5581                                }
5582                        }
5583                } else if (left_level < right_level) {
5584                        advance_right = ADVANCE;
5585                } else {
5586                        advance_left = ADVANCE;
5587                }
5588        }
5589
5590out:
5591        btrfs_free_path(left_path);
5592        btrfs_free_path(right_path);
5593        kvfree(tmp_buf);
5594        return ret;
5595}
5596
/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	/* without keep_locks the upper nodes we rely on may be unlocked */
	WARN_ON(!path->keep_locks);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		/* candidate is the slot just right of the current position */
		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			if (path->locks[level + 1]) {
				/* parent is locked: safe to walk up a level */
				level++;
				continue;
			}

			/*
			 * The parent is not locked, so it may have changed
			 * under us.  Re-search from the last key of this
			 * node, with lowest_level pinned so the path comes
			 * back down to this level, then retry the slot
			 * check above.
			 */
			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			if (ret == 0)
				slot++;	/* exact match: step past the key we searched for */
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			/* skip pointers into subtrees older than min_trans */
			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
5672
/*
 * search the tree again to find a leaf with greater keys
 * returns 0 if it found something or 1 if there are no greater leaves.
 * returns < 0 on io errors.
 */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	/* time_seq == 0 means "walk the current version of the tree" */
	return btrfs_next_old_leaf(root, path, 0);
}
5682
/*
 * Advance @path to the leaf just after the one it currently points at.
 * Because all locks are dropped along the way, this re-searches from the
 * last key of the current leaf and then walks right.  With a non-zero
 * @time_seq the search is done against the tree as it looked at that
 * point in time (tree mod log replay via btrfs_search_old_slot).
 *
 * Returns 0 if a next leaf was found, 1 if there is none, < 0 on error.
 */
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;
	int next_rw_lock = 0;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	/* remember the last key of the current leaf so we can re-search */
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	next_rw_lock = 0;
	btrfs_release_path(path);

	/* keep the whole path locked so we can walk up to find a right sibling */
	path->keep_locks = 1;
	path->leave_spinning = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leafs
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later(esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check,
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	/* walk up the tree until we find a node with a slot to our right */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			/* no right sibling at this level, go up one more */
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		/* drop the buffer from a previous loop iteration */
		if (next) {
			btrfs_tree_unlock_rw(next, next_rw_lock);
			free_extent_buffer(next);
		}

		next = c;
		next_rw_lock = path->locks[level];
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret) {
				/* try-lock failed: block the path and wait for the lock */
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
		break;
	}
	path->slots[level] = slot;
	/* descend along the leftmost edge into the new subtree down to the leaf */
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock_rw(c, path->locks[level]);

		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = next_rw_lock;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
	}
	ret = 0;
done:
	/* only the new leaf (level 0) stays locked for the caller */
	unlock_up(path, 0, 1, 0, NULL);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}
5851
5852/*
5853 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5854 * searching until it gets past min_objectid or finds an item of 'type'
5855 *
5856 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5857 */
5858int btrfs_previous_item(struct btrfs_root *root,
5859                        struct btrfs_path *path, u64 min_objectid,
5860                        int type)
5861{
5862        struct btrfs_key found_key;
5863        struct extent_buffer *leaf;
5864        u32 nritems;
5865        int ret;
5866
5867        while (1) {
5868                if (path->slots[0] == 0) {
5869                        btrfs_set_path_blocking(path);
5870                        ret = btrfs_prev_leaf(root, path);
5871                        if (ret != 0)
5872                                return ret;
5873                } else {
5874                        path->slots[0]--;
5875                }
5876                leaf = path->nodes[0];
5877                nritems = btrfs_header_nritems(leaf);
5878                if (nritems == 0)
5879                        return 1;
5880                if (path->slots[0] == nritems)
5881                        path->slots[0]--;
5882
5883                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5884                if (found_key.objectid < min_objectid)
5885                        break;
5886                if (found_key.type == type)
5887                        return 0;
5888                if (found_key.objectid == min_objectid &&
5889                    found_key.type < type)
5890                        break;
5891        }
5892        return 1;
5893}
5894
5895/*
5896 * search in extent tree to find a previous Metadata/Data extent item with
5897 * min objecitd.
5898 *
5899 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5900 */
5901int btrfs_previous_extent_item(struct btrfs_root *root,
5902                        struct btrfs_path *path, u64 min_objectid)
5903{
5904        struct btrfs_key found_key;
5905        struct extent_buffer *leaf;
5906        u32 nritems;
5907        int ret;
5908
5909        while (1) {
5910                if (path->slots[0] == 0) {
5911                        btrfs_set_path_blocking(path);
5912                        ret = btrfs_prev_leaf(root, path);
5913                        if (ret != 0)
5914                                return ret;
5915                } else {
5916                        path->slots[0]--;
5917                }
5918                leaf = path->nodes[0];
5919                nritems = btrfs_header_nritems(leaf);
5920                if (nritems == 0)
5921                        return 1;
5922                if (path->slots[0] == nritems)
5923                        path->slots[0]--;
5924
5925                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5926                if (found_key.objectid < min_objectid)
5927                        break;
5928                if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5929                    found_key.type == BTRFS_METADATA_ITEM_KEY)
5930                        return 0;
5931                if (found_key.objectid == min_objectid &&
5932                    found_key.type < BTRFS_EXTENT_ITEM_KEY)
5933                        break;
5934        }
5935        return 1;
5936}
5937