linux/fs/btrfs/ctree.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 021110-1307, USA.
  17 */
  18
  19#include <linux/sched.h>
  20#include <linux/slab.h>
  21#include <linux/rbtree.h>
  22#include "ctree.h"
  23#include "disk-io.h"
  24#include "transaction.h"
  25#include "print-tree.h"
  26#include "locking.h"
  27
  28static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
  29                      *root, struct btrfs_path *path, int level);
  30static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
  31                      *root, struct btrfs_key *ins_key,
  32                      struct btrfs_path *path, int data_size, int extend);
  33static int push_node_left(struct btrfs_trans_handle *trans,
  34                          struct btrfs_root *root, struct extent_buffer *dst,
  35                          struct extent_buffer *src, int empty);
  36static int balance_node_right(struct btrfs_trans_handle *trans,
  37                              struct btrfs_root *root,
  38                              struct extent_buffer *dst_buf,
  39                              struct extent_buffer *src_buf);
  40static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
  41                    int level, int slot);
  42static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
  43                                 struct extent_buffer *eb);
  44static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
  45
  46struct btrfs_path *btrfs_alloc_path(void)
  47{
  48        struct btrfs_path *path;
  49        path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
  50        return path;
  51}
  52
  53/*
  54 * set all locked nodes in the path to blocking locks.  This should
  55 * be done before scheduling
  56 */
  57noinline void btrfs_set_path_blocking(struct btrfs_path *p)
  58{
  59        int i;
  60        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
  61                if (!p->nodes[i] || !p->locks[i])
  62                        continue;
  63                btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
  64                if (p->locks[i] == BTRFS_READ_LOCK)
  65                        p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
  66                else if (p->locks[i] == BTRFS_WRITE_LOCK)
  67                        p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
  68        }
  69}
  70
  71/*
  72 * reset all the locked nodes in the patch to spinning locks.
  73 *
  74 * held is used to keep lockdep happy, when lockdep is enabled
  75 * we set held to a blocking lock before we go around and
  76 * retake all the spinlocks in the path.  You can safely use NULL
  77 * for held
  78 */
  79noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
  80                                        struct extent_buffer *held, int held_rw)
  81{
  82        int i;
  83
  84#ifdef CONFIG_DEBUG_LOCK_ALLOC
  85        /* lockdep really cares that we take all of these spinlocks
  86         * in the right order.  If any of the locks in the path are not
  87         * currently blocking, it is going to complain.  So, make really
  88         * really sure by forcing the path to blocking before we clear
  89         * the path blocking.
  90         */
  91        if (held) {
  92                btrfs_set_lock_blocking_rw(held, held_rw);
  93                if (held_rw == BTRFS_WRITE_LOCK)
  94                        held_rw = BTRFS_WRITE_LOCK_BLOCKING;
  95                else if (held_rw == BTRFS_READ_LOCK)
  96                        held_rw = BTRFS_READ_LOCK_BLOCKING;
  97        }
  98        btrfs_set_path_blocking(p);
  99#endif
 100
 101        for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
 102                if (p->nodes[i] && p->locks[i]) {
 103                        btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
 104                        if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
 105                                p->locks[i] = BTRFS_WRITE_LOCK;
 106                        else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
 107                                p->locks[i] = BTRFS_READ_LOCK;
 108                }
 109        }
 110
 111#ifdef CONFIG_DEBUG_LOCK_ALLOC
 112        if (held)
 113                btrfs_clear_lock_blocking_rw(held, held_rw);
 114#endif
 115}
 116
 117/* this also releases the path */
 118void btrfs_free_path(struct btrfs_path *p)
 119{
 120        if (!p)
 121                return;
 122        btrfs_release_path(p);
 123        kmem_cache_free(btrfs_path_cachep, p);
 124}
 125
 126/*
 127 * path release drops references on the extent buffers in the path
 128 * and it drops any locks held by this path
 129 *
 130 * It is safe to call this on paths that no locks or extent buffers held.
 131 */
 132noinline void btrfs_release_path(struct btrfs_path *p)
 133{
 134        int i;
 135
 136        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
 137                p->slots[i] = 0;
 138                if (!p->nodes[i])
 139                        continue;
 140                if (p->locks[i]) {
 141                        btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
 142                        p->locks[i] = 0;
 143                }
 144                free_extent_buffer(p->nodes[i]);
 145                p->nodes[i] = NULL;
 146        }
 147}
 148
 149/*
 150 * safely gets a reference on the root node of a tree.  A lock
 151 * is not taken, so a concurrent writer may put a different node
 152 * at the root of the tree.  See btrfs_lock_root_node for the
 153 * looping required.
 154 *
 155 * The extent buffer returned by this has a reference taken, so
 156 * it won't disappear.  It may stop being the root of the tree
 157 * at any time because there are no locks held.
 158 */
 159struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
 160{
 161        struct extent_buffer *eb;
 162
 163        while (1) {
 164                rcu_read_lock();
 165                eb = rcu_dereference(root->node);
 166
 167                /*
 168                 * RCU really hurts here, we could free up the root node because
 169                 * it was cow'ed but we may not get the new root node yet so do
 170                 * the inc_not_zero dance and if it doesn't work then
 171                 * synchronize_rcu and try again.
 172                 */
 173                if (atomic_inc_not_zero(&eb->refs)) {
 174                        rcu_read_unlock();
 175                        break;
 176                }
 177                rcu_read_unlock();
 178                synchronize_rcu();
 179        }
 180        return eb;
 181}
 182
 183/* loop around taking references on and locking the root node of the
 184 * tree until you end up with a lock on the root.  A locked buffer
 185 * is returned, with a reference held.
 186 */
 187struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 188{
 189        struct extent_buffer *eb;
 190
 191        while (1) {
 192                eb = btrfs_root_node(root);
 193                btrfs_tree_lock(eb);
 194                if (eb == root->node)
 195                        break;
 196                btrfs_tree_unlock(eb);
 197                free_extent_buffer(eb);
 198        }
 199        return eb;
 200}
 201
 202/* loop around taking references on and locking the root node of the
 203 * tree until you end up with a lock on the root.  A locked buffer
 204 * is returned, with a reference held.
 205 */
 206static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
 207{
 208        struct extent_buffer *eb;
 209
 210        while (1) {
 211                eb = btrfs_root_node(root);
 212                btrfs_tree_read_lock(eb);
 213                if (eb == root->node)
 214                        break;
 215                btrfs_tree_read_unlock(eb);
 216                free_extent_buffer(eb);
 217        }
 218        return eb;
 219}
 220
 221/* cowonly root (everything not a reference counted cow subvolume), just get
 222 * put onto a simple dirty list.  transaction.c walks this to make sure they
 223 * get properly updated on disk.
 224 */
 225static void add_root_to_dirty_list(struct btrfs_root *root)
 226{
 227        spin_lock(&root->fs_info->trans_lock);
 228        if (root->track_dirty && list_empty(&root->dirty_list)) {
 229                list_add(&root->dirty_list,
 230                         &root->fs_info->dirty_cowonly_roots);
 231        }
 232        spin_unlock(&root->fs_info->trans_lock);
 233}
 234
 235/*
 236 * used by snapshot creation to make a copy of a root for a tree with
 237 * a given objectid.  The buffer with the new root node is returned in
 238 * cow_ret, and this func returns zero on success or a negative error code.
 239 */
 240int btrfs_copy_root(struct btrfs_trans_handle *trans,
 241                      struct btrfs_root *root,
 242                      struct extent_buffer *buf,
 243                      struct extent_buffer **cow_ret, u64 new_root_objectid)
 244{
 245        struct extent_buffer *cow;
 246        int ret = 0;
 247        int level;
 248        struct btrfs_disk_key disk_key;
 249
 250        WARN_ON(root->ref_cows && trans->transid !=
 251                root->fs_info->running_transaction->transid);
 252        WARN_ON(root->ref_cows && trans->transid != root->last_trans);
 253
 254        level = btrfs_header_level(buf);
 255        if (level == 0)
 256                btrfs_item_key(buf, &disk_key, 0);
 257        else
 258                btrfs_node_key(buf, &disk_key, 0);
 259
 260        cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
 261                                     new_root_objectid, &disk_key, level,
 262                                     buf->start, 0);
 263        if (IS_ERR(cow))
 264                return PTR_ERR(cow);
 265
 266        copy_extent_buffer(cow, buf, 0, 0, cow->len);
 267        btrfs_set_header_bytenr(cow, cow->start);
 268        btrfs_set_header_generation(cow, trans->transid);
 269        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
 270        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
 271                                     BTRFS_HEADER_FLAG_RELOC);
 272        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
 273                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
 274        else
 275                btrfs_set_header_owner(cow, new_root_objectid);
 276
 277        write_extent_buffer(cow, root->fs_info->fsid,
 278                            (unsigned long)btrfs_header_fsid(cow),
 279                            BTRFS_FSID_SIZE);
 280
 281        WARN_ON(btrfs_header_generation(buf) > trans->transid);
 282        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
 283                ret = btrfs_inc_ref(trans, root, cow, 1, 1);
 284        else
 285                ret = btrfs_inc_ref(trans, root, cow, 0, 1);
 286
 287        if (ret)
 288                return ret;
 289
 290        btrfs_mark_buffer_dirty(cow);
 291        *cow_ret = cow;
 292        return 0;
 293}
 294
 295enum mod_log_op {
 296        MOD_LOG_KEY_REPLACE,
 297        MOD_LOG_KEY_ADD,
 298        MOD_LOG_KEY_REMOVE,
 299        MOD_LOG_KEY_REMOVE_WHILE_FREEING,
 300        MOD_LOG_KEY_REMOVE_WHILE_MOVING,
 301        MOD_LOG_MOVE_KEYS,
 302        MOD_LOG_ROOT_REPLACE,
 303};
 304
 305struct tree_mod_move {
 306        int dst_slot;
 307        int nr_items;
 308};
 309
 310struct tree_mod_root {
 311        u64 logical;
 312        u8 level;
 313};
 314
 315struct tree_mod_elem {
 316        struct rb_node node;
 317        u64 index;              /* shifted logical */
 318        u64 seq;
 319        enum mod_log_op op;
 320
 321        /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
 322        int slot;
 323
 324        /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
 325        u64 generation;
 326
 327        /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
 328        struct btrfs_disk_key key;
 329        u64 blockptr;
 330
 331        /* this is used for op == MOD_LOG_MOVE_KEYS */
 332        struct tree_mod_move move;
 333
 334        /* this is used for op == MOD_LOG_ROOT_REPLACE */
 335        struct tree_mod_root old_root;
 336};
 337
 338static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
 339{
 340        read_lock(&fs_info->tree_mod_log_lock);
 341}
 342
 343static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
 344{
 345        read_unlock(&fs_info->tree_mod_log_lock);
 346}
 347
 348static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
 349{
 350        write_lock(&fs_info->tree_mod_log_lock);
 351}
 352
 353static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
 354{
 355        write_unlock(&fs_info->tree_mod_log_lock);
 356}
 357
 358/*
 359 * Increment the upper half of tree_mod_seq, set lower half zero.
 360 *
 361 * Must be called with fs_info->tree_mod_seq_lock held.
 362 */
 363static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
 364{
 365        u64 seq = atomic64_read(&fs_info->tree_mod_seq);
 366        seq &= 0xffffffff00000000ull;
 367        seq += 1ull << 32;
 368        atomic64_set(&fs_info->tree_mod_seq, seq);
 369        return seq;
 370}
 371
 372/*
 373 * Increment the lower half of tree_mod_seq.
 374 *
 375 * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
 376 * are generated should not technically require a spin lock here. (Rationale:
 377 * incrementing the minor while incrementing the major seq number is between its
 378 * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it
 379 * just returns a unique sequence number as usual.) We have decided to leave
 380 * that requirement in here and rethink it once we notice it really imposes a
 381 * problem on some workload.
 382 */
 383static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
 384{
 385        return atomic64_inc_return(&fs_info->tree_mod_seq);
 386}
 387
 388/*
 389 * return the last minor in the previous major tree_mod_seq number
 390 */
 391u64 btrfs_tree_mod_seq_prev(u64 seq)
 392{
 393        return (seq & 0xffffffff00000000ull) - 1ull;
 394}
 395
 396/*
 397 * This adds a new blocker to the tree mod log's blocker list if the @elem
 398 * passed does not already have a sequence number set. So when a caller expects
 399 * to record tree modifications, it should ensure to set elem->seq to zero
 400 * before calling btrfs_get_tree_mod_seq.
 401 * Returns a fresh, unused tree log modification sequence number, even if no new
 402 * blocker was added.
 403 */
 404u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
 405                           struct seq_list *elem)
 406{
 407        u64 seq;
 408
 409        tree_mod_log_write_lock(fs_info);
 410        spin_lock(&fs_info->tree_mod_seq_lock);
 411        if (!elem->seq) {
 412                elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
 413                list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
 414        }
 415        seq = btrfs_inc_tree_mod_seq_minor(fs_info);
 416        spin_unlock(&fs_info->tree_mod_seq_lock);
 417        tree_mod_log_write_unlock(fs_info);
 418
 419        return seq;
 420}
 421
 422void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
 423                            struct seq_list *elem)
 424{
 425        struct rb_root *tm_root;
 426        struct rb_node *node;
 427        struct rb_node *next;
 428        struct seq_list *cur_elem;
 429        struct tree_mod_elem *tm;
 430        u64 min_seq = (u64)-1;
 431        u64 seq_putting = elem->seq;
 432
 433        if (!seq_putting)
 434                return;
 435
 436        spin_lock(&fs_info->tree_mod_seq_lock);
 437        list_del(&elem->list);
 438        elem->seq = 0;
 439
 440        list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
 441                if (cur_elem->seq < min_seq) {
 442                        if (seq_putting > cur_elem->seq) {
 443                                /*
 444                                 * blocker with lower sequence number exists, we
 445                                 * cannot remove anything from the log
 446                                 */
 447                                spin_unlock(&fs_info->tree_mod_seq_lock);
 448                                return;
 449                        }
 450                        min_seq = cur_elem->seq;
 451                }
 452        }
 453        spin_unlock(&fs_info->tree_mod_seq_lock);
 454
 455        /*
 456         * anything that's lower than the lowest existing (read: blocked)
 457         * sequence number can be removed from the tree.
 458         */
 459        tree_mod_log_write_lock(fs_info);
 460        tm_root = &fs_info->tree_mod_log;
 461        for (node = rb_first(tm_root); node; node = next) {
 462                next = rb_next(node);
 463                tm = container_of(node, struct tree_mod_elem, node);
 464                if (tm->seq > min_seq)
 465                        continue;
 466                rb_erase(node, tm_root);
 467                kfree(tm);
 468        }
 469        tree_mod_log_write_unlock(fs_info);
 470}
 471
 472/*
 473 * key order of the log:
 474 *       index -> sequence
 475 *
 476 * the index is the shifted logical of the *new* root node for root replace
 477 * operations, or the shifted logical of the affected block for all other
 478 * operations.
 479 */
 480static noinline int
 481__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
 482{
 483        struct rb_root *tm_root;
 484        struct rb_node **new;
 485        struct rb_node *parent = NULL;
 486        struct tree_mod_elem *cur;
 487
 488        BUG_ON(!tm || !tm->seq);
 489
 490        tm_root = &fs_info->tree_mod_log;
 491        new = &tm_root->rb_node;
 492        while (*new) {
 493                cur = container_of(*new, struct tree_mod_elem, node);
 494                parent = *new;
 495                if (cur->index < tm->index)
 496                        new = &((*new)->rb_left);
 497                else if (cur->index > tm->index)
 498                        new = &((*new)->rb_right);
 499                else if (cur->seq < tm->seq)
 500                        new = &((*new)->rb_left);
 501                else if (cur->seq > tm->seq)
 502                        new = &((*new)->rb_right);
 503                else {
 504                        kfree(tm);
 505                        return -EEXIST;
 506                }
 507        }
 508
 509        rb_link_node(&tm->node, parent, new);
 510        rb_insert_color(&tm->node, tm_root);
 511        return 0;
 512}
 513
 514/*
 515 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 516 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 517 * this until all tree mod log insertions are recorded in the rb tree and then
 518 * call tree_mod_log_write_unlock() to release.
 519 */
 520static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
 521                                    struct extent_buffer *eb) {
 522        smp_mb();
 523        if (list_empty(&(fs_info)->tree_mod_seq_list))
 524                return 1;
 525        if (eb && btrfs_header_level(eb) == 0)
 526                return 1;
 527
 528        tree_mod_log_write_lock(fs_info);
 529        if (list_empty(&fs_info->tree_mod_seq_list)) {
 530                /*
 531                 * someone emptied the list while we were waiting for the lock.
 532                 * we must not add to the list when no blocker exists.
 533                 */
 534                tree_mod_log_write_unlock(fs_info);
 535                return 1;
 536        }
 537
 538        return 0;
 539}
 540
 541/*
 542 * This allocates memory and gets a tree modification sequence number.
 543 *
 544 * Returns <0 on error.
 545 * Returns >0 (the added sequence number) on success.
 546 */
 547static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
 548                                 struct tree_mod_elem **tm_ret)
 549{
 550        struct tree_mod_elem *tm;
 551
 552        /*
 553         * once we switch from spin locks to something different, we should
 554         * honor the flags parameter here.
 555         */
 556        tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC);
 557        if (!tm)
 558                return -ENOMEM;
 559
 560        spin_lock(&fs_info->tree_mod_seq_lock);
 561        tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
 562        spin_unlock(&fs_info->tree_mod_seq_lock);
 563
 564        return tm->seq;
 565}
 566
 567static inline int
 568__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
 569                          struct extent_buffer *eb, int slot,
 570                          enum mod_log_op op, gfp_t flags)
 571{
 572        int ret;
 573        struct tree_mod_elem *tm;
 574
 575        ret = tree_mod_alloc(fs_info, flags, &tm);
 576        if (ret < 0)
 577                return ret;
 578
 579        tm->index = eb->start >> PAGE_CACHE_SHIFT;
 580        if (op != MOD_LOG_KEY_ADD) {
 581                btrfs_node_key(eb, &tm->key, slot);
 582                tm->blockptr = btrfs_node_blockptr(eb, slot);
 583        }
 584        tm->op = op;
 585        tm->slot = slot;
 586        tm->generation = btrfs_node_ptr_generation(eb, slot);
 587
 588        return __tree_mod_log_insert(fs_info, tm);
 589}
 590
 591static noinline int
 592tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
 593                             struct extent_buffer *eb, int slot,
 594                             enum mod_log_op op, gfp_t flags)
 595{
 596        int ret;
 597
 598        if (tree_mod_dont_log(fs_info, eb))
 599                return 0;
 600
 601        ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);
 602
 603        tree_mod_log_write_unlock(fs_info);
 604        return ret;
 605}
 606
 607static noinline int
 608tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
 609                        int slot, enum mod_log_op op)
 610{
 611        return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
 612}
 613
 614static noinline int
 615tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info,
 616                             struct extent_buffer *eb, int slot,
 617                             enum mod_log_op op)
 618{
 619        return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS);
 620}
 621
 622static noinline int
 623tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
 624                         struct extent_buffer *eb, int dst_slot, int src_slot,
 625                         int nr_items, gfp_t flags)
 626{
 627        struct tree_mod_elem *tm;
 628        int ret;
 629        int i;
 630
 631        if (tree_mod_dont_log(fs_info, eb))
 632                return 0;
 633
 634        /*
 635         * When we override something during the move, we log these removals.
 636         * This can only happen when we move towards the beginning of the
 637         * buffer, i.e. dst_slot < src_slot.
 638         */
 639        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
 640                ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,
 641                                              MOD_LOG_KEY_REMOVE_WHILE_MOVING);
 642                BUG_ON(ret < 0);
 643        }
 644
 645        ret = tree_mod_alloc(fs_info, flags, &tm);
 646        if (ret < 0)
 647                goto out;
 648
 649        tm->index = eb->start >> PAGE_CACHE_SHIFT;
 650        tm->slot = src_slot;
 651        tm->move.dst_slot = dst_slot;
 652        tm->move.nr_items = nr_items;
 653        tm->op = MOD_LOG_MOVE_KEYS;
 654
 655        ret = __tree_mod_log_insert(fs_info, tm);
 656out:
 657        tree_mod_log_write_unlock(fs_info);
 658        return ret;
 659}
 660
 661static inline void
 662__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
 663{
 664        int i;
 665        u32 nritems;
 666        int ret;
 667
 668        if (btrfs_header_level(eb) == 0)
 669                return;
 670
 671        nritems = btrfs_header_nritems(eb);
 672        for (i = nritems - 1; i >= 0; i--) {
 673                ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
 674                                              MOD_LOG_KEY_REMOVE_WHILE_FREEING);
 675                BUG_ON(ret < 0);
 676        }
 677}
 678
 679static noinline int
 680tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 681                         struct extent_buffer *old_root,
 682                         struct extent_buffer *new_root, gfp_t flags,
 683                         int log_removal)
 684{
 685        struct tree_mod_elem *tm;
 686        int ret;
 687
 688        if (tree_mod_dont_log(fs_info, NULL))
 689                return 0;
 690
 691        if (log_removal)
 692                __tree_mod_log_free_eb(fs_info, old_root);
 693
 694        ret = tree_mod_alloc(fs_info, flags, &tm);
 695        if (ret < 0)
 696                goto out;
 697
 698        tm->index = new_root->start >> PAGE_CACHE_SHIFT;
 699        tm->old_root.logical = old_root->start;
 700        tm->old_root.level = btrfs_header_level(old_root);
 701        tm->generation = btrfs_header_generation(old_root);
 702        tm->op = MOD_LOG_ROOT_REPLACE;
 703
 704        ret = __tree_mod_log_insert(fs_info, tm);
 705out:
 706        tree_mod_log_write_unlock(fs_info);
 707        return ret;
 708}
 709
 710static struct tree_mod_elem *
 711__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
 712                      int smallest)
 713{
 714        struct rb_root *tm_root;
 715        struct rb_node *node;
 716        struct tree_mod_elem *cur = NULL;
 717        struct tree_mod_elem *found = NULL;
 718        u64 index = start >> PAGE_CACHE_SHIFT;
 719
 720        tree_mod_log_read_lock(fs_info);
 721        tm_root = &fs_info->tree_mod_log;
 722        node = tm_root->rb_node;
 723        while (node) {
 724                cur = container_of(node, struct tree_mod_elem, node);
 725                if (cur->index < index) {
 726                        node = node->rb_left;
 727                } else if (cur->index > index) {
 728                        node = node->rb_right;
 729                } else if (cur->seq < min_seq) {
 730                        node = node->rb_left;
 731                } else if (!smallest) {
 732                        /* we want the node with the highest seq */
 733                        if (found)
 734                                BUG_ON(found->seq > cur->seq);
 735                        found = cur;
 736                        node = node->rb_left;
 737                } else if (cur->seq > min_seq) {
 738                        /* we want the node with the smallest seq */
 739                        if (found)
 740                                BUG_ON(found->seq < cur->seq);
 741                        found = cur;
 742                        node = node->rb_right;
 743                } else {
 744                        found = cur;
 745                        break;
 746                }
 747        }
 748        tree_mod_log_read_unlock(fs_info);
 749
 750        return found;
 751}
 752
 753/*
 754 * this returns the element from the log with the smallest time sequence
 755 * value that's in the log (the oldest log item). any element with a time
 756 * sequence lower than min_seq will be ignored.
 757 */
 758static struct tree_mod_elem *
 759tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
 760                           u64 min_seq)
 761{
 762        return __tree_mod_log_search(fs_info, start, min_seq, 1);
 763}
 764
 765/*
 766 * this returns the element from the log with the largest time sequence
 767 * value that's in the log (the most recent log item). any element with
 768 * a time sequence lower than min_seq will be ignored.
 769 */
 770static struct tree_mod_elem *
 771tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
 772{
 773        return __tree_mod_log_search(fs_info, start, min_seq, 0);
 774}
 775
 776static noinline void
 777tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 778                     struct extent_buffer *src, unsigned long dst_offset,
 779                     unsigned long src_offset, int nr_items)
 780{
 781        int ret;
 782        int i;
 783
 784        if (tree_mod_dont_log(fs_info, NULL))
 785                return;
 786
 787        if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) {
 788                tree_mod_log_write_unlock(fs_info);
 789                return;
 790        }
 791
 792        for (i = 0; i < nr_items; i++) {
 793                ret = tree_mod_log_insert_key_locked(fs_info, src,
 794                                                i + src_offset,
 795                                                MOD_LOG_KEY_REMOVE);
 796                BUG_ON(ret < 0);
 797                ret = tree_mod_log_insert_key_locked(fs_info, dst,
 798                                                     i + dst_offset,
 799                                                     MOD_LOG_KEY_ADD);
 800                BUG_ON(ret < 0);
 801        }
 802
 803        tree_mod_log_write_unlock(fs_info);
 804}
 805
 806static inline void
 807tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 808                     int dst_offset, int src_offset, int nr_items)
 809{
 810        int ret;
 811        ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
 812                                       nr_items, GFP_NOFS);
 813        BUG_ON(ret < 0);
 814}
 815
 816static noinline void
 817tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
 818                          struct extent_buffer *eb, int slot, int atomic)
 819{
 820        int ret;
 821
 822        ret = tree_mod_log_insert_key_mask(fs_info, eb, slot,
 823                                           MOD_LOG_KEY_REPLACE,
 824                                           atomic ? GFP_ATOMIC : GFP_NOFS);
 825        BUG_ON(ret < 0);
 826}
 827
 828static noinline void
 829tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
 830{
 831        if (tree_mod_dont_log(fs_info, eb))
 832                return;
 833
 834        __tree_mod_log_free_eb(fs_info, eb);
 835
 836        tree_mod_log_write_unlock(fs_info);
 837}
 838
 839static noinline void
 840tree_mod_log_set_root_pointer(struct btrfs_root *root,
 841                              struct extent_buffer *new_root_node,
 842                              int log_removal)
 843{
 844        int ret;
 845        ret = tree_mod_log_insert_root(root->fs_info, root->node,
 846                                       new_root_node, GFP_NOFS, log_removal);
 847        BUG_ON(ret < 0);
 848}
 849
 850/*
 851 * check if the tree block can be shared by multiple trees
 852 */
 853int btrfs_block_can_be_shared(struct btrfs_root *root,
 854                              struct extent_buffer *buf)
 855{
 856        /*
 857         * Tree blocks not in refernece counted trees and tree roots
 858         * are never shared. If a block was allocated after the last
 859         * snapshot and the block was not allocated by tree relocation,
 860         * we know the block is not shared.
 861         */
 862        if (root->ref_cows &&
 863            buf != root->node && buf != root->commit_root &&
 864            (btrfs_header_generation(buf) <=
 865             btrfs_root_last_snapshot(&root->root_item) ||
 866             btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
 867                return 1;
 868#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 869        if (root->ref_cows &&
 870            btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
 871                return 1;
 872#endif
 873        return 0;
 874}
 875
 876static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 877                                       struct btrfs_root *root,
 878                                       struct extent_buffer *buf,
 879                                       struct extent_buffer *cow,
 880                                       int *last_ref)
 881{
 882        u64 refs;
 883        u64 owner;
 884        u64 flags;
 885        u64 new_flags = 0;
 886        int ret;
 887
 888        /*
 889         * Backrefs update rules:
 890         *
 891         * Always use full backrefs for extent pointers in tree block
 892         * allocated by tree relocation.
 893         *
 894         * If a shared tree block is no longer referenced by its owner
 895         * tree (btrfs_header_owner(buf) == root->root_key.objectid),
 896         * use full backrefs for extent pointers in tree block.
 897         *
 898         * If a tree block is been relocating
 899         * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
 900         * use full backrefs for extent pointers in tree block.
 901         * The reason for this is some operations (such as drop tree)
 902         * are only allowed for blocks use full backrefs.
 903         */
 904
 905        if (btrfs_block_can_be_shared(root, buf)) {
 906                ret = btrfs_lookup_extent_info(trans, root, buf->start,
 907                                               btrfs_header_level(buf), 1,
 908                                               &refs, &flags);
 909                if (ret)
 910                        return ret;
 911                if (refs == 0) {
 912                        ret = -EROFS;
 913                        btrfs_std_error(root->fs_info, ret);
 914                        return ret;
 915                }
 916        } else {
 917                refs = 1;
 918                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
 919                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
 920                        flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
 921                else
 922                        flags = 0;
 923        }
 924
 925        owner = btrfs_header_owner(buf);
 926        BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
 927               !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
 928
 929        if (refs > 1) {
 930                if ((owner == root->root_key.objectid ||
 931                     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
 932                    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
 933                        ret = btrfs_inc_ref(trans, root, buf, 1, 1);
 934                        BUG_ON(ret); /* -ENOMEM */
 935
 936                        if (root->root_key.objectid ==
 937                            BTRFS_TREE_RELOC_OBJECTID) {
 938                                ret = btrfs_dec_ref(trans, root, buf, 0, 1);
 939                                BUG_ON(ret); /* -ENOMEM */
 940                                ret = btrfs_inc_ref(trans, root, cow, 1, 1);
 941                                BUG_ON(ret); /* -ENOMEM */
 942                        }
 943                        new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
 944                } else {
 945
 946                        if (root->root_key.objectid ==
 947                            BTRFS_TREE_RELOC_OBJECTID)
 948                                ret = btrfs_inc_ref(trans, root, cow, 1, 1);
 949                        else
 950                                ret = btrfs_inc_ref(trans, root, cow, 0, 1);
 951                        BUG_ON(ret); /* -ENOMEM */
 952                }
 953                if (new_flags != 0) {
 954                        int level = btrfs_header_level(buf);
 955
 956                        ret = btrfs_set_disk_extent_flags(trans, root,
 957                                                          buf->start,
 958                                                          buf->len,
 959                                                          new_flags, level, 0);
 960                        if (ret)
 961                                return ret;
 962                }
 963        } else {
 964                if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
 965                        if (root->root_key.objectid ==
 966                            BTRFS_TREE_RELOC_OBJECTID)
 967                                ret = btrfs_inc_ref(trans, root, cow, 1, 1);
 968                        else
 969                                ret = btrfs_inc_ref(trans, root, cow, 0, 1);
 970                        BUG_ON(ret); /* -ENOMEM */
 971                        ret = btrfs_dec_ref(trans, root, buf, 1, 1);
 972                        BUG_ON(ret); /* -ENOMEM */
 973                }
 974                clean_tree_block(trans, root, buf);
 975                *last_ref = 1;
 976        }
 977        return 0;
 978}
 979
 980/*
 981 * does the dirty work in cow of a single block.  The parent block (if
 982 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 983 * dirty and returned locked.  If you modify the block it needs to be marked
 984 * dirty again.
 985 *
 986 * search_start -- an allocation hint for the new block
 987 *
 988 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 989 * bytes the allocator should try to find free next to the block it returns.
 990 * This is just a hint and may be ignored by the allocator.
 991 */
 992static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 993                             struct btrfs_root *root,
 994                             struct extent_buffer *buf,
 995                             struct extent_buffer *parent, int parent_slot,
 996                             struct extent_buffer **cow_ret,
 997                             u64 search_start, u64 empty_size)
 998{
 999        struct btrfs_disk_key disk_key;
1000        struct extent_buffer *cow;
1001        int level, ret;
1002        int last_ref = 0;
1003        int unlock_orig = 0;
1004        u64 parent_start;
1005
1006        if (*cow_ret == buf)
1007                unlock_orig = 1;
1008
1009        btrfs_assert_tree_locked(buf);
1010
1011        WARN_ON(root->ref_cows && trans->transid !=
1012                root->fs_info->running_transaction->transid);
1013        WARN_ON(root->ref_cows && trans->transid != root->last_trans);
1014
1015        level = btrfs_header_level(buf);
1016
1017        if (level == 0)
1018                btrfs_item_key(buf, &disk_key, 0);
1019        else
1020                btrfs_node_key(buf, &disk_key, 0);
1021
1022        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1023                if (parent)
1024                        parent_start = parent->start;
1025                else
1026                        parent_start = 0;
1027        } else
1028                parent_start = 0;
1029
1030        cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
1031                                     root->root_key.objectid, &disk_key,
1032                                     level, search_start, empty_size);
1033        if (IS_ERR(cow))
1034                return PTR_ERR(cow);
1035
1036        /* cow is set to blocking by btrfs_init_new_buffer */
1037
1038        copy_extent_buffer(cow, buf, 0, 0, cow->len);
1039        btrfs_set_header_bytenr(cow, cow->start);
1040        btrfs_set_header_generation(cow, trans->transid);
1041        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1042        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
1043                                     BTRFS_HEADER_FLAG_RELOC);
1044        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1045                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
1046        else
1047                btrfs_set_header_owner(cow, root->root_key.objectid);
1048
1049        write_extent_buffer(cow, root->fs_info->fsid,
1050                            (unsigned long)btrfs_header_fsid(cow),
1051                            BTRFS_FSID_SIZE);
1052
1053        ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1054        if (ret) {
1055                btrfs_abort_transaction(trans, root, ret);
1056                return ret;
1057        }
1058
1059        if (root->ref_cows)
1060                btrfs_reloc_cow_block(trans, root, buf, cow);
1061
1062        if (buf == root->node) {
1063                WARN_ON(parent && parent != buf);
1064                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1065                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1066                        parent_start = buf->start;
1067                else
1068                        parent_start = 0;
1069
1070                extent_buffer_get(cow);
1071                tree_mod_log_set_root_pointer(root, cow, 1);
1072                rcu_assign_pointer(root->node, cow);
1073
1074                btrfs_free_tree_block(trans, root, buf, parent_start,
1075                                      last_ref);
1076                free_extent_buffer(buf);
1077                add_root_to_dirty_list(root);
1078        } else {
1079                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1080                        parent_start = parent->start;
1081                else
1082                        parent_start = 0;
1083
1084                WARN_ON(trans->transid != btrfs_header_generation(parent));
1085                tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
1086                                        MOD_LOG_KEY_REPLACE);
1087                btrfs_set_node_blockptr(parent, parent_slot,
1088                                        cow->start);
1089                btrfs_set_node_ptr_generation(parent, parent_slot,
1090                                              trans->transid);
1091                btrfs_mark_buffer_dirty(parent);
1092                tree_mod_log_free_eb(root->fs_info, buf);
1093                btrfs_free_tree_block(trans, root, buf, parent_start,
1094                                      last_ref);
1095        }
1096        if (unlock_orig)
1097                btrfs_tree_unlock(buf);
1098        free_extent_buffer_stale(buf);
1099        btrfs_mark_buffer_dirty(cow);
1100        *cow_ret = cow;
1101        return 0;
1102}
1103
1104/*
1105 * returns the logical address of the oldest predecessor of the given root.
1106 * entries older than time_seq are ignored.
1107 */
1108static struct tree_mod_elem *
1109__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1110                           struct extent_buffer *eb_root, u64 time_seq)
1111{
1112        struct tree_mod_elem *tm;
1113        struct tree_mod_elem *found = NULL;
1114        u64 root_logical = eb_root->start;
1115        int looped = 0;
1116
1117        if (!time_seq)
1118                return 0;
1119
1120        /*
1121         * the very last operation that's logged for a root is the replacement
1122         * operation (if it is replaced at all). this has the index of the *new*
1123         * root, making it the very first operation that's logged for this root.
1124         */
1125        while (1) {
1126                tm = tree_mod_log_search_oldest(fs_info, root_logical,
1127                                                time_seq);
1128                if (!looped && !tm)
1129                        return 0;
1130                /*
1131                 * if there are no tree operation for the oldest root, we simply
1132                 * return it. this should only happen if that (old) root is at
1133                 * level 0.
1134                 */
1135                if (!tm)
1136                        break;
1137
1138                /*
1139                 * if there's an operation that's not a root replacement, we
1140                 * found the oldest version of our root. normally, we'll find a
1141                 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1142                 */
1143                if (tm->op != MOD_LOG_ROOT_REPLACE)
1144                        break;
1145
1146                found = tm;
1147                root_logical = tm->old_root.logical;
1148                looped = 1;
1149        }
1150
1151        /* if there's no old root to return, return what we found instead */
1152        if (!found)
1153                found = tm;
1154
1155        return found;
1156}
1157
1158/*
1159 * tm is a pointer to the first operation to rewind within eb. then, all
1160 * previous operations will be rewinded (until we reach something older than
1161 * time_seq).
1162 */
1163static void
1164__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1165                      struct tree_mod_elem *first_tm)
1166{
1167        u32 n;
1168        struct rb_node *next;
1169        struct tree_mod_elem *tm = first_tm;
1170        unsigned long o_dst;
1171        unsigned long o_src;
1172        unsigned long p_size = sizeof(struct btrfs_key_ptr);
1173
1174        n = btrfs_header_nritems(eb);
1175        while (tm && tm->seq >= time_seq) {
1176                /*
1177                 * all the operations are recorded with the operator used for
1178                 * the modification. as we're going backwards, we do the
1179                 * opposite of each operation here.
1180                 */
1181                switch (tm->op) {
1182                case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1183                        BUG_ON(tm->slot < n);
1184                        /* Fallthrough */
1185                case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1186                case MOD_LOG_KEY_REMOVE:
1187                        btrfs_set_node_key(eb, &tm->key, tm->slot);
1188                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1189                        btrfs_set_node_ptr_generation(eb, tm->slot,
1190                                                      tm->generation);
1191                        n++;
1192                        break;
1193                case MOD_LOG_KEY_REPLACE:
1194                        BUG_ON(tm->slot >= n);
1195                        btrfs_set_node_key(eb, &tm->key, tm->slot);
1196                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1197                        btrfs_set_node_ptr_generation(eb, tm->slot,
1198                                                      tm->generation);
1199                        break;
1200                case MOD_LOG_KEY_ADD:
1201                        /* if a move operation is needed it's in the log */
1202                        n--;
1203                        break;
1204                case MOD_LOG_MOVE_KEYS:
1205                        o_dst = btrfs_node_key_ptr_offset(tm->slot);
1206                        o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1207                        memmove_extent_buffer(eb, o_dst, o_src,
1208                                              tm->move.nr_items * p_size);
1209                        break;
1210                case MOD_LOG_ROOT_REPLACE:
1211                        /*
1212                         * this operation is special. for roots, this must be
1213                         * handled explicitly before rewinding.
1214                         * for non-roots, this operation may exist if the node
1215                         * was a root: root A -> child B; then A gets empty and
1216                         * B is promoted to the new root. in the mod log, we'll
1217                         * have a root-replace operation for B, a tree block
1218                         * that is no root. we simply ignore that operation.
1219                         */
1220                        break;
1221                }
1222                next = rb_next(&tm->node);
1223                if (!next)
1224                        break;
1225                tm = container_of(next, struct tree_mod_elem, node);
1226                if (tm->index != first_tm->index)
1227                        break;
1228        }
1229        btrfs_set_header_nritems(eb, n);
1230}
1231
1232/*
1233 * Called with eb read locked. If the buffer cannot be rewinded, the same buffer
1234 * is returned. If rewind operations happen, a fresh buffer is returned. The
1235 * returned buffer is always read-locked. If the returned buffer is not the
1236 * input buffer, the lock on the input buffer is released and the input buffer
1237 * is freed (its refcount is decremented).
1238 */
1239static struct extent_buffer *
1240tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1241                    u64 time_seq)
1242{
1243        struct extent_buffer *eb_rewin;
1244        struct tree_mod_elem *tm;
1245
1246        if (!time_seq)
1247                return eb;
1248
1249        if (btrfs_header_level(eb) == 0)
1250                return eb;
1251
1252        tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1253        if (!tm)
1254                return eb;
1255
1256        if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1257                BUG_ON(tm->slot != 0);
1258                eb_rewin = alloc_dummy_extent_buffer(eb->start,
1259                                                fs_info->tree_root->nodesize);
1260                BUG_ON(!eb_rewin);
1261                btrfs_set_header_bytenr(eb_rewin, eb->start);
1262                btrfs_set_header_backref_rev(eb_rewin,
1263                                             btrfs_header_backref_rev(eb));
1264                btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1265                btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1266        } else {
1267                eb_rewin = btrfs_clone_extent_buffer(eb);
1268                BUG_ON(!eb_rewin);
1269        }
1270
1271        extent_buffer_get(eb_rewin);
1272        btrfs_tree_read_unlock(eb);
1273        free_extent_buffer(eb);
1274
1275        extent_buffer_get(eb_rewin);
1276        btrfs_tree_read_lock(eb_rewin);
1277        __tree_mod_log_rewind(eb_rewin, time_seq, tm);
1278        WARN_ON(btrfs_header_nritems(eb_rewin) >
1279                BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
1280
1281        return eb_rewin;
1282}
1283
1284/*
1285 * get_old_root() rewinds the state of @root's root node to the given @time_seq
1286 * value. If there are no changes, the current root->root_node is returned. If
1287 * anything changed in between, there's a fresh buffer allocated on which the
1288 * rewind operations are done. In any case, the returned buffer is read locked.
1289 * Returns NULL on error (with no locks held).
1290 */
1291static inline struct extent_buffer *
1292get_old_root(struct btrfs_root *root, u64 time_seq)
1293{
1294        struct tree_mod_elem *tm;
1295        struct extent_buffer *eb = NULL;
1296        struct extent_buffer *eb_root;
1297        struct extent_buffer *old;
1298        struct tree_mod_root *old_root = NULL;
1299        u64 old_generation = 0;
1300        u64 logical;
1301        u32 blocksize;
1302
1303        eb_root = btrfs_read_lock_root_node(root);
1304        tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1305        if (!tm)
1306                return eb_root;
1307
1308        if (tm->op == MOD_LOG_ROOT_REPLACE) {
1309                old_root = &tm->old_root;
1310                old_generation = tm->generation;
1311                logical = old_root->logical;
1312        } else {
1313                logical = eb_root->start;
1314        }
1315
1316        tm = tree_mod_log_search(root->fs_info, logical, time_seq);
1317        if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1318                btrfs_tree_read_unlock(eb_root);
1319                free_extent_buffer(eb_root);
1320                blocksize = btrfs_level_size(root, old_root->level);
1321                old = read_tree_block(root, logical, blocksize, 0);
1322                if (!old || !extent_buffer_uptodate(old)) {
1323                        free_extent_buffer(old);
1324                        pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
1325                                logical);
1326                        WARN_ON(1);
1327                } else {
1328                        eb = btrfs_clone_extent_buffer(old);
1329                        free_extent_buffer(old);
1330                }
1331        } else if (old_root) {
1332                btrfs_tree_read_unlock(eb_root);
1333                free_extent_buffer(eb_root);
1334                eb = alloc_dummy_extent_buffer(logical, root->nodesize);
1335        } else {
1336                eb = btrfs_clone_extent_buffer(eb_root);
1337                btrfs_tree_read_unlock(eb_root);
1338                free_extent_buffer(eb_root);
1339        }
1340
1341        if (!eb)
1342                return NULL;
1343        extent_buffer_get(eb);
1344        btrfs_tree_read_lock(eb);
1345        if (old_root) {
1346                btrfs_set_header_bytenr(eb, eb->start);
1347                btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1348                btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
1349                btrfs_set_header_level(eb, old_root->level);
1350                btrfs_set_header_generation(eb, old_generation);
1351        }
1352        if (tm)
1353                __tree_mod_log_rewind(eb, time_seq, tm);
1354        else
1355                WARN_ON(btrfs_header_level(eb) != 0);
1356        WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
1357
1358        return eb;
1359}
1360
1361int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1362{
1363        struct tree_mod_elem *tm;
1364        int level;
1365        struct extent_buffer *eb_root = btrfs_root_node(root);
1366
1367        tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1368        if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1369                level = tm->old_root.level;
1370        } else {
1371                level = btrfs_header_level(eb_root);
1372        }
1373        free_extent_buffer(eb_root);
1374
1375        return level;
1376}
1377
1378static inline int should_cow_block(struct btrfs_trans_handle *trans,
1379                                   struct btrfs_root *root,
1380                                   struct extent_buffer *buf)
1381{
1382        /* ensure we can see the force_cow */
1383        smp_rmb();
1384
1385        /*
1386         * We do not need to cow a block if
1387         * 1) this block is not created or changed in this transaction;
1388         * 2) this block does not belong to TREE_RELOC tree;
1389         * 3) the root is not forced COW.
1390         *
1391         * What is forced COW:
1392         *    when we create snapshot during commiting the transaction,
1393         *    after we've finished coping src root, we must COW the shared
1394         *    block to ensure the metadata consistency.
1395         */
1396        if (btrfs_header_generation(buf) == trans->transid &&
1397            !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1398            !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1399              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1400            !root->force_cow)
1401                return 0;
1402        return 1;
1403}
1404
1405/*
1406 * cows a single block, see __btrfs_cow_block for the real work.
1407 * This version of it has extra checks so that a block isn't cow'd more than
1408 * once per transaction, as long as it hasn't been written yet
1409 */
1410noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1411                    struct btrfs_root *root, struct extent_buffer *buf,
1412                    struct extent_buffer *parent, int parent_slot,
1413                    struct extent_buffer **cow_ret)
1414{
1415        u64 search_start;
1416        int ret;
1417
1418        if (trans->transaction != root->fs_info->running_transaction)
1419                WARN(1, KERN_CRIT "trans %llu running %llu\n",
1420                       (unsigned long long)trans->transid,
1421                       (unsigned long long)
1422                       root->fs_info->running_transaction->transid);
1423
1424        if (trans->transid != root->fs_info->generation)
1425                WARN(1, KERN_CRIT "trans %llu running %llu\n",
1426                       (unsigned long long)trans->transid,
1427                       (unsigned long long)root->fs_info->generation);
1428
1429        if (!should_cow_block(trans, root, buf)) {
1430                *cow_ret = buf;
1431                return 0;
1432        }
1433
1434        search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
1435
1436        if (parent)
1437                btrfs_set_lock_blocking(parent);
1438        btrfs_set_lock_blocking(buf);
1439
1440        ret = __btrfs_cow_block(trans, root, buf, parent,
1441                                 parent_slot, cow_ret, search_start, 0);
1442
1443        trace_btrfs_cow_block(root, buf, *cow_ret);
1444
1445        return ret;
1446}
1447
1448/*
1449 * helper function for defrag to decide if two blocks pointed to by a
1450 * node are actually close by
1451 */
1452static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1453{
1454        if (blocknr < other && other - (blocknr + blocksize) < 32768)
1455                return 1;
1456        if (blocknr > other && blocknr - (other + blocksize) < 32768)
1457                return 1;
1458        return 0;
1459}
1460
1461/*
1462 * compare two keys in a memcmp fashion
1463 */
1464static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
1465{
1466        struct btrfs_key k1;
1467
1468        btrfs_disk_key_to_cpu(&k1, disk);
1469
1470        return btrfs_comp_cpu_keys(&k1, k2);
1471}
1472
1473/*
1474 * same as comp_keys only with two btrfs_key's
1475 */
1476int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
1477{
1478        if (k1->objectid > k2->objectid)
1479                return 1;
1480        if (k1->objectid < k2->objectid)
1481                return -1;
1482        if (k1->type > k2->type)
1483                return 1;
1484        if (k1->type < k2->type)
1485                return -1;
1486        if (k1->offset > k2->offset)
1487                return 1;
1488        if (k1->offset < k2->offset)
1489                return -1;
1490        return 0;
1491}
1492
1493/*
1494 * this is used by the defrag code to go through all the
1495 * leaves pointed to by a node and reallocate them so that
1496 * disk order is close to key order
1497 */
1498int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1499                       struct btrfs_root *root, struct extent_buffer *parent,
1500                       int start_slot, u64 *last_ret,
1501                       struct btrfs_key *progress)
1502{
1503        struct extent_buffer *cur;
1504        u64 blocknr;
1505        u64 gen;
1506        u64 search_start = *last_ret;
1507        u64 last_block = 0;
1508        u64 other;
1509        u32 parent_nritems;
1510        int end_slot;
1511        int i;
1512        int err = 0;
1513        int parent_level;
1514        int uptodate;
1515        u32 blocksize;
1516        int progress_passed = 0;
1517        struct btrfs_disk_key disk_key;
1518
1519        parent_level = btrfs_header_level(parent);
1520
1521        WARN_ON(trans->transaction != root->fs_info->running_transaction);
1522        WARN_ON(trans->transid != root->fs_info->generation);
1523
1524        parent_nritems = btrfs_header_nritems(parent);
1525        blocksize = btrfs_level_size(root, parent_level - 1);
1526        end_slot = parent_nritems;
1527
1528        if (parent_nritems == 1)
1529                return 0;
1530
1531        btrfs_set_lock_blocking(parent);
1532
1533        for (i = start_slot; i < end_slot; i++) {
1534                int close = 1;
1535
1536                btrfs_node_key(parent, &disk_key, i);
1537                if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1538                        continue;
1539
1540                progress_passed = 1;
1541                blocknr = btrfs_node_blockptr(parent, i);
1542                gen = btrfs_node_ptr_generation(parent, i);
1543                if (last_block == 0)
1544                        last_block = blocknr;
1545
1546                if (i > 0) {
1547                        other = btrfs_node_blockptr(parent, i - 1);
1548                        close = close_blocks(blocknr, other, blocksize);
1549                }
1550                if (!close && i < end_slot - 2) {
1551                        other = btrfs_node_blockptr(parent, i + 1);
1552                        close = close_blocks(blocknr, other, blocksize);
1553                }
1554                if (close) {
1555                        last_block = blocknr;
1556                        continue;
1557                }
1558
1559                cur = btrfs_find_tree_block(root, blocknr, blocksize);
1560                if (cur)
1561                        uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1562                else
1563                        uptodate = 0;
1564                if (!cur || !uptodate) {
1565                        if (!cur) {
1566                                cur = read_tree_block(root, blocknr,
1567                                                         blocksize, gen);
1568                                if (!cur || !extent_buffer_uptodate(cur)) {
1569                                        free_extent_buffer(cur);
1570                                        return -EIO;
1571                                }
1572                        } else if (!uptodate) {
1573                                err = btrfs_read_buffer(cur, gen);
1574                                if (err) {
1575                                        free_extent_buffer(cur);
1576                                        return err;
1577                                }
1578                        }
1579                }
1580                if (search_start == 0)
1581                        search_start = last_block;
1582
1583                btrfs_tree_lock(cur);
1584                btrfs_set_lock_blocking(cur);
1585                err = __btrfs_cow_block(trans, root, cur, parent, i,
1586                                        &cur, search_start,
1587                                        min(16 * blocksize,
1588                                            (end_slot - i) * blocksize));
1589                if (err) {
1590                        btrfs_tree_unlock(cur);
1591                        free_extent_buffer(cur);
1592                        break;
1593                }
1594                search_start = cur->start;
1595                last_block = cur->start;
1596                *last_ret = search_start;
1597                btrfs_tree_unlock(cur);
1598                free_extent_buffer(cur);
1599        }
1600        return err;
1601}
1602
1603/*
1604 * The leaf data grows from end-to-front in the node.
1605 * this returns the address of the start of the last item,
1606 * which is the stop of the leaf data stack
1607 */
1608static inline unsigned int leaf_data_end(struct btrfs_root *root,
1609                                         struct extent_buffer *leaf)
1610{
1611        u32 nr = btrfs_header_nritems(leaf);
1612        if (nr == 0)
1613                return BTRFS_LEAF_DATA_SIZE(root);
1614        return btrfs_item_offset_nr(leaf, nr - 1);
1615}
1616
1617
1618/*
1619 * search for key in the extent_buffer.  The items start at offset p,
1620 * and they are item_size apart.  There are 'max' items in p.
1621 *
1622 * the slot in the array is returned via slot, and it points to
1623 * the place where you would insert key if it is not found in
1624 * the array.
1625 *
1626 * slot may point to max if the key is bigger than all of the keys
1627 */
1628static noinline int generic_bin_search(struct extent_buffer *eb,
1629                                       unsigned long p,
1630                                       int item_size, struct btrfs_key *key,
1631                                       int max, int *slot)
1632{
1633        int low = 0;
1634        int high = max;
1635        int mid;
1636        int ret;
1637        struct btrfs_disk_key *tmp = NULL;
1638        struct btrfs_disk_key unaligned;
1639        unsigned long offset;
1640        char *kaddr = NULL;
1641        unsigned long map_start = 0;
1642        unsigned long map_len = 0;
1643        int err;
1644
1645        while (low < high) {
1646                mid = (low + high) / 2;
1647                offset = p + mid * item_size;
1648
1649                if (!kaddr || offset < map_start ||
1650                    (offset + sizeof(struct btrfs_disk_key)) >
1651                    map_start + map_len) {
1652
1653                        err = map_private_extent_buffer(eb, offset,
1654                                                sizeof(struct btrfs_disk_key),
1655                                                &kaddr, &map_start, &map_len);
1656
1657                        if (!err) {
1658                                tmp = (struct btrfs_disk_key *)(kaddr + offset -
1659                                                        map_start);
1660                        } else {
1661                                read_extent_buffer(eb, &unaligned,
1662                                                   offset, sizeof(unaligned));
1663                                tmp = &unaligned;
1664                        }
1665
1666                } else {
1667                        tmp = (struct btrfs_disk_key *)(kaddr + offset -
1668                                                        map_start);
1669                }
1670                ret = comp_keys(tmp, key);
1671
1672                if (ret < 0)
1673                        low = mid + 1;
1674                else if (ret > 0)
1675                        high = mid;
1676                else {
1677                        *slot = mid;
1678                        return 0;
1679                }
1680        }
1681        *slot = low;
1682        return 1;
1683}
1684
1685/*
1686 * simple bin_search frontend that does the right thing for
1687 * leaves vs nodes
1688 */
1689static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1690                      int level, int *slot)
1691{
1692        if (level == 0)
1693                return generic_bin_search(eb,
1694                                          offsetof(struct btrfs_leaf, items),
1695                                          sizeof(struct btrfs_item),
1696                                          key, btrfs_header_nritems(eb),
1697                                          slot);
1698        else
1699                return generic_bin_search(eb,
1700                                          offsetof(struct btrfs_node, ptrs),
1701                                          sizeof(struct btrfs_key_ptr),
1702                                          key, btrfs_header_nritems(eb),
1703                                          slot);
1704}
1705
1706int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1707                     int level, int *slot)
1708{
1709        return bin_search(eb, key, level, slot);
1710}
1711
1712static void root_add_used(struct btrfs_root *root, u32 size)
1713{
1714        spin_lock(&root->accounting_lock);
1715        btrfs_set_root_used(&root->root_item,
1716                            btrfs_root_used(&root->root_item) + size);
1717        spin_unlock(&root->accounting_lock);
1718}
1719
1720static void root_sub_used(struct btrfs_root *root, u32 size)
1721{
1722        spin_lock(&root->accounting_lock);
1723        btrfs_set_root_used(&root->root_item,
1724                            btrfs_root_used(&root->root_item) - size);
1725        spin_unlock(&root->accounting_lock);
1726}
1727
1728/* given a node and slot number, this reads the blocks it points to.  The
1729 * extent buffer is returned with a reference taken (but unlocked).
1730 * NULL is returned on error.
1731 */
1732static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1733                                   struct extent_buffer *parent, int slot)
1734{
1735        int level = btrfs_header_level(parent);
1736        struct extent_buffer *eb;
1737
1738        if (slot < 0)
1739                return NULL;
1740        if (slot >= btrfs_header_nritems(parent))
1741                return NULL;
1742
1743        BUG_ON(level == 0);
1744
1745        eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
1746                             btrfs_level_size(root, level - 1),
1747                             btrfs_node_ptr_generation(parent, slot));
1748        if (eb && !extent_buffer_uptodate(eb)) {
1749                free_extent_buffer(eb);
1750                eb = NULL;
1751        }
1752
1753        return eb;
1754}
1755
1756/*
1757 * node level balancing, used to make sure nodes are in proper order for
1758 * item deletion.  We balance from the top down, so we have to make sure
1759 * that a deletion won't leave an node completely empty later on.
1760 */
1761static noinline int balance_level(struct btrfs_trans_handle *trans,
1762                         struct btrfs_root *root,
1763                         struct btrfs_path *path, int level)
1764{
1765        struct extent_buffer *right = NULL;
1766        struct extent_buffer *mid;
1767        struct extent_buffer *left = NULL;
1768        struct extent_buffer *parent = NULL;
1769        int ret = 0;
1770        int wret;
1771        int pslot;
1772        int orig_slot = path->slots[level];
1773        u64 orig_ptr;
1774
1775        if (level == 0)
1776                return 0;
1777
1778        mid = path->nodes[level];
1779
1780        WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1781                path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1782        WARN_ON(btrfs_header_generation(mid) != trans->transid);
1783
1784        orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1785
1786        if (level < BTRFS_MAX_LEVEL - 1) {
1787                parent = path->nodes[level + 1];
1788                pslot = path->slots[level + 1];
1789        }
1790
1791        /*
1792         * deal with the case where there is only one pointer in the root
1793         * by promoting the node below to a root
1794         */
1795        if (!parent) {
1796                struct extent_buffer *child;
1797
1798                if (btrfs_header_nritems(mid) != 1)
1799                        return 0;
1800
1801                /* promote the child to a root */
1802                child = read_node_slot(root, mid, 0);
1803                if (!child) {
1804                        ret = -EROFS;
1805                        btrfs_std_error(root->fs_info, ret);
1806                        goto enospc;
1807                }
1808
1809                btrfs_tree_lock(child);
1810                btrfs_set_lock_blocking(child);
1811                ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1812                if (ret) {
1813                        btrfs_tree_unlock(child);
1814                        free_extent_buffer(child);
1815                        goto enospc;
1816                }
1817
1818                tree_mod_log_set_root_pointer(root, child, 1);
1819                rcu_assign_pointer(root->node, child);
1820
1821                add_root_to_dirty_list(root);
1822                btrfs_tree_unlock(child);
1823
1824                path->locks[level] = 0;
1825                path->nodes[level] = NULL;
1826                clean_tree_block(trans, root, mid);
1827                btrfs_tree_unlock(mid);
1828                /* once for the path */
1829                free_extent_buffer(mid);
1830
1831                root_sub_used(root, mid->len);
1832                btrfs_free_tree_block(trans, root, mid, 0, 1);
1833                /* once for the root ptr */
1834                free_extent_buffer_stale(mid);
1835                return 0;
1836        }
1837        if (btrfs_header_nritems(mid) >
1838            BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1839                return 0;
1840
1841        left = read_node_slot(root, parent, pslot - 1);
1842        if (left) {
1843                btrfs_tree_lock(left);
1844                btrfs_set_lock_blocking(left);
1845                wret = btrfs_cow_block(trans, root, left,
1846                                       parent, pslot - 1, &left);
1847                if (wret) {
1848                        ret = wret;
1849                        goto enospc;
1850                }
1851        }
1852        right = read_node_slot(root, parent, pslot + 1);
1853        if (right) {
1854                btrfs_tree_lock(right);
1855                btrfs_set_lock_blocking(right);
1856                wret = btrfs_cow_block(trans, root, right,
1857                                       parent, pslot + 1, &right);
1858                if (wret) {
1859                        ret = wret;
1860                        goto enospc;
1861                }
1862        }
1863
1864        /* first, try to make some room in the middle buffer */
1865        if (left) {
1866                orig_slot += btrfs_header_nritems(left);
1867                wret = push_node_left(trans, root, left, mid, 1);
1868                if (wret < 0)
1869                        ret = wret;
1870        }
1871
1872        /*
1873         * then try to empty the right most buffer into the middle
1874         */
1875        if (right) {
1876                wret = push_node_left(trans, root, mid, right, 1);
1877                if (wret < 0 && wret != -ENOSPC)
1878                        ret = wret;
1879                if (btrfs_header_nritems(right) == 0) {
1880                        clean_tree_block(trans, root, right);
1881                        btrfs_tree_unlock(right);
1882                        del_ptr(root, path, level + 1, pslot + 1);
1883                        root_sub_used(root, right->len);
1884                        btrfs_free_tree_block(trans, root, right, 0, 1);
1885                        free_extent_buffer_stale(right);
1886                        right = NULL;
1887                } else {
1888                        struct btrfs_disk_key right_key;
1889                        btrfs_node_key(right, &right_key, 0);
1890                        tree_mod_log_set_node_key(root->fs_info, parent,
1891                                                  pslot + 1, 0);
1892                        btrfs_set_node_key(parent, &right_key, pslot + 1);
1893                        btrfs_mark_buffer_dirty(parent);
1894                }
1895        }
1896        if (btrfs_header_nritems(mid) == 1) {
1897                /*
1898                 * we're not allowed to leave a node with one item in the
1899                 * tree during a delete.  A deletion from lower in the tree
1900                 * could try to delete the only pointer in this node.
1901                 * So, pull some keys from the left.
1902                 * There has to be a left pointer at this point because
1903                 * otherwise we would have pulled some pointers from the
1904                 * right
1905                 */
1906                if (!left) {
1907                        ret = -EROFS;
1908                        btrfs_std_error(root->fs_info, ret);
1909                        goto enospc;
1910                }
1911                wret = balance_node_right(trans, root, mid, left);
1912                if (wret < 0) {
1913                        ret = wret;
1914                        goto enospc;
1915                }
1916                if (wret == 1) {
1917                        wret = push_node_left(trans, root, left, mid, 1);
1918                        if (wret < 0)
1919                                ret = wret;
1920                }
1921                BUG_ON(wret == 1);
1922        }
1923        if (btrfs_header_nritems(mid) == 0) {
1924                clean_tree_block(trans, root, mid);
1925                btrfs_tree_unlock(mid);
1926                del_ptr(root, path, level + 1, pslot);
1927                root_sub_used(root, mid->len);
1928                btrfs_free_tree_block(trans, root, mid, 0, 1);
1929                free_extent_buffer_stale(mid);
1930                mid = NULL;
1931        } else {
1932                /* update the parent key to reflect our changes */
1933                struct btrfs_disk_key mid_key;
1934                btrfs_node_key(mid, &mid_key, 0);
1935                tree_mod_log_set_node_key(root->fs_info, parent,
1936                                          pslot, 0);
1937                btrfs_set_node_key(parent, &mid_key, pslot);
1938                btrfs_mark_buffer_dirty(parent);
1939        }
1940
1941        /* update the path */
1942        if (left) {
1943                if (btrfs_header_nritems(left) > orig_slot) {
1944                        extent_buffer_get(left);
1945                        /* left was locked after cow */
1946                        path->nodes[level] = left;
1947                        path->slots[level + 1] -= 1;
1948                        path->slots[level] = orig_slot;
1949                        if (mid) {
1950                                btrfs_tree_unlock(mid);
1951                                free_extent_buffer(mid);
1952                        }
1953                } else {
1954                        orig_slot -= btrfs_header_nritems(left);
1955                        path->slots[level] = orig_slot;
1956                }
1957        }
1958        /* double check we haven't messed things up */
1959        if (orig_ptr !=
1960            btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1961                BUG();
1962enospc:
1963        if (right) {
1964                btrfs_tree_unlock(right);
1965                free_extent_buffer(right);
1966        }
1967        if (left) {
1968                if (path->nodes[level] != left)
1969                        btrfs_tree_unlock(left);
1970                free_extent_buffer(left);
1971        }
1972        return ret;
1973}
1974
1975/* Node balancing for insertion.  Here we only split or push nodes around
1976 * when they are completely full.  This is also done top down, so we
1977 * have to be pessimistic.
1978 */
1979static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1980                                          struct btrfs_root *root,
1981                                          struct btrfs_path *path, int level)
1982{
1983        struct extent_buffer *right = NULL;
1984        struct extent_buffer *mid;
1985        struct extent_buffer *left = NULL;
1986        struct extent_buffer *parent = NULL;
1987        int ret = 0;
1988        int wret;
1989        int pslot;
1990        int orig_slot = path->slots[level];
1991
1992        if (level == 0)
1993                return 1;
1994
1995        mid = path->nodes[level];
1996        WARN_ON(btrfs_header_generation(mid) != trans->transid);
1997
1998        if (level < BTRFS_MAX_LEVEL - 1) {
1999                parent = path->nodes[level + 1];
2000                pslot = path->slots[level + 1];
2001        }
2002
2003        if (!parent)
2004                return 1;
2005
2006        left = read_node_slot(root, parent, pslot - 1);
2007
2008        /* first, try to make some room in the middle buffer */
2009        if (left) {
2010                u32 left_nr;
2011
2012                btrfs_tree_lock(left);
2013                btrfs_set_lock_blocking(left);
2014
2015                left_nr = btrfs_header_nritems(left);
2016                if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2017                        wret = 1;
2018                } else {
2019                        ret = btrfs_cow_block(trans, root, left, parent,
2020                                              pslot - 1, &left);
2021                        if (ret)
2022                                wret = 1;
2023                        else {
2024                                wret = push_node_left(trans, root,
2025                                                      left, mid, 0);
2026                        }
2027                }
2028                if (wret < 0)
2029                        ret = wret;
2030                if (wret == 0) {
2031                        struct btrfs_disk_key disk_key;
2032                        orig_slot += left_nr;
2033                        btrfs_node_key(mid, &disk_key, 0);
2034                        tree_mod_log_set_node_key(root->fs_info, parent,
2035                                                  pslot, 0);
2036                        btrfs_set_node_key(parent, &disk_key, pslot);
2037                        btrfs_mark_buffer_dirty(parent);
2038                        if (btrfs_header_nritems(left) > orig_slot) {
2039                                path->nodes[level] = left;
2040                                path->slots[level + 1] -= 1;
2041                                path->slots[level] = orig_slot;
2042                                btrfs_tree_unlock(mid);
2043                                free_extent_buffer(mid);
2044                        } else {
2045                                orig_slot -=
2046                                        btrfs_header_nritems(left);
2047                                path->slots[level] = orig_slot;
2048                                btrfs_tree_unlock(left);
2049                                free_extent_buffer(left);
2050                        }
2051                        return 0;
2052                }
2053                btrfs_tree_unlock(left);
2054                free_extent_buffer(left);
2055        }
2056        right = read_node_slot(root, parent, pslot + 1);
2057
2058        /*
2059         * then try to empty the right most buffer into the middle
2060         */
2061        if (right) {
2062                u32 right_nr;
2063
2064                btrfs_tree_lock(right);
2065                btrfs_set_lock_blocking(right);
2066
2067                right_nr = btrfs_header_nritems(right);
2068                if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2069                        wret = 1;
2070                } else {
2071                        ret = btrfs_cow_block(trans, root, right,
2072                                              parent, pslot + 1,
2073                                              &right);
2074                        if (ret)
2075                                wret = 1;
2076                        else {
2077                                wret = balance_node_right(trans, root,
2078                                                          right, mid);
2079                        }
2080                }
2081                if (wret < 0)
2082                        ret = wret;
2083                if (wret == 0) {
2084                        struct btrfs_disk_key disk_key;
2085
2086                        btrfs_node_key(right, &disk_key, 0);
2087                        tree_mod_log_set_node_key(root->fs_info, parent,
2088                                                  pslot + 1, 0);
2089                        btrfs_set_node_key(parent, &disk_key, pslot + 1);
2090                        btrfs_mark_buffer_dirty(parent);
2091
2092                        if (btrfs_header_nritems(mid) <= orig_slot) {
2093                                path->nodes[level] = right;
2094                                path->slots[level + 1] += 1;
2095                                path->slots[level] = orig_slot -
2096                                        btrfs_header_nritems(mid);
2097                                btrfs_tree_unlock(mid);
2098                                free_extent_buffer(mid);
2099                        } else {
2100                                btrfs_tree_unlock(right);
2101                                free_extent_buffer(right);
2102                        }
2103                        return 0;
2104                }
2105                btrfs_tree_unlock(right);
2106                free_extent_buffer(right);
2107        }
2108        return 1;
2109}
2110
2111/*
2112 * readahead one full node of leaves, finding things that are close
2113 * to the block in 'slot', and triggering ra on them.
2114 */
2115static void reada_for_search(struct btrfs_root *root,
2116                             struct btrfs_path *path,
2117                             int level, int slot, u64 objectid)
2118{
2119        struct extent_buffer *node;
2120        struct btrfs_disk_key disk_key;
2121        u32 nritems;
2122        u64 search;
2123        u64 target;
2124        u64 nread = 0;
2125        u64 gen;
2126        int direction = path->reada;
2127        struct extent_buffer *eb;
2128        u32 nr;
2129        u32 blocksize;
2130        u32 nscan = 0;
2131
2132        if (level != 1)
2133                return;
2134
2135        if (!path->nodes[level])
2136                return;
2137
2138        node = path->nodes[level];
2139
2140        search = btrfs_node_blockptr(node, slot);
2141        blocksize = btrfs_level_size(root, level - 1);
2142        eb = btrfs_find_tree_block(root, search, blocksize);
2143        if (eb) {
2144                free_extent_buffer(eb);
2145                return;
2146        }
2147
2148        target = search;
2149
2150        nritems = btrfs_header_nritems(node);
2151        nr = slot;
2152
2153        while (1) {
2154                if (direction < 0) {
2155                        if (nr == 0)
2156                                break;
2157                        nr--;
2158                } else if (direction > 0) {
2159                        nr++;
2160                        if (nr >= nritems)
2161                                break;
2162                }
2163                if (path->reada < 0 && objectid) {
2164                        btrfs_node_key(node, &disk_key, nr);
2165                        if (btrfs_disk_key_objectid(&disk_key) != objectid)
2166                                break;
2167                }
2168                search = btrfs_node_blockptr(node, nr);
2169                if ((search <= target && target - search <= 65536) ||
2170                    (search > target && search - target <= 65536)) {
2171                        gen = btrfs_node_ptr_generation(node, nr);
2172                        readahead_tree_block(root, search, blocksize, gen);
2173                        nread += blocksize;
2174                }
2175                nscan++;
2176                if ((nread > 65536 || nscan > 32))
2177                        break;
2178        }
2179}
2180
2181/*
2182 * returns -EAGAIN if it had to drop the path, or zero if everything was in
2183 * cache
2184 */
2185static noinline int reada_for_balance(struct btrfs_root *root,
2186                                      struct btrfs_path *path, int level)
2187{
2188        int slot;
2189        int nritems;
2190        struct extent_buffer *parent;
2191        struct extent_buffer *eb;
2192        u64 gen;
2193        u64 block1 = 0;
2194        u64 block2 = 0;
2195        int ret = 0;
2196        int blocksize;
2197
2198        parent = path->nodes[level + 1];
2199        if (!parent)
2200                return 0;
2201
2202        nritems = btrfs_header_nritems(parent);
2203        slot = path->slots[level + 1];
2204        blocksize = btrfs_level_size(root, level);
2205
2206        if (slot > 0) {
2207                block1 = btrfs_node_blockptr(parent, slot - 1);
2208                gen = btrfs_node_ptr_generation(parent, slot - 1);
2209                eb = btrfs_find_tree_block(root, block1, blocksize);
2210                /*
2211                 * if we get -eagain from btrfs_buffer_uptodate, we
2212                 * don't want to return eagain here.  That will loop
2213                 * forever
2214                 */
2215                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2216                        block1 = 0;
2217                free_extent_buffer(eb);
2218        }
2219        if (slot + 1 < nritems) {
2220                block2 = btrfs_node_blockptr(parent, slot + 1);
2221                gen = btrfs_node_ptr_generation(parent, slot + 1);
2222                eb = btrfs_find_tree_block(root, block2, blocksize);
2223                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2224                        block2 = 0;
2225                free_extent_buffer(eb);
2226        }
2227        if (block1 || block2) {
2228                ret = -EAGAIN;
2229
2230                /* release the whole path */
2231                btrfs_release_path(path);
2232
2233                /* read the blocks */
2234                if (block1)
2235                        readahead_tree_block(root, block1, blocksize, 0);
2236                if (block2)
2237                        readahead_tree_block(root, block2, blocksize, 0);
2238
2239                if (block1) {
2240                        eb = read_tree_block(root, block1, blocksize, 0);
2241                        free_extent_buffer(eb);
2242                }
2243                if (block2) {
2244                        eb = read_tree_block(root, block2, blocksize, 0);
2245                        free_extent_buffer(eb);
2246                }
2247        }
2248        return ret;
2249}
2250
2251
2252/*
2253 * when we walk down the tree, it is usually safe to unlock the higher layers
2254 * in the tree.  The exceptions are when our path goes through slot 0, because
2255 * operations on the tree might require changing key pointers higher up in the
2256 * tree.
2257 *
2258 * callers might also have set path->keep_locks, which tells this code to keep
2259 * the lock if the path points to the last slot in the block.  This is part of
2260 * walking through the tree, and selecting the next slot in the higher block.
2261 *
2262 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2263 * if lowest_unlock is 1, level 0 won't be unlocked
2264 */
2265static noinline void unlock_up(struct btrfs_path *path, int level,
2266                               int lowest_unlock, int min_write_lock_level,
2267                               int *write_lock_level)
2268{
2269        int i;
2270        int skip_level = level;
2271        int no_skips = 0;
2272        struct extent_buffer *t;
2273
2274        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2275                if (!path->nodes[i])
2276                        break;
2277                if (!path->locks[i])
2278                        break;
2279                if (!no_skips && path->slots[i] == 0) {
2280                        skip_level = i + 1;
2281                        continue;
2282                }
2283                if (!no_skips && path->keep_locks) {
2284                        u32 nritems;
2285                        t = path->nodes[i];
2286                        nritems = btrfs_header_nritems(t);
2287                        if (nritems < 1 || path->slots[i] >= nritems - 1) {
2288                                skip_level = i + 1;
2289                                continue;
2290                        }
2291                }
2292                if (skip_level < i && i >= lowest_unlock)
2293                        no_skips = 1;
2294
2295                t = path->nodes[i];
2296                if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
2297                        btrfs_tree_unlock_rw(t, path->locks[i]);
2298                        path->locks[i] = 0;
2299                        if (write_lock_level &&
2300                            i > min_write_lock_level &&
2301                            i <= *write_lock_level) {
2302                                *write_lock_level = i - 1;
2303                        }
2304                }
2305        }
2306}
2307
2308/*
2309 * This releases any locks held in the path starting at level and
2310 * going all the way up to the root.
2311 *
2312 * btrfs_search_slot will keep the lock held on higher nodes in a few
2313 * corner cases, such as COW of the block at slot zero in the node.  This
2314 * ignores those rules, and it should only be called when there are no
2315 * more updates to be done higher up in the tree.
2316 */
2317noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2318{
2319        int i;
2320
2321        if (path->keep_locks)
2322                return;
2323
2324        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2325                if (!path->nodes[i])
2326                        continue;
2327                if (!path->locks[i])
2328                        continue;
2329                btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2330                path->locks[i] = 0;
2331        }
2332}
2333
2334/*
2335 * helper function for btrfs_search_slot.  The goal is to find a block
2336 * in cache without setting the path to blocking.  If we find the block
2337 * we return zero and the path is unchanged.
2338 *
2339 * If we can't find the block, we set the path blocking and do some
2340 * reada.  -EAGAIN is returned and the search must be repeated.
2341 */
2342static int
2343read_block_for_search(struct btrfs_trans_handle *trans,
2344                       struct btrfs_root *root, struct btrfs_path *p,
2345                       struct extent_buffer **eb_ret, int level, int slot,
2346                       struct btrfs_key *key, u64 time_seq)
2347{
2348        u64 blocknr;
2349        u64 gen;
2350        u32 blocksize;
2351        struct extent_buffer *b = *eb_ret;
2352        struct extent_buffer *tmp;
2353        int ret;
2354
2355        blocknr = btrfs_node_blockptr(b, slot);
2356        gen = btrfs_node_ptr_generation(b, slot);
2357        blocksize = btrfs_level_size(root, level - 1);
2358
2359        tmp = btrfs_find_tree_block(root, blocknr, blocksize);
2360        if (tmp) {
2361                /* first we do an atomic uptodate check */
2362                if (btrfs_buffer_uptodate(tmp, 0, 1) > 0) {
2363                        if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2364                                /*
2365                                 * we found an up to date block without
2366                                 * sleeping, return
2367                                 * right away
2368                                 */
2369                                *eb_ret = tmp;
2370                                return 0;
2371                        }
2372                        /* the pages were up to date, but we failed
2373                         * the generation number check.  Do a full
2374                         * read for the generation number that is correct.
2375                         * We must do this without dropping locks so
2376                         * we can trust our generation number
2377                         */
2378                        free_extent_buffer(tmp);
2379                        btrfs_set_path_blocking(p);
2380
2381                        /* now we're allowed to do a blocking uptodate check */
2382                        tmp = read_tree_block(root, blocknr, blocksize, gen);
2383                        if (tmp && btrfs_buffer_uptodate(tmp, gen, 0) > 0) {
2384                                *eb_ret = tmp;
2385                                return 0;
2386                        }
2387                        free_extent_buffer(tmp);
2388                        btrfs_release_path(p);
2389                        return -EIO;
2390                }
2391        }
2392
2393        /*
2394         * reduce lock contention at high levels
2395         * of the btree by dropping locks before
2396         * we read.  Don't release the lock on the current
2397         * level because we need to walk this node to figure
2398         * out which blocks to read.
2399         */
2400        btrfs_unlock_up_safe(p, level + 1);
2401        btrfs_set_path_blocking(p);
2402
2403        free_extent_buffer(tmp);
2404        if (p->reada)
2405                reada_for_search(root, p, level, slot, key->objectid);
2406
2407        btrfs_release_path(p);
2408
2409        ret = -EAGAIN;
2410        tmp = read_tree_block(root, blocknr, blocksize, 0);
2411        if (tmp) {
2412                /*
2413                 * If the read above didn't mark this buffer up to date,
2414                 * it will never end up being up to date.  Set ret to EIO now
2415                 * and give up so that our caller doesn't loop forever
2416                 * on our EAGAINs.
2417                 */
2418                if (!btrfs_buffer_uptodate(tmp, 0, 0))
2419                        ret = -EIO;
2420                free_extent_buffer(tmp);
2421        }
2422        return ret;
2423}
2424
2425/*
2426 * helper function for btrfs_search_slot.  This does all of the checks
2427 * for node-level blocks and does any balancing required based on
2428 * the ins_len.
2429 *
2430 * If no extra work was required, zero is returned.  If we had to
2431 * drop the path, -EAGAIN is returned and btrfs_search_slot must
2432 * start over
2433 */
2434static int
2435setup_nodes_for_search(struct btrfs_trans_handle *trans,
2436                       struct btrfs_root *root, struct btrfs_path *p,
2437                       struct extent_buffer *b, int level, int ins_len,
2438                       int *write_lock_level)
2439{
2440        int ret;
2441        if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2442            BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
2443                int sret;
2444
2445                if (*write_lock_level < level + 1) {
2446                        *write_lock_level = level + 1;
2447                        btrfs_release_path(p);
2448                        goto again;
2449                }
2450
2451                sret = reada_for_balance(root, p, level);
2452                if (sret)
2453                        goto again;
2454
2455                btrfs_set_path_blocking(p);
2456                sret = split_node(trans, root, p, level);
2457                btrfs_clear_path_blocking(p, NULL, 0);
2458
2459                BUG_ON(sret > 0);
2460                if (sret) {
2461                        ret = sret;
2462                        goto done;
2463                }
2464                b = p->nodes[level];
2465        } else if (ins_len < 0 && btrfs_header_nritems(b) <
2466                   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
2467                int sret;
2468
2469                if (*write_lock_level < level + 1) {
2470                        *write_lock_level = level + 1;
2471                        btrfs_release_path(p);
2472                        goto again;
2473                }
2474
2475                sret = reada_for_balance(root, p, level);
2476                if (sret)
2477                        goto again;
2478
2479                btrfs_set_path_blocking(p);
2480                sret = balance_level(trans, root, p, level);
2481                btrfs_clear_path_blocking(p, NULL, 0);
2482
2483                if (sret) {
2484                        ret = sret;
2485                        goto done;
2486                }
2487                b = p->nodes[level];
2488                if (!b) {
2489                        btrfs_release_path(p);
2490                        goto again;
2491                }
2492                BUG_ON(btrfs_header_nritems(b) == 1);
2493        }
2494        return 0;
2495
2496again:
2497        ret = -EAGAIN;
2498done:
2499        return ret;
2500}
2501
2502/*
2503 * look for key in the tree.  path is filled in with nodes along the way
2504 * if key is found, we return zero and you can find the item in the leaf
2505 * level of the path (level 0)
2506 *
2507 * If the key isn't found, the path points to the slot where it should
2508 * be inserted, and 1 is returned.  If there are other errors during the
2509 * search a negative error number is returned.
2510 *
2511 * if ins_len > 0, nodes and leaves will be split as we walk down the
2512 * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2513 * possible)
2514 */
2515int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2516                      *root, struct btrfs_key *key, struct btrfs_path *p, int
2517                      ins_len, int cow)
2518{
2519        struct extent_buffer *b;
2520        int slot;
2521        int ret;
2522        int err;
2523        int level;
2524        int lowest_unlock = 1;
2525        int root_lock;
2526        /* everything at write_lock_level or lower must be write locked */
2527        int write_lock_level = 0;
2528        u8 lowest_level = 0;
2529        int min_write_lock_level;
2530
2531        lowest_level = p->lowest_level;
2532        WARN_ON(lowest_level && ins_len > 0);
2533        WARN_ON(p->nodes[0] != NULL);
2534
2535        if (ins_len < 0) {
2536                lowest_unlock = 2;
2537
2538                /* when we are removing items, we might have to go up to level
2539                 * two as we update tree pointers  Make sure we keep write
2540                 * for those levels as well
2541                 */
2542                write_lock_level = 2;
2543        } else if (ins_len > 0) {
2544                /*
2545                 * for inserting items, make sure we have a write lock on
2546                 * level 1 so we can update keys
2547                 */
2548                write_lock_level = 1;
2549        }
2550
2551        if (!cow)
2552                write_lock_level = -1;
2553
2554        if (cow && (p->keep_locks || p->lowest_level))
2555                write_lock_level = BTRFS_MAX_LEVEL;
2556
2557        min_write_lock_level = write_lock_level;
2558
2559again:
2560        /*
2561         * we try very hard to do read locks on the root
2562         */
2563        root_lock = BTRFS_READ_LOCK;
2564        level = 0;
2565        if (p->search_commit_root) {
2566                /*
2567                 * the commit roots are read only
2568                 * so we always do read locks
2569                 */
2570                b = root->commit_root;
2571                extent_buffer_get(b);
2572                level = btrfs_header_level(b);
2573                if (!p->skip_locking)
2574                        btrfs_tree_read_lock(b);
2575        } else {
2576                if (p->skip_locking) {
2577                        b = btrfs_root_node(root);
2578                        level = btrfs_header_level(b);
2579                } else {
2580                        /* we don't know the level of the root node
2581                         * until we actually have it read locked
2582                         */
2583                        b = btrfs_read_lock_root_node(root);
2584                        level = btrfs_header_level(b);
2585                        if (level <= write_lock_level) {
2586                                /* whoops, must trade for write lock */
2587                                btrfs_tree_read_unlock(b);
2588                                free_extent_buffer(b);
2589                                b = btrfs_lock_root_node(root);
2590                                root_lock = BTRFS_WRITE_LOCK;
2591
2592                                /* the level might have changed, check again */
2593                                level = btrfs_header_level(b);
2594                        }
2595                }
2596        }
2597        p->nodes[level] = b;
2598        if (!p->skip_locking)
2599                p->locks[level] = root_lock;
2600
2601        while (b) {
2602                level = btrfs_header_level(b);
2603
2604                /*
2605                 * setup the path here so we can release it under lock
2606                 * contention with the cow code
2607                 */
2608                if (cow) {
2609                        /*
2610                         * if we don't really need to cow this block
2611                         * then we don't want to set the path blocking,
2612                         * so we test it here
2613                         */
2614                        if (!should_cow_block(trans, root, b))
2615                                goto cow_done;
2616
2617                        btrfs_set_path_blocking(p);
2618
2619                        /*
2620                         * must have write locks on this node and the
2621                         * parent
2622                         */
2623                        if (level > write_lock_level ||
2624                            (level + 1 > write_lock_level &&
2625                            level + 1 < BTRFS_MAX_LEVEL &&
2626                            p->nodes[level + 1])) {
2627                                write_lock_level = level + 1;
2628                                btrfs_release_path(p);
2629                                goto again;
2630                        }
2631
2632                        err = btrfs_cow_block(trans, root, b,
2633                                              p->nodes[level + 1],
2634                                              p->slots[level + 1], &b);
2635                        if (err) {
2636                                ret = err;
2637                                goto done;
2638                        }
2639                }
2640cow_done:
2641                BUG_ON(!cow && ins_len);
2642
2643                p->nodes[level] = b;
2644                btrfs_clear_path_blocking(p, NULL, 0);
2645
2646                /*
2647                 * we have a lock on b and as long as we aren't changing
2648                 * the tree, there is no way to for the items in b to change.
2649                 * It is safe to drop the lock on our parent before we
2650                 * go through the expensive btree search on b.
2651                 *
2652                 * If cow is true, then we might be changing slot zero,
2653                 * which may require changing the parent.  So, we can't
2654                 * drop the lock until after we know which slot we're
2655                 * operating on.
2656                 */
2657                if (!cow)
2658                        btrfs_unlock_up_safe(p, level + 1);
2659
2660                ret = bin_search(b, key, level, &slot);
2661
2662                if (level != 0) {
2663                        int dec = 0;
2664                        if (ret && slot > 0) {
2665                                dec = 1;
2666                                slot -= 1;
2667                        }
2668                        p->slots[level] = slot;
2669                        err = setup_nodes_for_search(trans, root, p, b, level,
2670                                             ins_len, &write_lock_level);
2671                        if (err == -EAGAIN)
2672                                goto again;
2673                        if (err) {
2674                                ret = err;
2675                                goto done;
2676                        }
2677                        b = p->nodes[level];
2678                        slot = p->slots[level];
2679
2680                        /*
2681                         * slot 0 is special, if we change the key
2682                         * we have to update the parent pointer
2683                         * which means we must have a write lock
2684                         * on the parent
2685                         */
2686                        if (slot == 0 && cow &&
2687                            write_lock_level < level + 1) {
2688                                write_lock_level = level + 1;
2689                                btrfs_release_path(p);
2690                                goto again;
2691                        }
2692
2693                        unlock_up(p, level, lowest_unlock,
2694                                  min_write_lock_level, &write_lock_level);
2695
2696                        if (level == lowest_level) {
2697                                if (dec)
2698                                        p->slots[level]++;
2699                                goto done;
2700                        }
2701
2702                        err = read_block_for_search(trans, root, p,
2703                                                    &b, level, slot, key, 0);
2704                        if (err == -EAGAIN)
2705                                goto again;
2706                        if (err) {
2707                                ret = err;
2708                                goto done;
2709                        }
2710
2711                        if (!p->skip_locking) {
2712                                level = btrfs_header_level(b);
2713                                if (level <= write_lock_level) {
2714                                        err = btrfs_try_tree_write_lock(b);
2715                                        if (!err) {
2716                                                btrfs_set_path_blocking(p);
2717                                                btrfs_tree_lock(b);
2718                                                btrfs_clear_path_blocking(p, b,
2719                                                                  BTRFS_WRITE_LOCK);
2720                                        }
2721                                        p->locks[level] = BTRFS_WRITE_LOCK;
2722                                } else {
2723                                        err = btrfs_try_tree_read_lock(b);
2724                                        if (!err) {
2725                                                btrfs_set_path_blocking(p);
2726                                                btrfs_tree_read_lock(b);
2727                                                btrfs_clear_path_blocking(p, b,
2728                                                                  BTRFS_READ_LOCK);
2729                                        }
2730                                        p->locks[level] = BTRFS_READ_LOCK;
2731                                }
2732                                p->nodes[level] = b;
2733                        }
2734                } else {
2735                        p->slots[level] = slot;
2736                        if (ins_len > 0 &&
2737                            btrfs_leaf_free_space(root, b) < ins_len) {
2738                                if (write_lock_level < 1) {
2739                                        write_lock_level = 1;
2740                                        btrfs_release_path(p);
2741                                        goto again;
2742                                }
2743
2744                                btrfs_set_path_blocking(p);
2745                                err = split_leaf(trans, root, key,
2746                                                 p, ins_len, ret == 0);
2747                                btrfs_clear_path_blocking(p, NULL, 0);
2748
2749                                BUG_ON(err > 0);
2750                                if (err) {
2751                                        ret = err;
2752                                        goto done;
2753                                }
2754                        }
2755                        if (!p->search_for_split)
2756                                unlock_up(p, level, lowest_unlock,
2757                                          min_write_lock_level, &write_lock_level);
2758                        goto done;
2759                }
2760        }
2761        ret = 1;
2762done:
2763        /*
2764         * we don't really know what they plan on doing with the path
2765         * from here on, so for now just mark it as blocking
2766         */
2767        if (!p->leave_spinning)
2768                btrfs_set_path_blocking(p);
2769        if (ret < 0)
2770                btrfs_release_path(p);
2771        return ret;
2772}
2773
2774/*
2775 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2776 * current state of the tree together with the operations recorded in the tree
2777 * modification log to search for the key in a previous version of this tree, as
2778 * denoted by the time_seq parameter.
2779 *
2780 * Naturally, there is no support for insert, delete or cow operations.
2781 *
2782 * The resulting path and return value will be set up as if we called
2783 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2784 */
2785int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2786                          struct btrfs_path *p, u64 time_seq)
2787{
2788        struct extent_buffer *b;
2789        int slot;
2790        int ret;
2791        int err;
2792        int level;
2793        int lowest_unlock = 1;
2794        u8 lowest_level = 0;
2795
2796        lowest_level = p->lowest_level;
2797        WARN_ON(p->nodes[0] != NULL);
2798
2799        if (p->search_commit_root) {
2800                BUG_ON(time_seq);
2801                return btrfs_search_slot(NULL, root, key, p, 0, 0);
2802        }
2803
2804again:
2805        b = get_old_root(root, time_seq);
2806        level = btrfs_header_level(b);
2807        p->locks[level] = BTRFS_READ_LOCK;
2808
2809        while (b) {
2810                level = btrfs_header_level(b);
2811                p->nodes[level] = b;
2812                btrfs_clear_path_blocking(p, NULL, 0);
2813
2814                /*
2815                 * we have a lock on b and as long as we aren't changing
2816                 * the tree, there is no way to for the items in b to change.
2817                 * It is safe to drop the lock on our parent before we
2818                 * go through the expensive btree search on b.
2819                 */
2820                btrfs_unlock_up_safe(p, level + 1);
2821
2822                ret = bin_search(b, key, level, &slot);
2823
2824                if (level != 0) {
2825                        int dec = 0;
2826                        if (ret && slot > 0) {
2827                                dec = 1;
2828                                slot -= 1;
2829                        }
2830                        p->slots[level] = slot;
2831                        unlock_up(p, level, lowest_unlock, 0, NULL);
2832
2833                        if (level == lowest_level) {
2834                                if (dec)
2835                                        p->slots[level]++;
2836                                goto done;
2837                        }
2838
2839                        err = read_block_for_search(NULL, root, p, &b, level,
2840                                                    slot, key, time_seq);
2841                        if (err == -EAGAIN)
2842                                goto again;
2843                        if (err) {
2844                                ret = err;
2845                                goto done;
2846                        }
2847
2848                        level = btrfs_header_level(b);
2849                        err = btrfs_try_tree_read_lock(b);
2850                        if (!err) {
2851                                btrfs_set_path_blocking(p);
2852                                btrfs_tree_read_lock(b);
2853                                btrfs_clear_path_blocking(p, b,
2854                                                          BTRFS_READ_LOCK);
2855                        }
2856                        b = tree_mod_log_rewind(root->fs_info, b, time_seq);
2857                        p->locks[level] = BTRFS_READ_LOCK;
2858                        p->nodes[level] = b;
2859                } else {
2860                        p->slots[level] = slot;
2861                        unlock_up(p, level, lowest_unlock, 0, NULL);
2862                        goto done;
2863                }
2864        }
2865        ret = 1;
2866done:
2867        if (!p->leave_spinning)
2868                btrfs_set_path_blocking(p);
2869        if (ret < 0)
2870                btrfs_release_path(p);
2871
2872        return ret;
2873}
2874
2875/*
2876 * helper to use instead of search slot if no exact match is needed but
2877 * instead the next or previous item should be returned.
2878 * When find_higher is true, the next higher item is returned, the next lower
2879 * otherwise.
2880 * When return_any and find_higher are both true, and no higher item is found,
2881 * return the next lower instead.
2882 * When return_any is true and find_higher is false, and no lower item is found,
2883 * return the next higher instead.
2884 * It returns 0 if any item is found, 1 if none is found (tree empty), and
2885 * < 0 on error
2886 */
2887int btrfs_search_slot_for_read(struct btrfs_root *root,
2888                               struct btrfs_key *key, struct btrfs_path *p,
2889                               int find_higher, int return_any)
2890{
2891        int ret;
2892        struct extent_buffer *leaf;
2893
2894again:
2895        ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2896        if (ret <= 0)
2897                return ret;
2898        /*
2899         * a return value of 1 means the path is at the position where the
2900         * item should be inserted. Normally this is the next bigger item,
2901         * but in case the previous item is the last in a leaf, path points
2902         * to the first free slot in the previous leaf, i.e. at an invalid
2903         * item.
2904         */
2905        leaf = p->nodes[0];
2906
2907        if (find_higher) {
2908                if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2909                        ret = btrfs_next_leaf(root, p);
2910                        if (ret <= 0)
2911                                return ret;
2912                        if (!return_any)
2913                                return 1;
2914                        /*
2915                         * no higher item found, return the next
2916                         * lower instead
2917                         */
2918                        return_any = 0;
2919                        find_higher = 0;
2920                        btrfs_release_path(p);
2921                        goto again;
2922                }
2923        } else {
2924                if (p->slots[0] == 0) {
2925                        ret = btrfs_prev_leaf(root, p);
2926                        if (ret < 0)
2927                                return ret;
2928                        if (!ret) {
2929                                p->slots[0] = btrfs_header_nritems(leaf) - 1;
2930                                return 0;
2931                        }
2932                        if (!return_any)
2933                                return 1;
2934                        /*
2935                         * no lower item found, return the next
2936                         * higher instead
2937                         */
2938                        return_any = 0;
2939                        find_higher = 1;
2940                        btrfs_release_path(p);
2941                        goto again;
2942                } else {
2943                        --p->slots[0];
2944                }
2945        }
2946        return 0;
2947}
2948
2949/*
2950 * adjust the pointers going up the tree, starting at level
2951 * making sure the right key of each node is points to 'key'.
2952 * This is used after shifting pointers to the left, so it stops
2953 * fixing up pointers when a given leaf/node is not in slot 0 of the
2954 * higher levels
2955 *
2956 */
2957static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
2958                           struct btrfs_disk_key *key, int level)
2959{
2960        int i;
2961        struct extent_buffer *t;
2962
2963        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2964                int tslot = path->slots[i];
2965                if (!path->nodes[i])
2966                        break;
2967                t = path->nodes[i];
2968                tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
2969                btrfs_set_node_key(t, key, tslot);
2970                btrfs_mark_buffer_dirty(path->nodes[i]);
2971                if (tslot != 0)
2972                        break;
2973        }
2974}
2975
2976/*
2977 * update item key.
2978 *
2979 * This function isn't completely safe. It's the caller's responsibility
2980 * that the new key won't break the order
2981 */
2982void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
2983                             struct btrfs_key *new_key)
2984{
2985        struct btrfs_disk_key disk_key;
2986        struct extent_buffer *eb;
2987        int slot;
2988
2989        eb = path->nodes[0];
2990        slot = path->slots[0];
2991        if (slot > 0) {
2992                btrfs_item_key(eb, &disk_key, slot - 1);
2993                BUG_ON(comp_keys(&disk_key, new_key) >= 0);
2994        }
2995        if (slot < btrfs_header_nritems(eb) - 1) {
2996                btrfs_item_key(eb, &disk_key, slot + 1);
2997                BUG_ON(comp_keys(&disk_key, new_key) <= 0);
2998        }
2999
3000        btrfs_cpu_key_to_disk(&disk_key, new_key);
3001        btrfs_set_item_key(eb, &disk_key, slot);
3002        btrfs_mark_buffer_dirty(eb);
3003        if (slot == 0)
3004                fixup_low_keys(root, path, &disk_key, 1);
3005}
3006
3007/*
3008 * try to push data from one node into the next node left in the
3009 * tree.
3010 *
3011 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3012 * error, and > 0 if there was no room in the left hand block.
3013 */
3014static int push_node_left(struct btrfs_trans_handle *trans,
3015                          struct btrfs_root *root, struct extent_buffer *dst,
3016                          struct extent_buffer *src, int empty)
3017{
3018        int push_items = 0;
3019        int src_nritems;
3020        int dst_nritems;
3021        int ret = 0;
3022
3023        src_nritems = btrfs_header_nritems(src);
3024        dst_nritems = btrfs_header_nritems(dst);
3025        push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3026        WARN_ON(btrfs_header_generation(src) != trans->transid);
3027        WARN_ON(btrfs_header_generation(dst) != trans->transid);
3028
3029        if (!empty && src_nritems <= 8)
3030                return 1;
3031
3032        if (push_items <= 0)
3033                return 1;
3034
3035        if (empty) {
3036                push_items = min(src_nritems, push_items);
3037                if (push_items < src_nritems) {
3038                        /* leave at least 8 pointers in the node if
3039                         * we aren't going to empty it
3040                         */
3041                        if (src_nritems - push_items < 8) {
3042                                if (push_items <= 8)
3043                                        return 1;
3044                                push_items -= 8;
3045                        }
3046                }
3047        } else
3048                push_items = min(src_nritems - 8, push_items);
3049
3050        tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
3051                             push_items);
3052        copy_extent_buffer(dst, src,
3053                           btrfs_node_key_ptr_offset(dst_nritems),
3054                           btrfs_node_key_ptr_offset(0),
3055                           push_items * sizeof(struct btrfs_key_ptr));
3056
3057        if (push_items < src_nritems) {
3058                /*
3059                 * don't call tree_mod_log_eb_move here, key removal was already
3060                 * fully logged by tree_mod_log_eb_copy above.
3061                 */
3062                memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3063                                      btrfs_node_key_ptr_offset(push_items),
3064                                      (src_nritems - push_items) *
3065                                      sizeof(struct btrfs_key_ptr));
3066        }
3067        btrfs_set_header_nritems(src, src_nritems - push_items);
3068        btrfs_set_header_nritems(dst, dst_nritems + push_items);
3069        btrfs_mark_buffer_dirty(src);
3070        btrfs_mark_buffer_dirty(dst);
3071
3072        return ret;
3073}
3074
3075/*
3076 * try to push data from one node into the next node right in the
3077 * tree.
3078 *
3079 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3080 * error, and > 0 if there was no room in the right hand block.
3081 *
3082 * this will  only push up to 1/2 the contents of the left node over
3083 */
3084static int balance_node_right(struct btrfs_trans_handle *trans,
3085                              struct btrfs_root *root,
3086                              struct extent_buffer *dst,
3087                              struct extent_buffer *src)
3088{
3089        int push_items = 0;
3090        int max_push;
3091        int src_nritems;
3092        int dst_nritems;
3093        int ret = 0;
3094
3095        WARN_ON(btrfs_header_generation(src) != trans->transid);
3096        WARN_ON(btrfs_header_generation(dst) != trans->transid);
3097
3098        src_nritems = btrfs_header_nritems(src);
3099        dst_nritems = btrfs_header_nritems(dst);
3100        push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3101        if (push_items <= 0)
3102                return 1;
3103
3104        if (src_nritems < 4)
3105                return 1;
3106
3107        max_push = src_nritems / 2 + 1;
3108        /* don't try to empty the node */
3109        if (max_push >= src_nritems)
3110                return 1;
3111
3112        if (max_push < push_items)
3113                push_items = max_push;
3114
3115        tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
3116        memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3117                                      btrfs_node_key_ptr_offset(0),
3118                                      (dst_nritems) *
3119                                      sizeof(struct btrfs_key_ptr));
3120
3121        tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
3122                             src_nritems - push_items, push_items);
3123        copy_extent_buffer(dst, src,
3124                           btrfs_node_key_ptr_offset(0),
3125                           btrfs_node_key_ptr_offset(src_nritems - push_items),
3126                           push_items * sizeof(struct btrfs_key_ptr));
3127
3128        btrfs_set_header_nritems(src, src_nritems - push_items);
3129        btrfs_set_header_nritems(dst, dst_nritems + push_items);
3130
3131        btrfs_mark_buffer_dirty(src);
3132        btrfs_mark_buffer_dirty(dst);
3133
3134        return ret;
3135}
3136
3137/*
3138 * helper function to insert a new root level in the tree.
3139 * A new node is allocated, and a single item is inserted to
3140 * point to the existing root
3141 *
3142 * returns zero on success or < 0 on failure.
3143 */
3144static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3145                           struct btrfs_root *root,
3146                           struct btrfs_path *path, int level, int log_removal)
3147{
3148        u64 lower_gen;
3149        struct extent_buffer *lower;
3150        struct extent_buffer *c;
3151        struct extent_buffer *old;
3152        struct btrfs_disk_key lower_key;
3153
3154        BUG_ON(path->nodes[level]);
3155        BUG_ON(path->nodes[level-1] != root->node);
3156
3157        lower = path->nodes[level-1];
3158        if (level == 1)
3159                btrfs_item_key(lower, &lower_key, 0);
3160        else
3161                btrfs_node_key(lower, &lower_key, 0);
3162
3163        c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
3164                                   root->root_key.objectid, &lower_key,
3165                                   level, root->node->start, 0);
3166        if (IS_ERR(c))
3167                return PTR_ERR(c);
3168
3169        root_add_used(root, root->nodesize);
3170
3171        memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
3172        btrfs_set_header_nritems(c, 1);
3173        btrfs_set_header_level(c, level);
3174        btrfs_set_header_bytenr(c, c->start);
3175        btrfs_set_header_generation(c, trans->transid);
3176        btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
3177        btrfs_set_header_owner(c, root->root_key.objectid);
3178
3179        write_extent_buffer(c, root->fs_info->fsid,
3180                            (unsigned long)btrfs_header_fsid(c),
3181                            BTRFS_FSID_SIZE);
3182
3183        write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
3184                            (unsigned long)btrfs_header_chunk_tree_uuid(c),
3185                            BTRFS_UUID_SIZE);
3186
3187        btrfs_set_node_key(c, &lower_key, 0);
3188        btrfs_set_node_blockptr(c, 0, lower->start);
3189        lower_gen = btrfs_header_generation(lower);
3190        WARN_ON(lower_gen != trans->transid);
3191
3192        btrfs_set_node_ptr_generation(c, 0, lower_gen);
3193
3194        btrfs_mark_buffer_dirty(c);
3195
3196        old = root->node;
3197        tree_mod_log_set_root_pointer(root, c, log_removal);
3198        rcu_assign_pointer(root->node, c);
3199
3200        /* the super has an extra ref to root->node */
3201        free_extent_buffer(old);
3202
3203        add_root_to_dirty_list(root);
3204        extent_buffer_get(c);
3205        path->nodes[level] = c;
3206        path->locks[level] = BTRFS_WRITE_LOCK;
3207        path->slots[level] = 0;
3208        return 0;
3209}
3210
3211/*
3212 * worker function to insert a single pointer in a node.
3213 * the node should have enough room for the pointer already
3214 *
3215 * slot and level indicate where you want the key to go, and
3216 * blocknr is the block the key points to.
3217 */
3218static void insert_ptr(struct btrfs_trans_handle *trans,
3219                       struct btrfs_root *root, struct btrfs_path *path,
3220                       struct btrfs_disk_key *key, u64 bytenr,
3221                       int slot, int level)
3222{
3223        struct extent_buffer *lower;
3224        int nritems;
3225        int ret;
3226
3227        BUG_ON(!path->nodes[level]);
3228        btrfs_assert_tree_locked(path->nodes[level]);
3229        lower = path->nodes[level];
3230        nritems = btrfs_header_nritems(lower);
3231        BUG_ON(slot > nritems);
3232        BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
3233        if (slot != nritems) {
3234                if (level)
3235                        tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
3236                                             slot, nritems - slot);
3237                memmove_extent_buffer(lower,
3238                              btrfs_node_key_ptr_offset(slot + 1),
3239                              btrfs_node_key_ptr_offset(slot),
3240                              (nritems - slot) * sizeof(struct btrfs_key_ptr));
3241        }
3242        if (level) {
3243                ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
3244                                              MOD_LOG_KEY_ADD);
3245                BUG_ON(ret < 0);
3246        }
3247        btrfs_set_node_key(lower, key, slot);
3248        btrfs_set_node_blockptr(lower, slot, bytenr);
3249        WARN_ON(trans->transid == 0);
3250        btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3251        btrfs_set_header_nritems(lower, nritems + 1);
3252        btrfs_mark_buffer_dirty(lower);
3253}
3254
3255/*
3256 * split the node at the specified level in path in two.
3257 * The path is corrected to point to the appropriate node after the split
3258 *
3259 * Before splitting this tries to make some room in the node by pushing
3260 * left and right, if either one works, it returns right away.
3261 *
3262 * returns 0 on success and < 0 on failure
3263 */
3264static noinline int split_node(struct btrfs_trans_handle *trans,
3265                               struct btrfs_root *root,
3266                               struct btrfs_path *path, int level)
3267{
3268        struct extent_buffer *c;
3269        struct extent_buffer *split;
3270        struct btrfs_disk_key disk_key;
3271        int mid;
3272        int ret;
3273        u32 c_nritems;
3274
3275        c = path->nodes[level];
3276        WARN_ON(btrfs_header_generation(c) != trans->transid);
3277        if (c == root->node) {
3278                /*
3279                 * trying to split the root, lets make a new one
3280                 *
3281                 * tree mod log: We pass 0 as log_removal parameter to
3282                 * insert_new_root, because that root buffer will be kept as a
3283                 * normal node. We are going to log removal of half of the
3284                 * elements below with tree_mod_log_eb_copy. We're holding a
3285                 * tree lock on the buffer, which is why we cannot race with
3286                 * other tree_mod_log users.
3287                 */
3288                ret = insert_new_root(trans, root, path, level + 1, 0);
3289                if (ret)
3290                        return ret;
3291        } else {
3292                ret = push_nodes_for_insert(trans, root, path, level);
3293                c = path->nodes[level];
3294                if (!ret && btrfs_header_nritems(c) <
3295                    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
3296                        return 0;
3297                if (ret < 0)
3298                        return ret;
3299        }
3300
3301        c_nritems = btrfs_header_nritems(c);
3302        mid = (c_nritems + 1) / 2;
3303        btrfs_node_key(c, &disk_key, mid);
3304
3305        split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
3306                                        root->root_key.objectid,
3307                                        &disk_key, level, c->start, 0);
3308        if (IS_ERR(split))
3309                return PTR_ERR(split);
3310
3311        root_add_used(root, root->nodesize);
3312
3313        memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
3314        btrfs_set_header_level(split, btrfs_header_level(c));
3315        btrfs_set_header_bytenr(split, split->start);
3316        btrfs_set_header_generation(split, trans->transid);
3317        btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
3318        btrfs_set_header_owner(split, root->root_key.objectid);
3319        write_extent_buffer(split, root->fs_info->fsid,
3320                            (unsigned long)btrfs_header_fsid(split),
3321                            BTRFS_FSID_SIZE);
3322        write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
3323                            (unsigned long)btrfs_header_chunk_tree_uuid(split),
3324                            BTRFS_UUID_SIZE);
3325
3326        tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
3327        copy_extent_buffer(split, c,
3328                           btrfs_node_key_ptr_offset(0),
3329                           btrfs_node_key_ptr_offset(mid),
3330                           (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3331        btrfs_set_header_nritems(split, c_nritems - mid);
3332        btrfs_set_header_nritems(c, mid);
3333        ret = 0;
3334
3335        btrfs_mark_buffer_dirty(c);
3336        btrfs_mark_buffer_dirty(split);
3337
3338        insert_ptr(trans, root, path, &disk_key, split->start,
3339                   path->slots[level + 1] + 1, level + 1);
3340
3341        if (path->slots[level] >= mid) {
3342                path->slots[level] -= mid;
3343                btrfs_tree_unlock(c);
3344                free_extent_buffer(c);
3345                path->nodes[level] = split;
3346                path->slots[level + 1] += 1;
3347        } else {
3348                btrfs_tree_unlock(split);
3349                free_extent_buffer(split);
3350        }
3351        return ret;
3352}
3353
3354/*
3355 * how many bytes are required to store the items in a leaf.  start
3356 * and nr indicate which items in the leaf to check.  This totals up the
3357 * space used both by the item structs and the item data
3358 */
3359static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3360{
3361        struct btrfs_item *start_item;
3362        struct btrfs_item *end_item;
3363        struct btrfs_map_token token;
3364        int data_len;
3365        int nritems = btrfs_header_nritems(l);
3366        int end = min(nritems, start + nr) - 1;
3367
3368        if (!nr)
3369                return 0;
3370        btrfs_init_map_token(&token);
3371        start_item = btrfs_item_nr(l, start);
3372        end_item = btrfs_item_nr(l, end);
3373        data_len = btrfs_token_item_offset(l, start_item, &token) +
3374                btrfs_token_item_size(l, start_item, &token);
3375        data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3376        data_len += sizeof(struct btrfs_item) * nr;
3377        WARN_ON(data_len < 0);
3378        return data_len;
3379}
3380
3381/*
3382 * The space between the end of the leaf items and
3383 * the start of the leaf data.  IOW, how much room
3384 * the leaf has left for both items and data
3385 */
3386noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3387                                   struct extent_buffer *leaf)
3388{
3389        int nritems = btrfs_header_nritems(leaf);
3390        int ret;
3391        ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3392        if (ret < 0) {
3393                printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
3394                       "used %d nritems %d\n",
3395                       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3396                       leaf_space_used(leaf, 0, nritems), nritems);
3397        }
3398        return ret;
3399}
3400
3401/*
3402 * min slot controls the lowest index we're willing to push to the
3403 * right.  We'll push up to and including min_slot, but no lower
3404 */
3405static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3406                                      struct btrfs_root *root,
3407                                      struct btrfs_path *path,
3408                                      int data_size, int empty,
3409                                      struct extent_buffer *right,
3410                                      int free_space, u32 left_nritems,
3411                                      u32 min_slot)
3412{
3413        struct extent_buffer *left = path->nodes[0];
3414        struct extent_buffer *upper = path->nodes[1];
3415        struct btrfs_map_token token;
3416        struct btrfs_disk_key disk_key;
3417        int slot;
3418        u32 i;
3419        int push_space = 0;
3420        int push_items = 0;
3421        struct btrfs_item *item;
3422        u32 nr;
3423        u32 right_nritems;
3424        u32 data_end;
3425        u32 this_item_size;
3426
3427        btrfs_init_map_token(&token);
3428
3429        if (empty)
3430                nr = 0;
3431        else
3432                nr = max_t(u32, 1, min_slot);
3433
3434        if (path->slots[0] >= left_nritems)
3435                push_space += data_size;
3436
3437        slot = path->slots[1];
3438        i = left_nritems - 1;
3439        while (i >= nr) {
3440                item = btrfs_item_nr(left, i);
3441
3442                if (!empty && push_items > 0) {
3443                        if (path->slots[0] > i)
3444                                break;
3445                        if (path->slots[0] == i) {
3446                                int space = btrfs_leaf_free_space(root, left);
3447                                if (space + push_space * 2 > free_space)
3448                                        break;
3449                        }
3450                }
3451
3452                if (path->slots[0] == i)
3453                        push_space += data_size;
3454
3455                this_item_size = btrfs_item_size(left, item);
3456                if (this_item_size + sizeof(*item) + push_space > free_space)
3457                        break;
3458
3459                push_items++;
3460                push_space += this_item_size + sizeof(*item);
3461                if (i == 0)
3462                        break;
3463                i--;
3464        }
3465
3466        if (push_items == 0)
3467                goto out_unlock;
3468
3469        WARN_ON(!empty && push_items == left_nritems);
3470
3471        /* push left to right */
3472        right_nritems = btrfs_header_nritems(right);
3473
3474        push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3475        push_space -= leaf_data_end(root, left);
3476
3477        /* make room in the right data area */
3478        data_end = leaf_data_end(root, right);
3479        memmove_extent_buffer(right,
3480                              btrfs_leaf_data(right) + data_end - push_space,
3481                              btrfs_leaf_data(right) + data_end,
3482                              BTRFS_LEAF_DATA_SIZE(root) - data_end);
3483
3484        /* copy from the left data area */
3485        copy_extent_buffer(right, left, btrfs_leaf_data(right) +
3486                     BTRFS_LEAF_DATA_SIZE(root) - push_space,
3487                     btrfs_leaf_data(left) + leaf_data_end(root, left),
3488                     push_space);
3489
3490        memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3491                              btrfs_item_nr_offset(0),
3492                              right_nritems * sizeof(struct btrfs_item));
3493
3494        /* copy the items from left to right */
3495        copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3496                   btrfs_item_nr_offset(left_nritems - push_items),
3497                   push_items * sizeof(struct btrfs_item));
3498
3499        /* update the item pointers */
3500        right_nritems += push_items;
3501        btrfs_set_header_nritems(right, right_nritems);
3502        push_space = BTRFS_LEAF_DATA_SIZE(root);
3503        for (i = 0; i < right_nritems; i++) {
3504                item = btrfs_item_nr(right, i);
3505                push_space -= btrfs_token_item_size(right, item, &token);
3506                btrfs_set_token_item_offset(right, item, push_space, &token);
3507        }
3508
3509        left_nritems -= push_items;
3510        btrfs_set_header_nritems(left, left_nritems);
3511
3512        if (left_nritems)
3513                btrfs_mark_buffer_dirty(left);
3514        else
3515                clean_tree_block(trans, root, left);
3516
3517        btrfs_mark_buffer_dirty(right);
3518
3519        btrfs_item_key(right, &disk_key, 0);
3520        btrfs_set_node_key(upper, &disk_key, slot + 1);
3521        btrfs_mark_buffer_dirty(upper);
3522
3523        /* then fixup the leaf pointer in the path */
3524        if (path->slots[0] >= left_nritems) {
3525                path->slots[0] -= left_nritems;
3526                if (btrfs_header_nritems(path->nodes[0]) == 0)
3527                        clean_tree_block(trans, root, path->nodes[0]);
3528                btrfs_tree_unlock(path->nodes[0]);
3529                free_extent_buffer(path->nodes[0]);
3530                path->nodes[0] = right;
3531                path->slots[1] += 1;
3532        } else {
3533                btrfs_tree_unlock(right);
3534                free_extent_buffer(right);
3535        }
3536        return 0;
3537
3538out_unlock:
3539        btrfs_tree_unlock(right);
3540        free_extent_buffer(right);
3541        return 1;
3542}
3543
3544/*
3545 * push some data in the path leaf to the right, trying to free up at
3546 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3547 *
3548 * returns 1 if the push failed because the other node didn't have enough
3549 * room, 0 if everything worked out and < 0 if there were major errors.
3550 *
3551 * this will push starting from min_slot to the end of the leaf.  It won't
3552 * push any slot lower than min_slot
3553 */
3554static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3555                           *root, struct btrfs_path *path,
3556                           int min_data_size, int data_size,
3557                           int empty, u32 min_slot)
3558{
3559        struct extent_buffer *left = path->nodes[0];
3560        struct extent_buffer *right;
3561        struct extent_buffer *upper;
3562        int slot;
3563        int free_space;
3564        u32 left_nritems;
3565        int ret;
3566
3567        if (!path->nodes[1])
3568                return 1;
3569
3570        slot = path->slots[1];
3571        upper = path->nodes[1];
3572        if (slot >= btrfs_header_nritems(upper) - 1)
3573                return 1;
3574
3575        btrfs_assert_tree_locked(path->nodes[1]);
3576
3577        right = read_node_slot(root, upper, slot + 1);
3578        if (right == NULL)
3579                return 1;
3580
3581        btrfs_tree_lock(right);
3582        btrfs_set_lock_blocking(right);
3583
3584        free_space = btrfs_leaf_free_space(root, right);
3585        if (free_space < data_size)
3586                goto out_unlock;
3587
3588        /* cow and double check */
3589        ret = btrfs_cow_block(trans, root, right, upper,
3590                              slot + 1, &right);
3591        if (ret)
3592                goto out_unlock;
3593
3594        free_space = btrfs_leaf_free_space(root, right);
3595        if (free_space < data_size)
3596                goto out_unlock;
3597
3598        left_nritems = btrfs_header_nritems(left);
3599        if (left_nritems == 0)
3600                goto out_unlock;
3601
3602        return __push_leaf_right(trans, root, path, min_data_size, empty,
3603                                right, free_space, left_nritems, min_slot);
3604out_unlock:
3605        btrfs_tree_unlock(right);
3606        free_extent_buffer(right);
3607        return 1;
3608}
3609
3610/*
3611 * push some data in the path leaf to the left, trying to free up at
3612 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3613 *
3614 * max_slot can put a limit on how far into the leaf we'll push items.  The
3615 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3616 * items
3617 */
3618static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3619                                     struct btrfs_root *root,
3620                                     struct btrfs_path *path, int data_size,
3621                                     int empty, struct extent_buffer *left,
3622                                     int free_space, u32 right_nritems,
3623                                     u32 max_slot)
3624{
3625        struct btrfs_disk_key disk_key;
3626        struct extent_buffer *right = path->nodes[0];
3627        int i;
3628        int push_space = 0;
3629        int push_items = 0;
3630        struct btrfs_item *item;
3631        u32 old_left_nritems;
3632        u32 nr;
3633        int ret = 0;
3634        u32 this_item_size;
3635        u32 old_left_item_size;
3636        struct btrfs_map_token token;
3637
3638        btrfs_init_map_token(&token);
3639
3640        if (empty)
3641                nr = min(right_nritems, max_slot);
3642        else
3643                nr = min(right_nritems - 1, max_slot);
3644
3645        for (i = 0; i < nr; i++) {
3646                item = btrfs_item_nr(right, i);
3647
3648                if (!empty && push_items > 0) {
3649                        if (path->slots[0] < i)
3650                                break;
3651                        if (path->slots[0] == i) {
3652                                int space = btrfs_leaf_free_space(root, right);
3653                                if (space + push_space * 2 > free_space)
3654                                        break;
3655                        }
3656                }
3657
3658                if (path->slots[0] == i)
3659                        push_space += data_size;
3660
3661                this_item_size = btrfs_item_size(right, item);
3662                if (this_item_size + sizeof(*item) + push_space > free_space)
3663                        break;
3664
3665                push_items++;
3666                push_space += this_item_size + sizeof(*item);
3667        }
3668
3669        if (push_items == 0) {
3670                ret = 1;
3671                goto out;
3672        }
3673        if (!empty && push_items == btrfs_header_nritems(right))
3674                WARN_ON(1);
3675
3676        /* push data from right to left */
3677        copy_extent_buffer(left, right,
3678                           btrfs_item_nr_offset(btrfs_header_nritems(left)),
3679                           btrfs_item_nr_offset(0),
3680                           push_items * sizeof(struct btrfs_item));
3681
3682        push_space = BTRFS_LEAF_DATA_SIZE(root) -
3683                     btrfs_item_offset_nr(right, push_items - 1);
3684
3685        copy_extent_buffer(left, right, btrfs_leaf_data(left) +
3686                     leaf_data_end(root, left) - push_space,
3687                     btrfs_leaf_data(right) +
3688                     btrfs_item_offset_nr(right, push_items - 1),
3689                     push_space);
3690        old_left_nritems = btrfs_header_nritems(left);
3691        BUG_ON(old_left_nritems <= 0);
3692
3693        old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3694        for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3695                u32 ioff;
3696
3697                item = btrfs_item_nr(left, i);
3698
3699                ioff = btrfs_token_item_offset(left, item, &token);
3700                btrfs_set_token_item_offset(left, item,
3701                      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
3702                      &token);
3703        }
3704        btrfs_set_header_nritems(left, old_left_nritems + push_items);
3705
3706        /* fixup right node */
3707        if (push_items > right_nritems)
3708                WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3709                       right_nritems);
3710
3711        if (push_items < right_nritems) {
3712                push_space = btrfs_item_offset_nr(right, push_items - 1) -
3713                                                  leaf_data_end(root, right);
3714                memmove_extent_buffer(right, btrfs_leaf_data(right) +
3715                                      BTRFS_LEAF_DATA_SIZE(root) - push_space,
3716                                      btrfs_leaf_data(right) +
3717                                      leaf_data_end(root, right), push_space);
3718
3719                memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3720                              btrfs_item_nr_offset(push_items),
3721                             (btrfs_header_nritems(right) - push_items) *
3722                             sizeof(struct btrfs_item));
3723        }
3724        right_nritems -= push_items;
3725        btrfs_set_header_nritems(right, right_nritems);
3726        push_space = BTRFS_LEAF_DATA_SIZE(root);
3727        for (i = 0; i < right_nritems; i++) {
3728                item = btrfs_item_nr(right, i);
3729
3730                push_space = push_space - btrfs_token_item_size(right,
3731                                                                item, &token);
3732                btrfs_set_token_item_offset(right, item, push_space, &token);
3733        }
3734
3735        btrfs_mark_buffer_dirty(left);
3736        if (right_nritems)
3737                btrfs_mark_buffer_dirty(right);
3738        else
3739                clean_tree_block(trans, root, right);
3740
3741        btrfs_item_key(right, &disk_key, 0);
3742        fixup_low_keys(root, path, &disk_key, 1);
3743
3744        /* then fixup the leaf pointer in the path */
3745        if (path->slots[0] < push_items) {
3746                path->slots[0] += old_left_nritems;
3747                btrfs_tree_unlock(path->nodes[0]);
3748                free_extent_buffer(path->nodes[0]);
3749                path->nodes[0] = left;
3750                path->slots[1] -= 1;
3751        } else {
3752                btrfs_tree_unlock(left);
3753                free_extent_buffer(left);
3754                path->slots[0] -= push_items;
3755        }
3756        BUG_ON(path->slots[0] < 0);
3757        return ret;
3758out:
3759        btrfs_tree_unlock(left);
3760        free_extent_buffer(left);
3761        return ret;
3762}
3763
3764/*
3765 * push some data in the path leaf to the left, trying to free up at
3766 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3767 *
3768 * max_slot can put a limit on how far into the leaf we'll push items.  The
3769 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3770 * items
3771 */
3772static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3773                          *root, struct btrfs_path *path, int min_data_size,
3774                          int data_size, int empty, u32 max_slot)
3775{
3776        struct extent_buffer *right = path->nodes[0];
3777        struct extent_buffer *left;
3778        int slot;
3779        int free_space;
3780        u32 right_nritems;
3781        int ret = 0;
3782
3783        slot = path->slots[1];
3784        if (slot == 0)
3785                return 1;
3786        if (!path->nodes[1])
3787                return 1;
3788
3789        right_nritems = btrfs_header_nritems(right);
3790        if (right_nritems == 0)
3791                return 1;
3792
3793        btrfs_assert_tree_locked(path->nodes[1]);
3794
3795        left = read_node_slot(root, path->nodes[1], slot - 1);
3796        if (left == NULL)
3797                return 1;
3798
3799        btrfs_tree_lock(left);
3800        btrfs_set_lock_blocking(left);
3801
3802        free_space = btrfs_leaf_free_space(root, left);
3803        if (free_space < data_size) {
3804                ret = 1;
3805                goto out;
3806        }
3807
3808        /* cow and double check */
3809        ret = btrfs_cow_block(trans, root, left,
3810                              path->nodes[1], slot - 1, &left);
3811        if (ret) {
3812                /* we hit -ENOSPC, but it isn't fatal here */
3813                if (ret == -ENOSPC)
3814                        ret = 1;
3815                goto out;
3816        }
3817
3818        free_space = btrfs_leaf_free_space(root, left);
3819        if (free_space < data_size) {
3820                ret = 1;
3821                goto out;
3822        }
3823
3824        return __push_leaf_left(trans, root, path, min_data_size,
3825                               empty, left, free_space, right_nritems,
3826                               max_slot);
3827out:
3828        btrfs_tree_unlock(left);
3829        free_extent_buffer(left);
3830        return ret;
3831}
3832
3833/*
3834 * split the path's leaf in two, making sure there is at least data_size
3835 * available for the resulting leaf level of the path.
3836 */
3837static noinline void copy_for_split(struct btrfs_trans_handle *trans,
3838                                    struct btrfs_root *root,
3839                                    struct btrfs_path *path,
3840                                    struct extent_buffer *l,
3841                                    struct extent_buffer *right,
3842                                    int slot, int mid, int nritems)
3843{
3844        int data_copy_size;
3845        int rt_data_off;
3846        int i;
3847        struct btrfs_disk_key disk_key;
3848        struct btrfs_map_token token;
3849
3850        btrfs_init_map_token(&token);
3851
3852        nritems = nritems - mid;
3853        btrfs_set_header_nritems(right, nritems);
3854        data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
3855
3856        copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
3857                           btrfs_item_nr_offset(mid),
3858                           nritems * sizeof(struct btrfs_item));
3859
3860        copy_extent_buffer(right, l,
3861                     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
3862                     data_copy_size, btrfs_leaf_data(l) +
3863                     leaf_data_end(root, l), data_copy_size);
3864
3865        rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
3866                      btrfs_item_end_nr(l, mid);
3867
3868        for (i = 0; i < nritems; i++) {
3869                struct btrfs_item *item = btrfs_item_nr(right, i);
3870                u32 ioff;
3871
3872                ioff = btrfs_token_item_offset(right, item, &token);
3873                btrfs_set_token_item_offset(right, item,
3874                                            ioff + rt_data_off, &token);
3875        }
3876
3877        btrfs_set_header_nritems(l, mid);
3878        btrfs_item_key(right, &disk_key, 0);
3879        insert_ptr(trans, root, path, &disk_key, right->start,
3880                   path->slots[1] + 1, 1);
3881
3882        btrfs_mark_buffer_dirty(right);
3883        btrfs_mark_buffer_dirty(l);
3884        BUG_ON(path->slots[0] != slot);
3885
3886        if (mid <= slot) {
3887                btrfs_tree_unlock(path->nodes[0]);
3888                free_extent_buffer(path->nodes[0]);
3889                path->nodes[0] = right;
3890                path->slots[0] -= mid;
3891                path->slots[1] += 1;
3892        } else {
3893                btrfs_tree_unlock(right);
3894                free_extent_buffer(right);
3895        }
3896
3897        BUG_ON(path->slots[0] < 0);
3898}
3899
3900/*
3901 * double splits happen when we need to insert a big item in the middle
3902 * of a leaf.  A double split can leave us with 3 mostly empty leaves:
3903 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3904 *          A                 B                 C
3905 *
3906 * We avoid this by trying to push the items on either side of our target
3907 * into the adjacent leaves.  If all goes well we can avoid the double split
3908 * completely.
3909 */
3910static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3911                                          struct btrfs_root *root,
3912                                          struct btrfs_path *path,
3913                                          int data_size)
3914{
3915        int ret;
3916        int progress = 0;
3917        int slot;
3918        u32 nritems;
3919
3920        slot = path->slots[0];
3921
3922        /*
3923         * try to push all the items after our slot into the
3924         * right leaf
3925         */
3926        ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
3927        if (ret < 0)
3928                return ret;
3929
3930        if (ret == 0)
3931                progress++;
3932
3933        nritems = btrfs_header_nritems(path->nodes[0]);
3934        /*
3935         * our goal is to get our slot at the start or end of a leaf.  If
3936         * we've done so we're done
3937         */
3938        if (path->slots[0] == 0 || path->slots[0] == nritems)
3939                return 0;
3940
3941        if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
3942                return 0;
3943
3944        /* try to push all the items before our slot into the next leaf */
3945        slot = path->slots[0];
3946        ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
3947        if (ret < 0)
3948                return ret;
3949
3950        if (ret == 0)
3951                progress++;
3952
3953        if (progress)
3954                return 0;
3955        return 1;
3956}
3957
3958/*
3959 * split the path's leaf in two, making sure there is at least data_size
3960 * available for the resulting leaf level of the path.
3961 *
3962 * returns 0 if all went well and < 0 on failure.
3963 */
3964static noinline int split_leaf(struct btrfs_trans_handle *trans,
3965                               struct btrfs_root *root,
3966                               struct btrfs_key *ins_key,
3967                               struct btrfs_path *path, int data_size,
3968                               int extend)
3969{
3970        struct btrfs_disk_key disk_key;
3971        struct extent_buffer *l;
3972        u32 nritems;
3973        int mid;
3974        int slot;
3975        struct extent_buffer *right;
3976        int ret = 0;
3977        int wret;
3978        int split;
3979        int num_doubles = 0;
3980        int tried_avoid_double = 0;
3981
3982        l = path->nodes[0];
3983        slot = path->slots[0];
3984        if (extend && data_size + btrfs_item_size_nr(l, slot) +
3985            sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
3986                return -EOVERFLOW;
3987
3988        /* first try to make some room by pushing left and right */
3989        if (data_size) {
3990                wret = push_leaf_right(trans, root, path, data_size,
3991                                       data_size, 0, 0);
3992                if (wret < 0)
3993                        return wret;
3994                if (wret) {
3995                        wret = push_leaf_left(trans, root, path, data_size,
3996                                              data_size, 0, (u32)-1);
3997                        if (wret < 0)
3998                                return wret;
3999                }
4000                l = path->nodes[0];
4001
4002                /* did the pushes work? */
4003                if (btrfs_leaf_free_space(root, l) >= data_size)
4004                        return 0;
4005        }
4006
4007        if (!path->nodes[1]) {
4008                ret = insert_new_root(trans, root, path, 1, 1);
4009                if (ret)
4010                        return ret;
4011        }
4012again:
4013        split = 1;
4014        l = path->nodes[0];
4015        slot = path->slots[0];
4016        nritems = btrfs_header_nritems(l);
4017        mid = (nritems + 1) / 2;
4018
4019        if (mid <= slot) {
4020                if (nritems == 1 ||
4021                    leaf_space_used(l, mid, nritems - mid) + data_size >
4022                        BTRFS_LEAF_DATA_SIZE(root)) {
4023                        if (slot >= nritems) {
4024                                split = 0;
4025                        } else {
4026                                mid = slot;
4027                                if (mid != nritems &&
4028                                    leaf_space_used(l, mid, nritems - mid) +
4029                                    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4030                                        if (data_size && !tried_avoid_double)
4031                                                goto push_for_double;
4032                                        split = 2;
4033                                }
4034                        }
4035                }
4036        } else {
4037                if (leaf_space_used(l, 0, mid) + data_size >
4038                        BTRFS_LEAF_DATA_SIZE(root)) {
4039                        if (!extend && data_size && slot == 0) {
4040                                split = 0;
4041                        } else if ((extend || !data_size) && slot == 0) {
4042                                mid = 1;
4043                        } else {
4044                                mid = slot;
4045                                if (mid != nritems &&
4046                                    leaf_space_used(l, mid, nritems - mid) +
4047                                    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4048                                        if (data_size && !tried_avoid_double)
4049                                                goto push_for_double;
4050                                        split = 2 ;
4051                                }
4052                        }
4053                }
4054        }
4055
4056        if (split == 0)
4057                btrfs_cpu_key_to_disk(&disk_key, ins_key);
4058        else
4059                btrfs_item_key(l, &disk_key, mid);
4060
4061        right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
4062                                        root->root_key.objectid,
4063                                        &disk_key, 0, l->start, 0);
4064        if (IS_ERR(right))
4065                return PTR_ERR(right);
4066
4067        root_add_used(root, root->leafsize);
4068
4069        memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
4070        btrfs_set_header_bytenr(right, right->start);
4071        btrfs_set_header_generation(right, trans->transid);
4072        btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
4073        btrfs_set_header_owner(right, root->root_key.objectid);
4074        btrfs_set_header_level(right, 0);
4075        write_extent_buffer(right, root->fs_info->fsid,
4076                            (unsigned long)btrfs_header_fsid(right),
4077                            BTRFS_FSID_SIZE);
4078
4079        write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
4080                            (unsigned long)btrfs_header_chunk_tree_uuid(right),
4081                            BTRFS_UUID_SIZE);
4082
4083        if (split == 0) {
4084                if (mid <= slot) {
4085                        btrfs_set_header_nritems(right, 0);
4086                        insert_ptr(trans, root, path, &disk_key, right->start,
4087                                   path->slots[1] + 1, 1);
4088                        btrfs_tree_unlock(path->nodes[0]);
4089                        free_extent_buffer(path->nodes[0]);
4090                        path->nodes[0] = right;
4091                        path->slots[0] = 0;
4092                        path->slots[1] += 1;
4093                } else {
4094                        btrfs_set_header_nritems(right, 0);
4095                        insert_ptr(trans, root, path, &disk_key, right->start,
4096                                          path->slots[1], 1);
4097                        btrfs_tree_unlock(path->nodes[0]);
4098                        free_extent_buffer(path->nodes[0]);
4099                        path->nodes[0] = right;
4100                        path->slots[0] = 0;
4101                        if (path->slots[1] == 0)
4102                                fixup_low_keys(root, path, &disk_key, 1);
4103                }
4104                btrfs_mark_buffer_dirty(right);
4105                return ret;
4106        }
4107
4108        copy_for_split(trans, root, path, l, right, slot, mid, nritems);
4109
4110        if (split == 2) {
4111                BUG_ON(num_doubles != 0);
4112                num_doubles++;
4113                goto again;
4114        }
4115
4116        return 0;
4117
4118push_for_double:
4119        push_for_double_split(trans, root, path, data_size);
4120        tried_avoid_double = 1;
4121        if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4122                return 0;
4123        goto again;
4124}
4125
4126static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4127                                         struct btrfs_root *root,
4128                                         struct btrfs_path *path, int ins_len)
4129{
4130        struct btrfs_key key;
4131        struct extent_buffer *leaf;
4132        struct btrfs_file_extent_item *fi;
4133        u64 extent_len = 0;
4134        u32 item_size;
4135        int ret;
4136
4137        leaf = path->nodes[0];
4138        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4139
4140        BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4141               key.type != BTRFS_EXTENT_CSUM_KEY);
4142
4143        if (btrfs_leaf_free_space(root, leaf) >= ins_len)
4144                return 0;
4145
4146        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4147        if (key.type == BTRFS_EXTENT_DATA_KEY) {
4148                fi = btrfs_item_ptr(leaf, path->slots[0],
4149                                    struct btrfs_file_extent_item);
4150                extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4151        }
4152        btrfs_release_path(path);
4153
4154        path->keep_locks = 1;
4155        path->search_for_split = 1;
4156        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4157        path->search_for_split = 0;
4158        if (ret < 0)
4159                goto err;
4160
4161        ret = -EAGAIN;
4162        leaf = path->nodes[0];
4163        /* if our item isn't there or got smaller, return now */
4164        if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4165                goto err;
4166
4167        /* the leaf has  changed, it now has room.  return now */
4168        if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
4169                goto err;
4170
4171        if (key.type == BTRFS_EXTENT_DATA_KEY) {
4172                fi = btrfs_item_ptr(leaf, path->slots[0],
4173                                    struct btrfs_file_extent_item);
4174                if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4175                        goto err;
4176        }
4177
4178        btrfs_set_path_blocking(path);
4179        ret = split_leaf(trans, root, &key, path, ins_len, 1);
4180        if (ret)
4181                goto err;
4182
4183        path->keep_locks = 0;
4184        btrfs_unlock_up_safe(path, 1);
4185        return 0;
4186err:
4187        path->keep_locks = 0;
4188        return ret;
4189}
4190
4191static noinline int split_item(struct btrfs_trans_handle *trans,
4192                               struct btrfs_root *root,
4193                               struct btrfs_path *path,
4194                               struct btrfs_key *new_key,
4195                               unsigned long split_offset)
4196{
4197        struct extent_buffer *leaf;
4198        struct btrfs_item *item;
4199        struct btrfs_item *new_item;
4200        int slot;
4201        char *buf;
4202        u32 nritems;
4203        u32 item_size;
4204        u32 orig_offset;
4205        struct btrfs_disk_key disk_key;
4206
4207        leaf = path->nodes[0];
4208        BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
4209
4210        btrfs_set_path_blocking(path);
4211
4212        item = btrfs_item_nr(leaf, path->slots[0]);
4213        orig_offset = btrfs_item_offset(leaf, item);
4214        item_size = btrfs_item_size(leaf, item);
4215
4216        buf = kmalloc(item_size, GFP_NOFS);
4217        if (!buf)
4218                return -ENOMEM;
4219
4220        read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4221                            path->slots[0]), item_size);
4222
4223        slot = path->slots[0] + 1;
4224        nritems = btrfs_header_nritems(leaf);
4225        if (slot != nritems) {
4226                /* shift the items */
4227                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4228                                btrfs_item_nr_offset(slot),
4229                                (nritems - slot) * sizeof(struct btrfs_item));
4230        }
4231
4232        btrfs_cpu_key_to_disk(&disk_key, new_key);
4233        btrfs_set_item_key(leaf, &disk_key, slot);
4234
4235        new_item = btrfs_item_nr(leaf, slot);
4236
4237        btrfs_set_item_offset(leaf, new_item, orig_offset);
4238        btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4239
4240        btrfs_set_item_offset(leaf, item,
4241                              orig_offset + item_size - split_offset);
4242        btrfs_set_item_size(leaf, item, split_offset);
4243
4244        btrfs_set_header_nritems(leaf, nritems + 1);
4245
4246        /* write the data for the start of the original item */
4247        write_extent_buffer(leaf, buf,
4248                            btrfs_item_ptr_offset(leaf, path->slots[0]),
4249                            split_offset);
4250
4251        /* write the data for the new item */
4252        write_extent_buffer(leaf, buf + split_offset,
4253                            btrfs_item_ptr_offset(leaf, slot),
4254                            item_size - split_offset);
4255        btrfs_mark_buffer_dirty(leaf);
4256
4257        BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
4258        kfree(buf);
4259        return 0;
4260}
4261
4262/*
4263 * This function splits a single item into two items,
4264 * giving 'new_key' to the new item and splitting the
4265 * old one at split_offset (from the start of the item).
4266 *
4267 * The path may be released by this operation.  After
4268 * the split, the path is pointing to the old item.  The
4269 * new item is going to be in the same node as the old one.
4270 *
4271 * Note, the item being split must be smaller enough to live alone on
4272 * a tree block with room for one extra struct btrfs_item
4273 *
4274 * This allows us to split the item in place, keeping a lock on the
4275 * leaf the entire time.
4276 */
4277int btrfs_split_item(struct btrfs_trans_handle *trans,
4278                     struct btrfs_root *root,
4279                     struct btrfs_path *path,
4280                     struct btrfs_key *new_key,
4281                     unsigned long split_offset)
4282{
4283        int ret;
4284        ret = setup_leaf_for_split(trans, root, path,
4285                                   sizeof(struct btrfs_item));
4286        if (ret)
4287                return ret;
4288
4289        ret = split_item(trans, root, path, new_key, split_offset);
4290        return ret;
4291}
4292
4293/*
4294 * This function duplicate a item, giving 'new_key' to the new item.
4295 * It guarantees both items live in the same tree leaf and the new item
4296 * is contiguous with the original item.
4297 *
4298 * This allows us to split file extent in place, keeping a lock on the
4299 * leaf the entire time.
4300 */
4301int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4302                         struct btrfs_root *root,
4303                         struct btrfs_path *path,
4304                         struct btrfs_key *new_key)
4305{
4306        struct extent_buffer *leaf;
4307        int ret;
4308        u32 item_size;
4309
4310        leaf = path->nodes[0];
4311        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4312        ret = setup_leaf_for_split(trans, root, path,
4313                                   item_size + sizeof(struct btrfs_item));
4314        if (ret)
4315                return ret;
4316
4317        path->slots[0]++;
4318        setup_items_for_insert(root, path, new_key, &item_size,
4319                               item_size, item_size +
4320                               sizeof(struct btrfs_item), 1);
4321        leaf = path->nodes[0];
4322        memcpy_extent_buffer(leaf,
4323                             btrfs_item_ptr_offset(leaf, path->slots[0]),
4324                             btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4325                             item_size);
4326        return 0;
4327}
4328
4329/*
4330 * make the item pointed to by the path smaller.  new_size indicates
4331 * how small to make it, and from_end tells us if we just chop bytes
4332 * off the end of the item or if we shift the item to chop bytes off
4333 * the front.
4334 */
4335void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
4336                         u32 new_size, int from_end)
4337{
4338        int slot;
4339        struct extent_buffer *leaf;
4340        struct btrfs_item *item;
4341        u32 nritems;
4342        unsigned int data_end;
4343        unsigned int old_data_start;
4344        unsigned int old_size;
4345        unsigned int size_diff;
4346        int i;
4347        struct btrfs_map_token token;
4348
4349        btrfs_init_map_token(&token);
4350
4351        leaf = path->nodes[0];
4352        slot = path->slots[0];
4353
4354        old_size = btrfs_item_size_nr(leaf, slot);
4355        if (old_size == new_size)
4356                return;
4357
4358        nritems = btrfs_header_nritems(leaf);
4359        data_end = leaf_data_end(root, leaf);
4360
4361        old_data_start = btrfs_item_offset_nr(leaf, slot);
4362
4363        size_diff = old_size - new_size;
4364
4365        BUG_ON(slot < 0);
4366        BUG_ON(slot >= nritems);
4367
4368        /*
4369         * item0..itemN ... dataN.offset..dataN.size .. data0.size
4370         */
4371        /* first correct the data pointers */
4372        for (i = slot; i < nritems; i++) {
4373                u32 ioff;
4374                item = btrfs_item_nr(leaf, i);
4375
4376                ioff = btrfs_token_item_offset(leaf, item, &token);
4377                btrfs_set_token_item_offset(leaf, item,
4378                                            ioff + size_diff, &token);
4379        }
4380
4381        /* shift the data */
4382        if (from_end) {
4383                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4384                              data_end + size_diff, btrfs_leaf_data(leaf) +
4385                              data_end, old_data_start + new_size - data_end);
4386        } else {
4387                struct btrfs_disk_key disk_key;
4388                u64 offset;
4389
4390                btrfs_item_key(leaf, &disk_key, slot);
4391
4392                if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4393                        unsigned long ptr;
4394                        struct btrfs_file_extent_item *fi;
4395
4396                        fi = btrfs_item_ptr(leaf, slot,
4397                                            struct btrfs_file_extent_item);
4398                        fi = (struct btrfs_file_extent_item *)(
4399                             (unsigned long)fi - size_diff);
4400
4401                        if (btrfs_file_extent_type(leaf, fi) ==
4402                            BTRFS_FILE_EXTENT_INLINE) {
4403                                ptr = btrfs_item_ptr_offset(leaf, slot);
4404                                memmove_extent_buffer(leaf, ptr,
4405                                      (unsigned long)fi,
4406                                      offsetof(struct btrfs_file_extent_item,
4407                                                 disk_bytenr));
4408                        }
4409                }
4410
4411                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4412                              data_end + size_diff, btrfs_leaf_data(leaf) +
4413                              data_end, old_data_start - data_end);
4414
4415                offset = btrfs_disk_key_offset(&disk_key);
4416                btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4417                btrfs_set_item_key(leaf, &disk_key, slot);
4418                if (slot == 0)
4419                        fixup_low_keys(root, path, &disk_key, 1);
4420        }
4421
4422        item = btrfs_item_nr(leaf, slot);
4423        btrfs_set_item_size(leaf, item, new_size);
4424        btrfs_mark_buffer_dirty(leaf);
4425
4426        if (btrfs_leaf_free_space(root, leaf) < 0) {
4427                btrfs_print_leaf(root, leaf);
4428                BUG();
4429        }
4430}
4431
4432/*
4433 * make the item pointed to by the path bigger, data_size is the new size.
4434 */
4435void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
4436                       u32 data_size)
4437{
4438        int slot;
4439        struct extent_buffer *leaf;
4440        struct btrfs_item *item;
4441        u32 nritems;
4442        unsigned int data_end;
4443        unsigned int old_data;
4444        unsigned int old_size;
4445        int i;
4446        struct btrfs_map_token token;
4447
4448        btrfs_init_map_token(&token);
4449
4450        leaf = path->nodes[0];
4451
4452        nritems = btrfs_header_nritems(leaf);
4453        data_end = leaf_data_end(root, leaf);
4454
4455        if (btrfs_leaf_free_space(root, leaf) < data_size) {
4456                btrfs_print_leaf(root, leaf);
4457                BUG();
4458        }
4459        slot = path->slots[0];
4460        old_data = btrfs_item_end_nr(leaf, slot);
4461
4462        BUG_ON(slot < 0);
4463        if (slot >= nritems) {
4464                btrfs_print_leaf(root, leaf);
4465                printk(KERN_CRIT "slot %d too large, nritems %d\n",
4466                       slot, nritems);
4467                BUG_ON(1);
4468        }
4469
4470        /*
4471         * item0..itemN ... dataN.offset..dataN.size .. data0.size
4472         */
4473        /* first correct the data pointers */
4474        for (i = slot; i < nritems; i++) {
4475                u32 ioff;
4476                item = btrfs_item_nr(leaf, i);
4477
4478                ioff = btrfs_token_item_offset(leaf, item, &token);
4479                btrfs_set_token_item_offset(leaf, item,
4480                                            ioff - data_size, &token);
4481        }
4482
4483        /* shift the data */
4484        memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4485                      data_end - data_size, btrfs_leaf_data(leaf) +
4486                      data_end, old_data - data_end);
4487
4488        data_end = old_data;
4489        old_size = btrfs_item_size_nr(leaf, slot);
4490        item = btrfs_item_nr(leaf, slot);
4491        btrfs_set_item_size(leaf, item, old_size + data_size);
4492        btrfs_mark_buffer_dirty(leaf);
4493
4494        if (btrfs_leaf_free_space(root, leaf) < 0) {
4495                btrfs_print_leaf(root, leaf);
4496                BUG();
4497        }
4498}
4499
4500/*
4501 * this is a helper for btrfs_insert_empty_items, the main goal here is
4502 * to save stack depth by doing the bulk of the work in a function
4503 * that doesn't call btrfs_search_slot
4504 */
4505void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4506                            struct btrfs_key *cpu_key, u32 *data_size,
4507                            u32 total_data, u32 total_size, int nr)
4508{
4509        struct btrfs_item *item;
4510        int i;
4511        u32 nritems;
4512        unsigned int data_end;
4513        struct btrfs_disk_key disk_key;
4514        struct extent_buffer *leaf;
4515        int slot;
4516        struct btrfs_map_token token;
4517
4518        btrfs_init_map_token(&token);
4519
4520        leaf = path->nodes[0];
4521        slot = path->slots[0];
4522
4523        nritems = btrfs_header_nritems(leaf);
4524        data_end = leaf_data_end(root, leaf);
4525
4526        if (btrfs_leaf_free_space(root, leaf) < total_size) {
4527                btrfs_print_leaf(root, leaf);
4528                printk(KERN_CRIT "not enough freespace need %u have %d\n",
4529                       total_size, btrfs_leaf_free_space(root, leaf));
4530                BUG();
4531        }
4532
4533        if (slot != nritems) {
4534                unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4535
4536                if (old_data < data_end) {
4537                        btrfs_print_leaf(root, leaf);
4538                        printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
4539                               slot, old_data, data_end);
4540                        BUG_ON(1);
4541                }
4542                /*
4543                 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4544                 */
4545                /* first correct the data pointers */
4546                for (i = slot; i < nritems; i++) {
4547                        u32 ioff;
4548
4549                        item = btrfs_item_nr(leaf, i);
4550                        ioff = btrfs_token_item_offset(leaf, item, &token);
4551                        btrfs_set_token_item_offset(leaf, item,
4552                                                    ioff - total_data, &token);
4553                }
4554                /* shift the items */
4555                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4556                              btrfs_item_nr_offset(slot),
4557                              (nritems - slot) * sizeof(struct btrfs_item));
4558
4559                /* shift the data */
4560                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4561                              data_end - total_data, btrfs_leaf_data(leaf) +
4562                              data_end, old_data - data_end);
4563                data_end = old_data;
4564        }
4565
4566        /* setup the item for the new data */
4567        for (i = 0; i < nr; i++) {
4568                btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4569                btrfs_set_item_key(leaf, &disk_key, slot + i);
4570                item = btrfs_item_nr(leaf, slot + i);
4571                btrfs_set_token_item_offset(leaf, item,
4572                                            data_end - data_size[i], &token);
4573                data_end -= data_size[i];
4574                btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4575        }
4576
4577        btrfs_set_header_nritems(leaf, nritems + nr);
4578
4579        if (slot == 0) {
4580                btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4581                fixup_low_keys(root, path, &disk_key, 1);
4582        }
4583        btrfs_unlock_up_safe(path, 1);
4584        btrfs_mark_buffer_dirty(leaf);
4585
4586        if (btrfs_leaf_free_space(root, leaf) < 0) {
4587                btrfs_print_leaf(root, leaf);
4588                BUG();
4589        }
4590}
4591
4592/*
4593 * Given a key and some data, insert items into the tree.
4594 * This does all the path init required, making room in the tree if needed.
4595 */
4596int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4597                            struct btrfs_root *root,
4598                            struct btrfs_path *path,
4599                            struct btrfs_key *cpu_key, u32 *data_size,
4600                            int nr)
4601{
4602        int ret = 0;
4603        int slot;
4604        int i;
4605        u32 total_size = 0;
4606        u32 total_data = 0;
4607
4608        for (i = 0; i < nr; i++)
4609                total_data += data_size[i];
4610
4611        total_size = total_data + (nr * sizeof(struct btrfs_item));
4612        ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4613        if (ret == 0)
4614                return -EEXIST;
4615        if (ret < 0)
4616                return ret;
4617
4618        slot = path->slots[0];
4619        BUG_ON(slot < 0);
4620
4621        setup_items_for_insert(root, path, cpu_key, data_size,
4622                               total_data, total_size, nr);
4623        return 0;
4624}
4625
4626/*
4627 * Given a key and some data, insert an item into the tree.
4628 * This does all the path init required, making room in the tree if needed.
4629 */
4630int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4631                      *root, struct btrfs_key *cpu_key, void *data, u32
4632                      data_size)
4633{
4634        int ret = 0;
4635        struct btrfs_path *path;
4636        struct extent_buffer *leaf;
4637        unsigned long ptr;
4638
4639        path = btrfs_alloc_path();
4640        if (!path)
4641                return -ENOMEM;
4642        ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4643        if (!ret) {
4644                leaf = path->nodes[0];
4645                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4646                write_extent_buffer(leaf, data, ptr, data_size);
4647                btrfs_mark_buffer_dirty(leaf);
4648        }
4649        btrfs_free_path(path);
4650        return ret;
4651}
4652
4653/*
4654 * delete the pointer from a given node.
4655 *
4656 * the tree should have been previously balanced so the deletion does not
4657 * empty a node.
4658 */
4659static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4660                    int level, int slot)
4661{
4662        struct extent_buffer *parent = path->nodes[level];
4663        u32 nritems;
4664        int ret;
4665
4666        nritems = btrfs_header_nritems(parent);
4667        if (slot != nritems - 1) {
4668                if (level)
4669                        tree_mod_log_eb_move(root->fs_info, parent, slot,
4670                                             slot + 1, nritems - slot - 1);
4671                memmove_extent_buffer(parent,
4672                              btrfs_node_key_ptr_offset(slot),
4673                              btrfs_node_key_ptr_offset(slot + 1),
4674                              sizeof(struct btrfs_key_ptr) *
4675                              (nritems - slot - 1));
4676        } else if (level) {
4677                ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4678                                              MOD_LOG_KEY_REMOVE);
4679                BUG_ON(ret < 0);
4680        }
4681
4682        nritems--;
4683        btrfs_set_header_nritems(parent, nritems);
4684        if (nritems == 0 && parent == root->node) {
4685                BUG_ON(btrfs_header_level(root->node) != 1);
4686                /* just turn the root into a leaf and break */
4687                btrfs_set_header_level(root->node, 0);
4688        } else if (slot == 0) {
4689                struct btrfs_disk_key disk_key;
4690
4691                btrfs_node_key(parent, &disk_key, 0);
4692                fixup_low_keys(root, path, &disk_key, level + 1);
4693        }
4694        btrfs_mark_buffer_dirty(parent);
4695}
4696
4697/*
4698 * a helper function to delete the leaf pointed to by path->slots[1] and
4699 * path->nodes[1].
4700 *
4701 * This deletes the pointer in path->nodes[1] and frees the leaf
4702 * block extent.  zero is returned if it all worked out, < 0 otherwise.
4703 *
4704 * The path must have already been setup for deleting the leaf, including
4705 * all the proper balancing.  path->nodes[1] must be locked.
4706 */
4707static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4708                                    struct btrfs_root *root,
4709                                    struct btrfs_path *path,
4710                                    struct extent_buffer *leaf)
4711{
4712        WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4713        del_ptr(root, path, 1, path->slots[1]);
4714
4715        /*
4716         * btrfs_free_extent is expensive, we want to make sure we
4717         * aren't holding any locks when we call it
4718         */
4719        btrfs_unlock_up_safe(path, 0);
4720
4721        root_sub_used(root, leaf->len);
4722
4723        extent_buffer_get(leaf);
4724        btrfs_free_tree_block(trans, root, leaf, 0, 1);
4725        free_extent_buffer_stale(leaf);
4726}
4727/*
4728 * delete the item at the leaf level in path.  If that empties
4729 * the leaf, remove it from the tree
4730 */
4731int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4732                    struct btrfs_path *path, int slot, int nr)
4733{
4734        struct extent_buffer *leaf;
4735        struct btrfs_item *item;
4736        int last_off;
4737        int dsize = 0;
4738        int ret = 0;
4739        int wret;
4740        int i;
4741        u32 nritems;
4742        struct btrfs_map_token token;
4743
4744        btrfs_init_map_token(&token);
4745
4746        leaf = path->nodes[0];
4747        last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4748
4749        for (i = 0; i < nr; i++)
4750                dsize += btrfs_item_size_nr(leaf, slot + i);
4751
4752        nritems = btrfs_header_nritems(leaf);
4753
4754        if (slot + nr != nritems) {
4755                int data_end = leaf_data_end(root, leaf);
4756
4757                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4758                              data_end + dsize,
4759                              btrfs_leaf_data(leaf) + data_end,
4760                              last_off - data_end);
4761
4762                for (i = slot + nr; i < nritems; i++) {
4763                        u32 ioff;
4764
4765                        item = btrfs_item_nr(leaf, i);
4766                        ioff = btrfs_token_item_offset(leaf, item, &token);
4767                        btrfs_set_token_item_offset(leaf, item,
4768                                                    ioff + dsize, &token);
4769                }
4770
4771                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4772                              btrfs_item_nr_offset(slot + nr),
4773                              sizeof(struct btrfs_item) *
4774                              (nritems - slot - nr));
4775        }
4776        btrfs_set_header_nritems(leaf, nritems - nr);
4777        nritems -= nr;
4778
4779        /* delete the leaf if we've emptied it */
4780        if (nritems == 0) {
4781                if (leaf == root->node) {
4782                        btrfs_set_header_level(leaf, 0);
4783                } else {
4784                        btrfs_set_path_blocking(path);
4785                        clean_tree_block(trans, root, leaf);
4786                        btrfs_del_leaf(trans, root, path, leaf);
4787                }
4788        } else {
4789                int used = leaf_space_used(leaf, 0, nritems);
4790                if (slot == 0) {
4791                        struct btrfs_disk_key disk_key;
4792
4793                        btrfs_item_key(leaf, &disk_key, 0);
4794                        fixup_low_keys(root, path, &disk_key, 1);
4795                }
4796
4797                /* delete the leaf if it is mostly empty */
4798                if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
4799                        /* push_leaf_left fixes the path.
4800                         * make sure the path still points to our leaf
4801                         * for possible call to del_ptr below
4802                         */
4803                        slot = path->slots[1];
4804                        extent_buffer_get(leaf);
4805
4806                        btrfs_set_path_blocking(path);
4807                        wret = push_leaf_left(trans, root, path, 1, 1,
4808                                              1, (u32)-1);
4809                        if (wret < 0 && wret != -ENOSPC)
4810                                ret = wret;
4811
4812                        if (path->nodes[0] == leaf &&
4813                            btrfs_header_nritems(leaf)) {
4814                                wret = push_leaf_right(trans, root, path, 1,
4815                                                       1, 1, 0);
4816                                if (wret < 0 && wret != -ENOSPC)
4817                                        ret = wret;
4818                        }
4819
4820                        if (btrfs_header_nritems(leaf) == 0) {
4821                                path->slots[1] = slot;
4822                                btrfs_del_leaf(trans, root, path, leaf);
4823                                free_extent_buffer(leaf);
4824                                ret = 0;
4825                        } else {
4826                                /* if we're still in the path, make sure
4827                                 * we're dirty.  Otherwise, one of the
4828                                 * push_leaf functions must have already
4829                                 * dirtied this buffer
4830                                 */
4831                                if (path->nodes[0] == leaf)
4832                                        btrfs_mark_buffer_dirty(leaf);
4833                                free_extent_buffer(leaf);
4834                        }
4835                } else {
4836                        btrfs_mark_buffer_dirty(leaf);
4837                }
4838        }
4839        return ret;
4840}
4841
4842/*
4843 * search the tree again to find a leaf with lesser keys
4844 * returns 0 if it found something or 1 if there are no lesser leaves.
4845 * returns < 0 on io errors.
4846 *
4847 * This may release the path, and so you may lose any locks held at the
4848 * time you call it.
4849 */
4850int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4851{
4852        struct btrfs_key key;
4853        struct btrfs_disk_key found_key;
4854        int ret;
4855
4856        btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
4857
4858        if (key.offset > 0)
4859                key.offset--;
4860        else if (key.type > 0)
4861                key.type--;
4862        else if (key.objectid > 0)
4863                key.objectid--;
4864        else
4865                return 1;
4866
4867        btrfs_release_path(path);
4868        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4869        if (ret < 0)
4870                return ret;
4871        btrfs_item_key(path->nodes[0], &found_key, 0);
4872        ret = comp_keys(&found_key, &key);
4873        if (ret < 0)
4874                return 0;
4875        return 1;
4876}
4877
4878/*
4879 * A helper function to walk down the tree starting at min_key, and looking
4880 * for nodes or leaves that are have a minimum transaction id.
4881 * This is used by the btree defrag code, and tree logging
4882 *
4883 * This does not cow, but it does stuff the starting key it finds back
4884 * into min_key, so you can call btrfs_search_slot with cow=1 on the
4885 * key and get a writable path.
4886 *
4887 * This does lock as it descends, and path->keep_locks should be set
4888 * to 1 by the caller.
4889 *
4890 * This honors path->lowest_level to prevent descent past a given level
4891 * of the tree.
4892 *
4893 * min_trans indicates the oldest transaction that you are interested
4894 * in walking through.  Any nodes or leaves older than min_trans are
4895 * skipped over (without reading them).
4896 *
4897 * returns zero if something useful was found, < 0 on error and 1 if there
4898 * was nothing in the tree that matched the search criteria.
4899 */
4900int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4901                         struct btrfs_key *max_key,
4902                         struct btrfs_path *path,
4903                         u64 min_trans)
4904{
4905        struct extent_buffer *cur;
4906        struct btrfs_key found_key;
4907        int slot;
4908        int sret;
4909        u32 nritems;
4910        int level;
4911        int ret = 1;
4912
4913        WARN_ON(!path->keep_locks);
4914again:
4915        cur = btrfs_read_lock_root_node(root);
4916        level = btrfs_header_level(cur);
4917        WARN_ON(path->nodes[level]);
4918        path->nodes[level] = cur;
4919        path->locks[level] = BTRFS_READ_LOCK;
4920
4921        if (btrfs_header_generation(cur) < min_trans) {
4922                ret = 1;
4923                goto out;
4924        }
4925        while (1) {
4926                nritems = btrfs_header_nritems(cur);
4927                level = btrfs_header_level(cur);
4928                sret = bin_search(cur, min_key, level, &slot);
4929
4930                /* at the lowest level, we're done, setup the path and exit */
4931                if (level == path->lowest_level) {
4932                        if (slot >= nritems)
4933                                goto find_next_key;
4934                        ret = 0;
4935                        path->slots[level] = slot;
4936                        btrfs_item_key_to_cpu(cur, &found_key, slot);
4937                        goto out;
4938                }
4939                if (sret && slot > 0)
4940                        slot--;
4941                /*
4942                 * check this node pointer against the min_trans parameters.
4943                 * If it is too old, old, skip to the next one.
4944                 */
4945                while (slot < nritems) {
4946                        u64 blockptr;
4947                        u64 gen;
4948
4949                        blockptr = btrfs_node_blockptr(cur, slot);
4950                        gen = btrfs_node_ptr_generation(cur, slot);
4951                        if (gen < min_trans) {
4952                                slot++;
4953                                continue;
4954                        }
4955                        break;
4956                }
4957find_next_key:
4958                /*
4959                 * we didn't find a candidate key in this node, walk forward
4960                 * and find another one
4961                 */
4962                if (slot >= nritems) {
4963                        path->slots[level] = slot;
4964                        btrfs_set_path_blocking(path);
4965                        sret = btrfs_find_next_key(root, path, min_key, level,
4966                                                  min_trans);
4967                        if (sret == 0) {
4968                                btrfs_release_path(path);
4969                                goto again;
4970                        } else {
4971                                goto out;
4972                        }
4973                }
4974                /* save our key for returning back */
4975                btrfs_node_key_to_cpu(cur, &found_key, slot);
4976                path->slots[level] = slot;
4977                if (level == path->lowest_level) {
4978                        ret = 0;
4979                        unlock_up(path, level, 1, 0, NULL);
4980                        goto out;
4981                }
4982                btrfs_set_path_blocking(path);
4983                cur = read_node_slot(root, cur, slot);
4984                BUG_ON(!cur); /* -ENOMEM */
4985
4986                btrfs_tree_read_lock(cur);
4987
4988                path->locks[level - 1] = BTRFS_READ_LOCK;
4989                path->nodes[level - 1] = cur;
4990                unlock_up(path, level, 1, 0, NULL);
4991                btrfs_clear_path_blocking(path, NULL, 0);
4992        }
4993out:
4994        if (ret == 0)
4995                memcpy(min_key, &found_key, sizeof(found_key));
4996        btrfs_set_path_blocking(path);
4997        return ret;
4998}
4999
5000static void tree_move_down(struct btrfs_root *root,
5001                           struct btrfs_path *path,
5002                           int *level, int root_level)
5003{
5004        BUG_ON(*level == 0);
5005        path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5006                                        path->slots[*level]);
5007        path->slots[*level - 1] = 0;
5008        (*level)--;
5009}
5010
5011static int tree_move_next_or_upnext(struct btrfs_root *root,
5012                                    struct btrfs_path *path,
5013                                    int *level, int root_level)
5014{
5015        int ret = 0;
5016        int nritems;
5017        nritems = btrfs_header_nritems(path->nodes[*level]);
5018
5019        path->slots[*level]++;
5020
5021        while (path->slots[*level] >= nritems) {
5022                if (*level == root_level)
5023                        return -1;
5024
5025                /* move upnext */
5026                path->slots[*level] = 0;
5027                free_extent_buffer(path->nodes[*level]);
5028                path->nodes[*level] = NULL;
5029                (*level)++;
5030                path->slots[*level]++;
5031
5032                nritems = btrfs_header_nritems(path->nodes[*level]);
5033                ret = 1;
5034        }
5035        return ret;
5036}
5037
5038/*
5039 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5040 * or down.
5041 */
5042static int tree_advance(struct btrfs_root *root,
5043                        struct btrfs_path *path,
5044                        int *level, int root_level,
5045                        int allow_down,
5046                        struct btrfs_key *key)
5047{
5048        int ret;
5049
5050        if (*level == 0 || !allow_down) {
5051                ret = tree_move_next_or_upnext(root, path, level, root_level);
5052        } else {
5053                tree_move_down(root, path, level, root_level);
5054                ret = 0;
5055        }
5056        if (ret >= 0) {
5057                if (*level == 0)
5058                        btrfs_item_key_to_cpu(path->nodes[*level], key,
5059                                        path->slots[*level]);
5060                else
5061                        btrfs_node_key_to_cpu(path->nodes[*level], key,
5062                                        path->slots[*level]);
5063        }
5064        return ret;
5065}
5066
5067static int tree_compare_item(struct btrfs_root *left_root,
5068                             struct btrfs_path *left_path,
5069                             struct btrfs_path *right_path,
5070                             char *tmp_buf)
5071{
5072        int cmp;
5073        int len1, len2;
5074        unsigned long off1, off2;
5075
5076        len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5077        len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5078        if (len1 != len2)
5079                return 1;
5080
5081        off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5082        off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5083                                right_path->slots[0]);
5084
5085        read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5086
5087        cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5088        if (cmp)
5089                return 1;
5090        return 0;
5091}
5092
5093#define ADVANCE 1
5094#define ADVANCE_ONLY_NEXT -1
5095
5096/*
5097 * This function compares two trees and calls the provided callback for
5098 * every changed/new/deleted item it finds.
5099 * If shared tree blocks are encountered, whole subtrees are skipped, making
5100 * the compare pretty fast on snapshotted subvolumes.
5101 *
5102 * This currently works on commit roots only. As commit roots are read only,
5103 * we don't do any locking. The commit roots are protected with transactions.
5104 * Transactions are ended and rejoined when a commit is tried in between.
5105 *
5106 * This function checks for modifications done to the trees while comparing.
5107 * If it detects a change, it aborts immediately.
5108 */
5109int btrfs_compare_trees(struct btrfs_root *left_root,
5110                        struct btrfs_root *right_root,
5111                        btrfs_changed_cb_t changed_cb, void *ctx)
5112{
5113        int ret;
5114        int cmp;
5115        struct btrfs_trans_handle *trans = NULL;
5116        struct btrfs_path *left_path = NULL;
5117        struct btrfs_path *right_path = NULL;
5118        struct btrfs_key left_key;
5119        struct btrfs_key right_key;
5120        char *tmp_buf = NULL;
5121        int left_root_level;
5122        int right_root_level;
5123        int left_level;
5124        int right_level;
5125        int left_end_reached;
5126        int right_end_reached;
5127        int advance_left;
5128        int advance_right;
5129        u64 left_blockptr;
5130        u64 right_blockptr;
5131        u64 left_start_ctransid;
5132        u64 right_start_ctransid;
5133        u64 ctransid;
5134
5135        left_path = btrfs_alloc_path();
5136        if (!left_path) {
5137                ret = -ENOMEM;
5138                goto out;
5139        }
5140        right_path = btrfs_alloc_path();
5141        if (!right_path) {
5142                ret = -ENOMEM;
5143                goto out;
5144        }
5145
5146        tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
5147        if (!tmp_buf) {
5148                ret = -ENOMEM;
5149                goto out;
5150        }
5151
5152        left_path->search_commit_root = 1;
5153        left_path->skip_locking = 1;
5154        right_path->search_commit_root = 1;
5155        right_path->skip_locking = 1;
5156
5157        spin_lock(&left_root->root_item_lock);
5158        left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
5159        spin_unlock(&left_root->root_item_lock);
5160
5161        spin_lock(&right_root->root_item_lock);
5162        right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
5163        spin_unlock(&right_root->root_item_lock);
5164
5165        trans = btrfs_join_transaction(left_root);
5166        if (IS_ERR(trans)) {
5167                ret = PTR_ERR(trans);
5168                trans = NULL;
5169                goto out;
5170        }
5171
5172        /*
5173         * Strategy: Go to the first items of both trees. Then do
5174         *
5175         * If both trees are at level 0
5176         *   Compare keys of current items
5177         *     If left < right treat left item as new, advance left tree
5178         *       and repeat
5179         *     If left > right treat right item as deleted, advance right tree
5180         *       and repeat
5181         *     If left == right do deep compare of items, treat as changed if
5182         *       needed, advance both trees and repeat
5183         * If both trees are at the same level but not at level 0
5184         *   Compare keys of current nodes/leafs
5185         *     If left < right advance left tree and repeat
5186         *     If left > right advance right tree and repeat
5187         *     If left == right compare blockptrs of the next nodes/leafs
5188         *       If they match advance both trees but stay at the same level
5189         *         and repeat
5190         *       If they don't match advance both trees while allowing to go
5191         *         deeper and repeat
5192         * If tree levels are different
5193         *   Advance the tree that needs it and repeat
5194         *
5195         * Advancing a tree means:
5196         *   If we are at level 0, try to go to the next slot. If that's not
5197         *   possible, go one level up and repeat. Stop when we found a level
5198         *   where we could go to the next slot. We may at this point be on a
5199         *   node or a leaf.
5200         *
5201         *   If we are not at level 0 and not on shared tree blocks, go one
5202         *   level deeper.
5203         *
5204         *   If we are not at level 0 and on shared tree blocks, go one slot to
5205         *   the right if possible or go up and right.
5206         */
5207
5208        left_level = btrfs_header_level(left_root->commit_root);
5209        left_root_level = left_level;
5210        left_path->nodes[left_level] = left_root->commit_root;
5211        extent_buffer_get(left_path->nodes[left_level]);
5212
5213        right_level = btrfs_header_level(right_root->commit_root);
5214        right_root_level = right_level;
5215        right_path->nodes[right_level] = right_root->commit_root;
5216        extent_buffer_get(right_path->nodes[right_level]);
5217
5218        if (left_level == 0)
5219                btrfs_item_key_to_cpu(left_path->nodes[left_level],
5220                                &left_key, left_path->slots[left_level]);
5221        else
5222                btrfs_node_key_to_cpu(left_path->nodes[left_level],
5223                                &left_key, left_path->slots[left_level]);
5224        if (right_level == 0)
5225                btrfs_item_key_to_cpu(right_path->nodes[right_level],
5226                                &right_key, right_path->slots[right_level]);
5227        else
5228                btrfs_node_key_to_cpu(right_path->nodes[right_level],
5229                                &right_key, right_path->slots[right_level]);
5230
5231        left_end_reached = right_end_reached = 0;
5232        advance_left = advance_right = 0;
5233
5234        while (1) {
5235                /*
5236                 * We need to make sure the transaction does not get committed
5237                 * while we do anything on commit roots. This means, we need to
5238                 * join and leave transactions for every item that we process.
5239                 */
5240                if (trans && btrfs_should_end_transaction(trans, left_root)) {
5241                        btrfs_release_path(left_path);
5242                        btrfs_release_path(right_path);
5243
5244                        ret = btrfs_end_transaction(trans, left_root);
5245                        trans = NULL;
5246                        if (ret < 0)
5247                                goto out;
5248                }
5249                /* now rejoin the transaction */
5250                if (!trans) {
5251                        trans = btrfs_join_transaction(left_root);
5252                        if (IS_ERR(trans)) {
5253                                ret = PTR_ERR(trans);
5254                                trans = NULL;
5255                                goto out;
5256                        }
5257
5258                        spin_lock(&left_root->root_item_lock);
5259                        ctransid = btrfs_root_ctransid(&left_root->root_item);
5260                        spin_unlock(&left_root->root_item_lock);
5261                        if (ctransid != left_start_ctransid)
5262                                left_start_ctransid = 0;
5263
5264                        spin_lock(&right_root->root_item_lock);
5265                        ctransid = btrfs_root_ctransid(&right_root->root_item);
5266                        spin_unlock(&right_root->root_item_lock);
5267                        if (ctransid != right_start_ctransid)
5268                                right_start_ctransid = 0;
5269
5270                        if (!left_start_ctransid || !right_start_ctransid) {
5271                                WARN(1, KERN_WARNING
5272                                        "btrfs: btrfs_compare_tree detected "
5273                                        "a change in one of the trees while "
5274                                        "iterating. This is probably a "
5275                                        "bug.\n");
5276                                ret = -EIO;
5277                                goto out;
5278                        }
5279
5280                        /*
5281                         * the commit root may have changed, so start again
5282                         * where we stopped
5283                         */
5284                        left_path->lowest_level = left_level;
5285                        right_path->lowest_level = right_level;
5286                        ret = btrfs_search_slot(NULL, left_root,
5287                                        &left_key, left_path, 0, 0);
5288                        if (ret < 0)
5289                                goto out;
5290                        ret = btrfs_search_slot(NULL, right_root,
5291                                        &right_key, right_path, 0, 0);
5292                        if (ret < 0)
5293                                goto out;
5294                }
5295
5296                if (advance_left && !left_end_reached) {
5297                        ret = tree_advance(left_root, left_path, &left_level,
5298                                        left_root_level,
5299                                        advance_left != ADVANCE_ONLY_NEXT,
5300                                        &left_key);
5301                        if (ret < 0)
5302                                left_end_reached = ADVANCE;
5303                        advance_left = 0;
5304                }
5305                if (advance_right && !right_end_reached) {
5306                        ret = tree_advance(right_root, right_path, &right_level,
5307                                        right_root_level,
5308                                        advance_right != ADVANCE_ONLY_NEXT,
5309                                        &right_key);
5310                        if (ret < 0)
5311                                right_end_reached = ADVANCE;
5312                        advance_right = 0;
5313                }
5314
5315                if (left_end_reached && right_end_reached) {
5316                        ret = 0;
5317                        goto out;
5318                } else if (left_end_reached) {
5319                        if (right_level == 0) {
5320                                ret = changed_cb(left_root, right_root,
5321                                                left_path, right_path,
5322                                                &right_key,
5323                                                BTRFS_COMPARE_TREE_DELETED,
5324                                                ctx);
5325                                if (ret < 0)
5326                                        goto out;
5327                        }
5328                        advance_right = ADVANCE;
5329                        continue;
5330                } else if (right_end_reached) {
5331                        if (left_level == 0) {
5332                                ret = changed_cb(left_root, right_root,
5333                                                left_path, right_path,
5334                                                &left_key,
5335                                                BTRFS_COMPARE_TREE_NEW,
5336                                                ctx);
5337                                if (ret < 0)
5338                                        goto out;
5339                        }
5340                        advance_left = ADVANCE;
5341                        continue;
5342                }
5343
5344                if (left_level == 0 && right_level == 0) {
5345                        cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5346                        if (cmp < 0) {
5347                                ret = changed_cb(left_root, right_root,
5348                                                left_path, right_path,
5349                                                &left_key,
5350                                                BTRFS_COMPARE_TREE_NEW,
5351                                                ctx);
5352                                if (ret < 0)
5353                                        goto out;
5354                                advance_left = ADVANCE;
5355                        } else if (cmp > 0) {
5356                                ret = changed_cb(left_root, right_root,
5357                                                left_path, right_path,
5358                                                &right_key,
5359                                                BTRFS_COMPARE_TREE_DELETED,
5360                                                ctx);
5361                                if (ret < 0)
5362                                        goto out;
5363                                advance_right = ADVANCE;
5364                        } else {
5365                                WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5366                                ret = tree_compare_item(left_root, left_path,
5367                                                right_path, tmp_buf);
5368                                if (ret) {
5369                                        WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5370                                        ret = changed_cb(left_root, right_root,
5371                                                left_path, right_path,
5372                                                &left_key,
5373                                                BTRFS_COMPARE_TREE_CHANGED,
5374                                                ctx);
5375                                        if (ret < 0)
5376                                                goto out;
5377                                }
5378                                advance_left = ADVANCE;
5379                                advance_right = ADVANCE;
5380                        }
5381                } else if (left_level == right_level) {
5382                        cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5383                        if (cmp < 0) {
5384                                advance_left = ADVANCE;
5385                        } else if (cmp > 0) {
5386                                advance_right = ADVANCE;
5387                        } else {
5388                                left_blockptr = btrfs_node_blockptr(
5389                                                left_path->nodes[left_level],
5390                                                left_path->slots[left_level]);
5391                                right_blockptr = btrfs_node_blockptr(
5392                                                right_path->nodes[right_level],
5393                                                right_path->slots[right_level]);
5394                                if (left_blockptr == right_blockptr) {
5395                                        /*
5396                                         * As we're on a shared block, don't
5397                                         * allow to go deeper.
5398                                         */
5399                                        advance_left = ADVANCE_ONLY_NEXT;
5400                                        advance_right = ADVANCE_ONLY_NEXT;
5401                                } else {
5402                                        advance_left = ADVANCE;
5403                                        advance_right = ADVANCE;
5404                                }
5405                        }
5406                } else if (left_level < right_level) {
5407                        advance_right = ADVANCE;
5408                } else {
5409                        advance_left = ADVANCE;
5410                }
5411        }
5412
5413out:
5414        btrfs_free_path(left_path);
5415        btrfs_free_path(right_path);
5416        kfree(tmp_buf);
5417
5418        if (trans) {
5419                if (!ret)
5420                        ret = btrfs_end_transaction(trans, left_root);
5421                else
5422                        btrfs_end_transaction(trans, left_root);
5423        }
5424
5425        return ret;
5426}
5427
5428/*
5429 * this is similar to btrfs_next_leaf, but does not try to preserve
5430 * and fixup the path.  It looks for and returns the next key in the
5431 * tree based on the current path and the min_trans parameters.
5432 *
5433 * 0 is returned if another key is found, < 0 if there are any errors
5434 * and 1 is returned if there are no higher keys in the tree
5435 *
5436 * path->keep_locks should be set to 1 on the search made before
5437 * calling this function.
5438 */
5439int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5440                        struct btrfs_key *key, int level, u64 min_trans)
5441{
5442        int slot;
5443        struct extent_buffer *c;
5444
5445        WARN_ON(!path->keep_locks);
5446        while (level < BTRFS_MAX_LEVEL) {
5447                if (!path->nodes[level])
5448                        return 1;
5449
5450                slot = path->slots[level] + 1;
5451                c = path->nodes[level];
5452next:
5453                if (slot >= btrfs_header_nritems(c)) {
5454                        int ret;
5455                        int orig_lowest;
5456                        struct btrfs_key cur_key;
5457                        if (level + 1 >= BTRFS_MAX_LEVEL ||
5458                            !path->nodes[level + 1])
5459                                return 1;
5460
5461                        if (path->locks[level + 1]) {
5462                                level++;
5463                                continue;
5464                        }
5465
5466                        slot = btrfs_header_nritems(c) - 1;
5467                        if (level == 0)
5468                                btrfs_item_key_to_cpu(c, &cur_key, slot);
5469                        else
5470                                btrfs_node_key_to_cpu(c, &cur_key, slot);
5471
5472                        orig_lowest = path->lowest_level;
5473                        btrfs_release_path(path);
5474                        path->lowest_level = level;
5475                        ret = btrfs_search_slot(NULL, root, &cur_key, path,
5476                                                0, 0);
5477                        path->lowest_level = orig_lowest;
5478                        if (ret < 0)
5479                                return ret;
5480
5481                        c = path->nodes[level];
5482                        slot = path->slots[level];
5483                        if (ret == 0)
5484                                slot++;
5485                        goto next;
5486                }
5487
5488                if (level == 0)
5489                        btrfs_item_key_to_cpu(c, key, slot);
5490                else {
5491                        u64 gen = btrfs_node_ptr_generation(c, slot);
5492
5493                        if (gen < min_trans) {
5494                                slot++;
5495                                goto next;
5496                        }
5497                        btrfs_node_key_to_cpu(c, key, slot);
5498                }
5499                return 0;
5500        }
5501        return 1;
5502}
5503
5504/*
5505 * search the tree again to find a leaf with greater keys
5506 * returns 0 if it found something or 1 if there are no greater leaves.
5507 * returns < 0 on io errors.
5508 */
5509int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5510{
5511        return btrfs_next_old_leaf(root, path, 0);
5512}
5513
5514int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5515                        u64 time_seq)
5516{
5517        int slot;
5518        int level;
5519        struct extent_buffer *c;
5520        struct extent_buffer *next;
5521        struct btrfs_key key;
5522        u32 nritems;
5523        int ret;
5524        int old_spinning = path->leave_spinning;
5525        int next_rw_lock = 0;
5526
5527        nritems = btrfs_header_nritems(path->nodes[0]);
5528        if (nritems == 0)
5529                return 1;
5530
5531        btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5532again:
5533        level = 1;
5534        next = NULL;
5535        next_rw_lock = 0;
5536        btrfs_release_path(path);
5537
5538        path->keep_locks = 1;
5539        path->leave_spinning = 1;
5540
5541        if (time_seq)
5542                ret = btrfs_search_old_slot(root, &key, path, time_seq);
5543        else
5544                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5545        path->keep_locks = 0;
5546
5547        if (ret < 0)
5548                return ret;
5549
5550        nritems = btrfs_header_nritems(path->nodes[0]);
5551        /*
5552         * by releasing the path above we dropped all our locks.  A balance
5553         * could have added more items next to the key that used to be
5554         * at the very end of the block.  So, check again here and
5555         * advance the path if there are now more items available.
5556         */
5557        if (nritems > 0 && path->slots[0] < nritems - 1) {
5558                if (ret == 0)
5559                        path->slots[0]++;
5560                ret = 0;
5561                goto done;
5562        }
5563
5564        while (level < BTRFS_MAX_LEVEL) {
5565                if (!path->nodes[level]) {
5566                        ret = 1;
5567                        goto done;
5568                }
5569
5570                slot = path->slots[level] + 1;
5571                c = path->nodes[level];
5572                if (slot >= btrfs_header_nritems(c)) {
5573                        level++;
5574                        if (level == BTRFS_MAX_LEVEL) {
5575                                ret = 1;
5576                                goto done;
5577                        }
5578                        continue;
5579                }
5580
5581                if (next) {
5582                        btrfs_tree_unlock_rw(next, next_rw_lock);
5583                        free_extent_buffer(next);
5584                }
5585
5586                next = c;
5587                next_rw_lock = path->locks[level];
5588                ret = read_block_for_search(NULL, root, path, &next, level,
5589                                            slot, &key, 0);
5590                if (ret == -EAGAIN)
5591                        goto again;
5592
5593                if (ret < 0) {
5594                        btrfs_release_path(path);
5595                        goto done;
5596                }
5597
5598                if (!path->skip_locking) {
5599                        ret = btrfs_try_tree_read_lock(next);
5600                        if (!ret && time_seq) {
5601                                /*
5602                                 * If we don't get the lock, we may be racing
5603                                 * with push_leaf_left, holding that lock while
5604                                 * itself waiting for the leaf we've currently
5605                                 * locked. To solve this situation, we give up
5606                                 * on our lock and cycle.
5607                                 */
5608                                free_extent_buffer(next);
5609                                btrfs_release_path(path);
5610                                cond_resched();
5611                                goto again;
5612                        }
5613                        if (!ret) {
5614                                btrfs_set_path_blocking(path);
5615                                btrfs_tree_read_lock(next);
5616                                btrfs_clear_path_blocking(path, next,
5617                                                          BTRFS_READ_LOCK);
5618                        }
5619                        next_rw_lock = BTRFS_READ_LOCK;
5620                }
5621                break;
5622        }
5623        path->slots[level] = slot;
5624        while (1) {
5625                level--;
5626                c = path->nodes[level];
5627                if (path->locks[level])
5628                        btrfs_tree_unlock_rw(c, path->locks[level]);
5629
5630                free_extent_buffer(c);
5631                path->nodes[level] = next;
5632                path->slots[level] = 0;
5633                if (!path->skip_locking)
5634                        path->locks[level] = next_rw_lock;
5635                if (!level)
5636                        break;
5637
5638                ret = read_block_for_search(NULL, root, path, &next, level,
5639                                            0, &key, 0);
5640                if (ret == -EAGAIN)
5641                        goto again;
5642
5643                if (ret < 0) {
5644                        btrfs_release_path(path);
5645                        goto done;
5646                }
5647
5648                if (!path->skip_locking) {
5649                        ret = btrfs_try_tree_read_lock(next);
5650                        if (!ret) {
5651                                btrfs_set_path_blocking(path);
5652                                btrfs_tree_read_lock(next);
5653                                btrfs_clear_path_blocking(path, next,
5654                                                          BTRFS_READ_LOCK);
5655                        }
5656                        next_rw_lock = BTRFS_READ_LOCK;
5657                }
5658        }
5659        ret = 0;
5660done:
5661        unlock_up(path, 0, 1, 0, NULL);
5662        path->leave_spinning = old_spinning;
5663        if (!old_spinning)
5664                btrfs_set_path_blocking(path);
5665
5666        return ret;
5667}
5668
5669/*
5670 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5671 * searching until it gets past min_objectid or finds an item of 'type'
5672 *
5673 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5674 */
5675int btrfs_previous_item(struct btrfs_root *root,
5676                        struct btrfs_path *path, u64 min_objectid,
5677                        int type)
5678{
5679        struct btrfs_key found_key;
5680        struct extent_buffer *leaf;
5681        u32 nritems;
5682        int ret;
5683
5684        while (1) {
5685                if (path->slots[0] == 0) {
5686                        btrfs_set_path_blocking(path);
5687                        ret = btrfs_prev_leaf(root, path);
5688                        if (ret != 0)
5689                                return ret;
5690                } else {
5691                        path->slots[0]--;
5692                }
5693                leaf = path->nodes[0];
5694                nritems = btrfs_header_nritems(leaf);
5695                if (nritems == 0)
5696                        return 1;
5697                if (path->slots[0] == nritems)
5698                        path->slots[0]--;
5699
5700                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5701                if (found_key.objectid < min_objectid)
5702                        break;
5703                if (found_key.type == type)
5704                        return 0;
5705                if (found_key.objectid == min_objectid &&
5706                    found_key.type < type)
5707                        break;
5708        }
5709        return 1;
5710}
5711