/* linux/fs/btrfs/ctree.c */
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
  18
  19#include <linux/sched.h>
  20#include <linux/slab.h>
  21#include <linux/rbtree.h>
  22#include "ctree.h"
  23#include "disk-io.h"
  24#include "transaction.h"
  25#include "print-tree.h"
  26#include "locking.h"
  27
/*
 * Forward declarations: these static helpers are defined later in this
 * file but are referenced by functions that appear before them.
 */
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *ins_key,
		      struct btrfs_path *path, int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);
static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				 struct extent_buffer *eb);
  44
  45struct btrfs_path *btrfs_alloc_path(void)
  46{
  47        struct btrfs_path *path;
  48        path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
  49        return path;
  50}
  51
  52/*
  53 * set all locked nodes in the path to blocking locks.  This should
  54 * be done before scheduling
  55 */
  56noinline void btrfs_set_path_blocking(struct btrfs_path *p)
  57{
  58        int i;
  59        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
  60                if (!p->nodes[i] || !p->locks[i])
  61                        continue;
  62                btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
  63                if (p->locks[i] == BTRFS_READ_LOCK)
  64                        p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
  65                else if (p->locks[i] == BTRFS_WRITE_LOCK)
  66                        p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
  67        }
  68}
  69
/*
 * reset all the locked nodes in the patch to spinning locks.
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

	if (held) {
		/*
		 * Mark @held blocking first so that, when the path's locks
		 * are retaken as spinning below, lockdep never sees a
		 * spinning lock acquired while another spinning lock in
		 * the same class is held.
		 */
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);

	/*
	 * Retake the locks bottom-up (leaf towards root); note the reverse
	 * iteration order compared to btrfs_set_path_blocking().
	 */
	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

	/* finally drop @held back to a spinning lock as well */
	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
}
 105
 106/* this also releases the path */
 107void btrfs_free_path(struct btrfs_path *p)
 108{
 109        if (!p)
 110                return;
 111        btrfs_release_path(p);
 112        kmem_cache_free(btrfs_path_cachep, p);
 113}
 114
 115/*
 116 * path release drops references on the extent buffers in the path
 117 * and it drops any locks held by this path
 118 *
 119 * It is safe to call this on paths that no locks or extent buffers held.
 120 */
 121noinline void btrfs_release_path(struct btrfs_path *p)
 122{
 123        int i;
 124
 125        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
 126                p->slots[i] = 0;
 127                if (!p->nodes[i])
 128                        continue;
 129                if (p->locks[i]) {
 130                        btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
 131                        p->locks[i] = 0;
 132                }
 133                free_extent_buffer(p->nodes[i]);
 134                p->nodes[i] = NULL;
 135        }
 136}
 137
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was cow'ed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			/* got a reference before the buffer could be freed */
			rcu_read_unlock();
			break;
		}
		/*
		 * refcount already hit zero: wait for the grace period so
		 * root->node gets republished, then retry.
		 */
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}
 171
 172/* loop around taking references on and locking the root node of the
 173 * tree until you end up with a lock on the root.  A locked buffer
 174 * is returned, with a reference held.
 175 */
 176struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 177{
 178        struct extent_buffer *eb;
 179
 180        while (1) {
 181                eb = btrfs_root_node(root);
 182                btrfs_tree_lock(eb);
 183                if (eb == root->node)
 184                        break;
 185                btrfs_tree_unlock(eb);
 186                free_extent_buffer(eb);
 187        }
 188        return eb;
 189}
 190
 191/* loop around taking references on and locking the root node of the
 192 * tree until you end up with a lock on the root.  A locked buffer
 193 * is returned, with a reference held.
 194 */
 195static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
 196{
 197        struct extent_buffer *eb;
 198
 199        while (1) {
 200                eb = btrfs_root_node(root);
 201                btrfs_tree_read_lock(eb);
 202                if (eb == root->node)
 203                        break;
 204                btrfs_tree_read_unlock(eb);
 205                free_extent_buffer(eb);
 206        }
 207        return eb;
 208}
 209
/* cowonly root (everything not a reference counted cow subvolume), just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	/*
	 * Unlocked fast path: already dirty, or this root's dirtiness is
	 * not tracked at all.
	 */
	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&root->fs_info->trans_lock);
	/* re-check atomically under the lock so we only queue the root once */
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &root->fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &root->fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&root->fs_info->trans_lock);
}
 232
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* reference-counted roots may only be copied inside their running
	 * transaction */
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	/* first key of the buffer seeds the new block's allocation hint */
	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* clone the whole node, then rewrite the header for the copy */
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* bump refs on everything the new root points to; full backref for
	 * the reloc tree, normal ref otherwise */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
 291
/*
 * Operation types recorded in the tree modification log
 * (fs_info->tree_mod_log).  Entries are later looked up via
 * tree_mod_log_search() by readers holding a tree mod sequence number.
 */
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

/* destination of a MOD_LOG_MOVE_KEYS record; the source slot is kept in
 * tree_mod_elem->slot */
struct tree_mod_move {
	int dst_slot;
	int nr_items;
};

/* identity of the replaced root for MOD_LOG_ROOT_REPLACE records */
struct tree_mod_root {
	u64 logical;
	u8 level;
};

/* a single entry of the tree modification log rb-tree */
struct tree_mod_elem {
	struct rb_node node;
	u64 index;		/* shifted logical */
	u64 seq;		/* tree mod seq assigned at insertion */
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct tree_mod_move move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
 334
/*
 * Thin wrappers around the rwlock protecting the tree mod log rb-tree.
 * Readers (log searches) take the read side; writers (log insertion and
 * garbage collection) take the write side.
 */
static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
{
	read_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
{
	read_unlock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
{
	write_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
{
	write_unlock(&fs_info->tree_mod_log_lock);
}
 354
/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	/* atomic64, so callers need none of the tree mod log locks */
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}
 362
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	/*
	 * Lock order: tree_mod_log_lock (write) outside tree_mod_seq_lock.
	 * Holding the write lock keeps log insertions out while the blocker
	 * is registered.
	 */
	tree_mod_log_write_lock(fs_info);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	tree_mod_log_write_unlock(fs_info);

	return elem->seq;
}
 385
/*
 * Drop the blocker @elem from the tree mod seq list and, when no blocker
 * with a lower sequence number remains, garbage-collect every log entry
 * whose seq is at or below the lowest remaining blocker.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	/* elem was never registered via btrfs_get_tree_mod_seq() */
	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	/* find the minimum seq among the remaining blockers */
	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tree_mod_log_write_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		/* grab the successor before erasing the current node */
		next = rb_next(node);
		tm = container_of(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	tree_mod_log_write_unlock(fs_info);
}
 435
/*
 * key order of the log:
 *       index -> sequence
 *
 * the index is the shifted logical of the *new* root node for root replace
 * operations, or the shifted logical of the affected block for all other
 * operations.
 *
 * Note: must be called with write lock (tree_mod_log_write_lock).
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	BUG_ON(!tm);

	/* the seq is assigned here, under the write lock */
	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	/* standard rb-tree insertion walk, keyed on (index, seq) */
	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = container_of(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->index < tm->index)
			new = &((*new)->rb_left);
		else if (cur->index > tm->index)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;	/* identical (index, seq) pair */
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
 479
/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * call tree_mod_log_write_unlock() to release.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb) {
	/* pairing of this barrier is not visible here — presumably it orders
	 * the list_empty() check against blocker registration; see
	 * btrfs_get_tree_mod_seq() */
	smp_mb();
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 1;
	/* leaves (level 0) are never logged */
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	tree_mod_log_write_lock(fs_info);
	/* re-check under the lock: the last blocker may just have gone away */
	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
		tree_mod_log_write_unlock(fs_info);
		return 1;
	}

	/* caller must log and then call tree_mod_log_write_unlock() */
	return 0;
}
 502
 503/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
 504static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
 505                                    struct extent_buffer *eb)
 506{
 507        smp_mb();
 508        if (list_empty(&(fs_info)->tree_mod_seq_list))
 509                return 0;
 510        if (eb && btrfs_header_level(eb) == 0)
 511                return 0;
 512
 513        return 1;
 514}
 515
/*
 * Allocate and fill a log element describing a single-key operation @op on
 * slot @slot of node @eb.  Returns NULL on allocation failure; the element
 * is not yet linked into the log (see __tree_mod_log_insert()).
 */
static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	/* log index is the block's logical address, shifted */
	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	if (op != MOD_LOG_KEY_ADD) {
		/* record what is being replaced/removed; an ADD has no
		 * previous content to save */
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}
 538
/*
 * Log a single-key operation @op on slot @slot of @eb.  Returns 0 on
 * success or when logging is not needed, negative errno on failure.
 */
static noinline int
tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
			struct extent_buffer *eb, int slot,
			enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	/* lockless fast path: nobody is interested in the log */
	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	/* allocate before taking the write lock */
	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	/* re-check; on "don't log" the lock was NOT taken */
	if (tree_mod_dont_log(fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	/* tree_mod_dont_log() returned 0 with the write lock held */
	ret = __tree_mod_log_insert(fs_info, tm);
	tree_mod_log_write_unlock(fs_info);
	if (ret)
		kfree(tm);

	return ret;
}
 566
/*
 * Log a move of @nr_items node pointers within @eb from @src_slot to
 * @dst_slot.  Slots that get overwritten by the move (only possible when
 * moving towards the front, dst_slot < src_slot) are additionally logged
 * as individual removals so their content can still be found.  Returns 0
 * on success or when logging is unnecessary, negative errno on failure.
 */
static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *eb, int dst_slot, int src_slot,
			 int nr_items, gfp_t flags)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;		/* 1 once tree_mod_log_write_lock is held */

	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	/* allocate everything up front, before taking the write lock */
	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	/* the MOVE record itself; source slot lives in tm->slot */
	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(fs_info, tm);
	if (ret)
		goto free_tms;
	tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return 0;
free_tms:
	/*
	 * Unwind: erase any elements already linked into the rb-tree (those
	 * have a non-empty node), then free everything.  kcalloc zeroed the
	 * array, so unallocated slots are NULL and kfree(NULL) is fine.
	 */
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
 641
/*
 * Insert all @nritems pre-allocated removal records of @tm_list into the
 * log, walking the slots in reverse order.  On failure, the records that
 * were already inserted are erased again, so the log is left unchanged.
 * Caller holds tree_mod_log_write_lock and owns the elements either way.
 */
static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			/* roll back the insertions done so far (j > i) */
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
 662
/*
 * Log the replacement of @old_root by @new_root.  When @log_removal is set
 * and the old root is not a leaf, every key pointer of the old root is
 * first logged as removed-while-freeing; then a single ROOT_REPLACE record
 * (indexed by the *new* root's logical) is added.  Returns 0 on success or
 * when logging is unnecessary, negative errno on failure.
 */
static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *old_root,
			 struct extent_buffer *new_root, gfp_t flags,
			 int log_removal)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* allocate per-slot removal records before taking the write lock */
	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  flags);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	/* root-replace records are keyed by the new root's logical */
	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	/* write lock is held from here until the unlock below */
	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	tree_mod_log_write_unlock(fs_info);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
 733
/*
 * Search the log for an element matching the block at logical @start with
 * seq >= @min_seq.  When @smallest is set, return the element with the
 * smallest such seq; otherwise the one with the largest.  Returns NULL if
 * no matching element exists.
 */
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;
	u64 index = start >> PAGE_CACHE_SHIFT;	/* same shift as insertion */

	tree_mod_log_read_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = container_of(node, struct tree_mod_elem, node);
		if (cur->index < index) {
			node = node->rb_left;
		} else if (cur->index > index) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* exact match on (index, min_seq) */
			found = cur;
			break;
		}
	}
	tree_mod_log_read_unlock(fs_info);

	return found;
}
 776
/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	/* smallest = 1: pick the lowest qualifying seq for this block */
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}
 788
/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	/* smallest = 0: pick the highest qualifying seq for this block */
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
 799
/*
 * Log the copy of @nr_items key pointers from @src (starting at
 * @src_offset) into @dst (starting at @dst_offset): one KEY_REMOVE record
 * per source slot and one KEY_ADD record per destination slot.  Returns 0
 * on success or when logging is unnecessary, negative errno on failure.
 */
static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;		/* 1 once tree_mod_log_write_lock is held */

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* leaf-to-leaf copies are never logged */
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	/* one array, split in half: adds first, removals second */
	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return 0;

free_tms:
	/*
	 * Unwind: erase anything already linked into the rb-tree and free
	 * all elements.  NULL slots (never allocated) are safe to kfree.
	 */
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return ret;
}
 870
 871static inline void
 872tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 873                     int dst_offset, int src_offset, int nr_items)
 874{
 875        int ret;
 876        ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
 877                                       nr_items, GFP_NOFS);
 878        BUG_ON(ret < 0);
 879}
 880
 881static noinline void
 882tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
 883                          struct extent_buffer *eb, int slot, int atomic)
 884{
 885        int ret;
 886
 887        ret = tree_mod_log_insert_key(fs_info, eb, slot,
 888                                        MOD_LOG_KEY_REPLACE,
 889                                        atomic ? GFP_ATOMIC : GFP_NOFS);
 890        BUG_ON(ret < 0);
 891}
 892
/*
 * Log the removal of every key pointer in @eb before the block is freed,
 * so that tree mod log readers can rewind across the free.
 *
 * All log elements are allocated up front, before taking the tree mod
 * log write lock, so that an allocation failure can bail out without
 * touching the log.
 *
 * Returns 0 on success (or when no logging is needed), -ENOMEM on
 * allocation failure, or the error from __tree_mod_log_free_eb.
 */
static noinline int
tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	/* leaves (level 0) are not tracked by the tree mod log */
	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	/* one REMOVE_WHILE_FREEING element per pointer in the block */
	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/*
	 * tree_mod_dont_log() returning zero means logging is enabled and
	 * the write lock is now held; it is paired with the unlock below.
	 */
	if (tree_mod_dont_log(fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	tree_mod_log_write_unlock(fs_info);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	/* elements not yet allocated are NULL; kfree(NULL) is a no-op */
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}
 939
 940static noinline void
 941tree_mod_log_set_root_pointer(struct btrfs_root *root,
 942                              struct extent_buffer *new_root_node,
 943                              int log_removal)
 944{
 945        int ret;
 946        ret = tree_mod_log_insert_root(root->fs_info, root->node,
 947                                       new_root_node, GFP_NOFS, log_removal);
 948        BUG_ON(ret < 0);
 949}
 950
 951/*
 952 * check if the tree block can be shared by multiple trees
 953 */
 954int btrfs_block_can_be_shared(struct btrfs_root *root,
 955                              struct extent_buffer *buf)
 956{
 957        /*
 958         * Tree blocks not in refernece counted trees and tree roots
 959         * are never shared. If a block was allocated after the last
 960         * snapshot and the block was not allocated by tree relocation,
 961         * we know the block is not shared.
 962         */
 963        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 964            buf != root->node && buf != root->commit_root &&
 965            (btrfs_header_generation(buf) <=
 966             btrfs_root_last_snapshot(&root->root_item) ||
 967             btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
 968                return 1;
 969#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 970        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 971            btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
 972                return 1;
 973#endif
 974        return 0;
 975}
 976
/*
 * Update backrefs when @buf is COWed into @cow.
 *
 * If @buf is shared (refs > 1), references held on the child blocks are
 * incremented/decremented as needed and the extent flags may be switched
 * to full backrefs.  If this was the last reference (refs == 1), the old
 * block is cleaned and *last_ref is set so the caller frees it.
 *
 * Returns 0 on success or a negative errno (-EROFS on a zero refcount,
 * which indicates a corrupted extent tree).
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is been relocating
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		/* look up the real refcount and flags from the extent tree */
		ret = btrfs_lookup_extent_info(trans, root, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			/* a referenced block with zero refs means corruption */
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			return ret;
		}
	} else {
		/* unshared block: we hold the only reference */
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	/* relocation-tree blocks must always carry full backrefs */
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			/* converting to full backrefs: add refs for buf */
			ret = btrfs_inc_ref(trans, root, buf, 1);
			BUG_ON(ret); /* -ENOMEM */

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				/* move the normal refs from buf to cow */
				ret = btrfs_dec_ref(trans, root, buf, 0);
				BUG_ON(ret); /* -ENOMEM */
				ret = btrfs_inc_ref(trans, root, cow, 1);
				BUG_ON(ret); /* -ENOMEM */
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			/* just add a reference for the new copy */
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret); /* -ENOMEM */
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			/* persist the new full-backref flag on disk */
			ret = btrfs_set_disk_extent_flags(trans, root,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			/* transfer the full-backref refs from buf to cow */
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret); /* -ENOMEM */
			ret = btrfs_dec_ref(trans, root, buf, 1);
			BUG_ON(ret); /* -ENOMEM */
		}
		/* last reference: the old block will be freed by the caller */
		clean_tree_block(trans, root->fs_info, buf);
		*last_ref = 1;
	}
	return 0;
}
1080
1081/*
1082 * does the dirty work in cow of a single block.  The parent block (if
1083 * supplied) is updated to point to the new cow copy.  The new buffer is marked
1084 * dirty and returned locked.  If you modify the block it needs to be marked
1085 * dirty again.
1086 *
1087 * search_start -- an allocation hint for the new block
1088 *
1089 * empty_size -- a hint that you plan on doing more cow.  This is the size in
1090 * bytes the allocator should try to find free next to the block it returns.
1091 * This is just a hint and may be ignored by the allocator.
1092 */
1093static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1094                             struct btrfs_root *root,
1095                             struct extent_buffer *buf,
1096                             struct extent_buffer *parent, int parent_slot,
1097                             struct extent_buffer **cow_ret,
1098                             u64 search_start, u64 empty_size)
1099{
1100        struct btrfs_disk_key disk_key;
1101        struct extent_buffer *cow;
1102        int level, ret;
1103        int last_ref = 0;
1104        int unlock_orig = 0;
1105        u64 parent_start;
1106
1107        if (*cow_ret == buf)
1108                unlock_orig = 1;
1109
1110        btrfs_assert_tree_locked(buf);
1111
1112        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
1113                trans->transid != root->fs_info->running_transaction->transid);
1114        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
1115                trans->transid != root->last_trans);
1116
1117        level = btrfs_header_level(buf);
1118
1119        if (level == 0)
1120                btrfs_item_key(buf, &disk_key, 0);
1121        else
1122                btrfs_node_key(buf, &disk_key, 0);
1123
1124        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1125                if (parent)
1126                        parent_start = parent->start;
1127                else
1128                        parent_start = 0;
1129        } else
1130                parent_start = 0;
1131
1132        cow = btrfs_alloc_tree_block(trans, root, parent_start,
1133                        root->root_key.objectid, &disk_key, level,
1134                        search_start, empty_size);
1135        if (IS_ERR(cow))
1136                return PTR_ERR(cow);
1137
1138        /* cow is set to blocking by btrfs_init_new_buffer */
1139
1140        copy_extent_buffer(cow, buf, 0, 0, cow->len);
1141        btrfs_set_header_bytenr(cow, cow->start);
1142        btrfs_set_header_generation(cow, trans->transid);
1143        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1144        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
1145                                     BTRFS_HEADER_FLAG_RELOC);
1146        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1147                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
1148        else
1149                btrfs_set_header_owner(cow, root->root_key.objectid);
1150
1151        write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
1152                            BTRFS_FSID_SIZE);
1153
1154        ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1155        if (ret) {
1156                btrfs_abort_transaction(trans, root, ret);
1157                return ret;
1158        }
1159
1160        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
1161                ret = btrfs_reloc_cow_block(trans, root, buf, cow);
1162                if (ret)
1163                        return ret;
1164        }
1165
1166        if (buf == root->node) {
1167                WARN_ON(parent && parent != buf);
1168                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1169                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1170                        parent_start = buf->start;
1171                else
1172                        parent_start = 0;
1173
1174                extent_buffer_get(cow);
1175                tree_mod_log_set_root_pointer(root, cow, 1);
1176                rcu_assign_pointer(root->node, cow);
1177
1178                btrfs_free_tree_block(trans, root, buf, parent_start,
1179                                      last_ref);
1180                free_extent_buffer(buf);
1181                add_root_to_dirty_list(root);
1182        } else {
1183                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1184                        parent_start = parent->start;
1185                else
1186                        parent_start = 0;
1187
1188                WARN_ON(trans->transid != btrfs_header_generation(parent));
1189                tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
1190                                        MOD_LOG_KEY_REPLACE, GFP_NOFS);
1191                btrfs_set_node_blockptr(parent, parent_slot,
1192                                        cow->start);
1193                btrfs_set_node_ptr_generation(parent, parent_slot,
1194                                              trans->transid);
1195                btrfs_mark_buffer_dirty(parent);
1196                if (last_ref) {
1197                        ret = tree_mod_log_free_eb(root->fs_info, buf);
1198                        if (ret) {
1199                                btrfs_abort_transaction(trans, root, ret);
1200                                return ret;
1201                        }
1202                }
1203                btrfs_free_tree_block(trans, root, buf, parent_start,
1204                                      last_ref);
1205        }
1206        if (unlock_orig)
1207                btrfs_tree_unlock(buf);
1208        free_extent_buffer_stale(buf);
1209        btrfs_mark_buffer_dirty(cow);
1210        *cow_ret = cow;
1211        return 0;
1212}
1213
1214/*
1215 * returns the logical address of the oldest predecessor of the given root.
1216 * entries older than time_seq are ignored.
1217 */
1218static struct tree_mod_elem *
1219__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1220                           struct extent_buffer *eb_root, u64 time_seq)
1221{
1222        struct tree_mod_elem *tm;
1223        struct tree_mod_elem *found = NULL;
1224        u64 root_logical = eb_root->start;
1225        int looped = 0;
1226
1227        if (!time_seq)
1228                return NULL;
1229
1230        /*
1231         * the very last operation that's logged for a root is the replacement
1232         * operation (if it is replaced at all). this has the index of the *new*
1233         * root, making it the very first operation that's logged for this root.
1234         */
1235        while (1) {
1236                tm = tree_mod_log_search_oldest(fs_info, root_logical,
1237                                                time_seq);
1238                if (!looped && !tm)
1239                        return NULL;
1240                /*
1241                 * if there are no tree operation for the oldest root, we simply
1242                 * return it. this should only happen if that (old) root is at
1243                 * level 0.
1244                 */
1245                if (!tm)
1246                        break;
1247
1248                /*
1249                 * if there's an operation that's not a root replacement, we
1250                 * found the oldest version of our root. normally, we'll find a
1251                 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1252                 */
1253                if (tm->op != MOD_LOG_ROOT_REPLACE)
1254                        break;
1255
1256                found = tm;
1257                root_logical = tm->old_root.logical;
1258                looped = 1;
1259        }
1260
1261        /* if there's no old root to return, return what we found instead */
1262        if (!found)
1263                found = tm;
1264
1265        return found;
1266}
1267
1268/*
1269 * tm is a pointer to the first operation to rewind within eb. then, all
1270 * previous operations will be rewinded (until we reach something older than
1271 * time_seq).
1272 */
1273static void
1274__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1275                      u64 time_seq, struct tree_mod_elem *first_tm)
1276{
1277        u32 n;
1278        struct rb_node *next;
1279        struct tree_mod_elem *tm = first_tm;
1280        unsigned long o_dst;
1281        unsigned long o_src;
1282        unsigned long p_size = sizeof(struct btrfs_key_ptr);
1283
1284        n = btrfs_header_nritems(eb);
1285        tree_mod_log_read_lock(fs_info);
1286        while (tm && tm->seq >= time_seq) {
1287                /*
1288                 * all the operations are recorded with the operator used for
1289                 * the modification. as we're going backwards, we do the
1290                 * opposite of each operation here.
1291                 */
1292                switch (tm->op) {
1293                case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1294                        BUG_ON(tm->slot < n);
1295                        /* Fallthrough */
1296                case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1297                case MOD_LOG_KEY_REMOVE:
1298                        btrfs_set_node_key(eb, &tm->key, tm->slot);
1299                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1300                        btrfs_set_node_ptr_generation(eb, tm->slot,
1301                                                      tm->generation);
1302                        n++;
1303                        break;
1304                case MOD_LOG_KEY_REPLACE:
1305                        BUG_ON(tm->slot >= n);
1306                        btrfs_set_node_key(eb, &tm->key, tm->slot);
1307                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1308                        btrfs_set_node_ptr_generation(eb, tm->slot,
1309                                                      tm->generation);
1310                        break;
1311                case MOD_LOG_KEY_ADD:
1312                        /* if a move operation is needed it's in the log */
1313                        n--;
1314                        break;
1315                case MOD_LOG_MOVE_KEYS:
1316                        o_dst = btrfs_node_key_ptr_offset(tm->slot);
1317                        o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1318                        memmove_extent_buffer(eb, o_dst, o_src,
1319                                              tm->move.nr_items * p_size);
1320                        break;
1321                case MOD_LOG_ROOT_REPLACE:
1322                        /*
1323                         * this operation is special. for roots, this must be
1324                         * handled explicitly before rewinding.
1325                         * for non-roots, this operation may exist if the node
1326                         * was a root: root A -> child B; then A gets empty and
1327                         * B is promoted to the new root. in the mod log, we'll
1328                         * have a root-replace operation for B, a tree block
1329                         * that is no root. we simply ignore that operation.
1330                         */
1331                        break;
1332                }
1333                next = rb_next(&tm->node);
1334                if (!next)
1335                        break;
1336                tm = container_of(next, struct tree_mod_elem, node);
1337                if (tm->index != first_tm->index)
1338                        break;
1339        }
1340        tree_mod_log_read_unlock(fs_info);
1341        btrfs_set_header_nritems(eb, n);
1342}
1343
1344/*
1345 * Called with eb read locked. If the buffer cannot be rewinded, the same buffer
1346 * is returned. If rewind operations happen, a fresh buffer is returned. The
1347 * returned buffer is always read-locked. If the returned buffer is not the
1348 * input buffer, the lock on the input buffer is released and the input buffer
1349 * is freed (its refcount is decremented).
1350 */
1351static struct extent_buffer *
1352tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1353                    struct extent_buffer *eb, u64 time_seq)
1354{
1355        struct extent_buffer *eb_rewin;
1356        struct tree_mod_elem *tm;
1357
1358        if (!time_seq)
1359                return eb;
1360
1361        if (btrfs_header_level(eb) == 0)
1362                return eb;
1363
1364        tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1365        if (!tm)
1366                return eb;
1367
1368        btrfs_set_path_blocking(path);
1369        btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1370
1371        if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1372                BUG_ON(tm->slot != 0);
1373                eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
1374                if (!eb_rewin) {
1375                        btrfs_tree_read_unlock_blocking(eb);
1376                        free_extent_buffer(eb);
1377                        return NULL;
1378                }
1379                btrfs_set_header_bytenr(eb_rewin, eb->start);
1380                btrfs_set_header_backref_rev(eb_rewin,
1381                                             btrfs_header_backref_rev(eb));
1382                btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1383                btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1384        } else {
1385                eb_rewin = btrfs_clone_extent_buffer(eb);
1386                if (!eb_rewin) {
1387                        btrfs_tree_read_unlock_blocking(eb);
1388                        free_extent_buffer(eb);
1389                        return NULL;
1390                }
1391        }
1392
1393        btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
1394        btrfs_tree_read_unlock_blocking(eb);
1395        free_extent_buffer(eb);
1396
1397        extent_buffer_get(eb_rewin);
1398        btrfs_tree_read_lock(eb_rewin);
1399        __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1400        WARN_ON(btrfs_header_nritems(eb_rewin) >
1401                BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
1402
1403        return eb_rewin;
1404}
1405
1406/*
1407 * get_old_root() rewinds the state of @root's root node to the given @time_seq
1408 * value. If there are no changes, the current root->root_node is returned. If
1409 * anything changed in between, there's a fresh buffer allocated on which the
1410 * rewind operations are done. In any case, the returned buffer is read locked.
1411 * Returns NULL on error (with no locks held).
1412 */
1413static inline struct extent_buffer *
1414get_old_root(struct btrfs_root *root, u64 time_seq)
1415{
1416        struct tree_mod_elem *tm;
1417        struct extent_buffer *eb = NULL;
1418        struct extent_buffer *eb_root;
1419        struct extent_buffer *old;
1420        struct tree_mod_root *old_root = NULL;
1421        u64 old_generation = 0;
1422        u64 logical;
1423
1424        eb_root = btrfs_read_lock_root_node(root);
1425        tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1426        if (!tm)
1427                return eb_root;
1428
1429        if (tm->op == MOD_LOG_ROOT_REPLACE) {
1430                old_root = &tm->old_root;
1431                old_generation = tm->generation;
1432                logical = old_root->logical;
1433        } else {
1434                logical = eb_root->start;
1435        }
1436
1437        tm = tree_mod_log_search(root->fs_info, logical, time_seq);
1438        if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1439                btrfs_tree_read_unlock(eb_root);
1440                free_extent_buffer(eb_root);
1441                old = read_tree_block(root, logical, 0);
1442                if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
1443                        free_extent_buffer(old);
1444                        btrfs_warn(root->fs_info,
1445                                "failed to read tree block %llu from get_old_root", logical);
1446                } else {
1447                        eb = btrfs_clone_extent_buffer(old);
1448                        free_extent_buffer(old);
1449                }
1450        } else if (old_root) {
1451                btrfs_tree_read_unlock(eb_root);
1452                free_extent_buffer(eb_root);
1453                eb = alloc_dummy_extent_buffer(root->fs_info, logical);
1454        } else {
1455                btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
1456                eb = btrfs_clone_extent_buffer(eb_root);
1457                btrfs_tree_read_unlock_blocking(eb_root);
1458                free_extent_buffer(eb_root);
1459        }
1460
1461        if (!eb)
1462                return NULL;
1463        extent_buffer_get(eb);
1464        btrfs_tree_read_lock(eb);
1465        if (old_root) {
1466                btrfs_set_header_bytenr(eb, eb->start);
1467                btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1468                btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
1469                btrfs_set_header_level(eb, old_root->level);
1470                btrfs_set_header_generation(eb, old_generation);
1471        }
1472        if (tm)
1473                __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
1474        else
1475                WARN_ON(btrfs_header_level(eb) != 0);
1476        WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
1477
1478        return eb;
1479}
1480
1481int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1482{
1483        struct tree_mod_elem *tm;
1484        int level;
1485        struct extent_buffer *eb_root = btrfs_root_node(root);
1486
1487        tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1488        if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1489                level = tm->old_root.level;
1490        } else {
1491                level = btrfs_header_level(eb_root);
1492        }
1493        free_extent_buffer(eb_root);
1494
1495        return level;
1496}
1497
1498static inline int should_cow_block(struct btrfs_trans_handle *trans,
1499                                   struct btrfs_root *root,
1500                                   struct extent_buffer *buf)
1501{
1502        if (btrfs_test_is_dummy_root(root))
1503                return 0;
1504
1505        /* ensure we can see the force_cow */
1506        smp_rmb();
1507
1508        /*
1509         * We do not need to cow a block if
1510         * 1) this block is not created or changed in this transaction;
1511         * 2) this block does not belong to TREE_RELOC tree;
1512         * 3) the root is not forced COW.
1513         *
1514         * What is forced COW:
1515         *    when we create snapshot during commiting the transaction,
1516         *    after we've finished coping src root, we must COW the shared
1517         *    block to ensure the metadata consistency.
1518         */
1519        if (btrfs_header_generation(buf) == trans->transid &&
1520            !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1521            !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1522              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1523            !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1524                return 0;
1525        return 1;
1526}
1527
1528/*
1529 * cows a single block, see __btrfs_cow_block for the real work.
1530 * This version of it has extra checks so that a block isn't cow'd more than
1531 * once per transaction, as long as it hasn't been written yet
1532 */
1533noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1534                    struct btrfs_root *root, struct extent_buffer *buf,
1535                    struct extent_buffer *parent, int parent_slot,
1536                    struct extent_buffer **cow_ret)
1537{
1538        u64 search_start;
1539        int ret;
1540
1541        if (trans->transaction != root->fs_info->running_transaction)
1542                WARN(1, KERN_CRIT "trans %llu running %llu\n",
1543                       trans->transid,
1544                       root->fs_info->running_transaction->transid);
1545
1546        if (trans->transid != root->fs_info->generation)
1547                WARN(1, KERN_CRIT "trans %llu running %llu\n",
1548                       trans->transid, root->fs_info->generation);
1549
1550        if (!should_cow_block(trans, root, buf)) {
1551                *cow_ret = buf;
1552                return 0;
1553        }
1554
1555        search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
1556
1557        if (parent)
1558                btrfs_set_lock_blocking(parent);
1559        btrfs_set_lock_blocking(buf);
1560
1561        ret = __btrfs_cow_block(trans, root, buf, parent,
1562                                 parent_slot, cow_ret, search_start, 0);
1563
1564        trace_btrfs_cow_block(root, buf, *cow_ret);
1565
1566        return ret;
1567}
1568
1569/*
1570 * helper function for defrag to decide if two blocks pointed to by a
1571 * node are actually close by
1572 */
1573static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1574{
1575        if (blocknr < other && other - (blocknr + blocksize) < 32768)
1576                return 1;
1577        if (blocknr > other && blocknr - (other + blocksize) < 32768)
1578                return 1;
1579        return 0;
1580}
1581
1582/*
1583 * compare two keys in a memcmp fashion
1584 */
1585static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
1586{
1587        struct btrfs_key k1;
1588
1589        btrfs_disk_key_to_cpu(&k1, disk);
1590
1591        return btrfs_comp_cpu_keys(&k1, k2);
1592}
1593
1594/*
1595 * same as comp_keys only with two btrfs_key's
1596 */
1597int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
1598{
1599        if (k1->objectid > k2->objectid)
1600                return 1;
1601        if (k1->objectid < k2->objectid)
1602                return -1;
1603        if (k1->type > k2->type)
1604                return 1;
1605        if (k1->type < k2->type)
1606                return -1;
1607        if (k1->offset > k2->offset)
1608                return 1;
1609        if (k1->offset < k2->offset)
1610                return -1;
1611        return 0;
1612}
1613
1614/*
1615 * this is used by the defrag code to go through all the
1616 * leaves pointed to by a node and reallocate them so that
1617 * disk order is close to key order
1618 */
1619int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1620                       struct btrfs_root *root, struct extent_buffer *parent,
1621                       int start_slot, u64 *last_ret,
1622                       struct btrfs_key *progress)
1623{
1624        struct extent_buffer *cur;
1625        u64 blocknr;
1626        u64 gen;
1627        u64 search_start = *last_ret;
1628        u64 last_block = 0;
1629        u64 other;
1630        u32 parent_nritems;
1631        int end_slot;
1632        int i;
1633        int err = 0;
1634        int parent_level;
1635        int uptodate;
1636        u32 blocksize;
1637        int progress_passed = 0;
1638        struct btrfs_disk_key disk_key;
1639
1640        parent_level = btrfs_header_level(parent);
1641
1642        WARN_ON(trans->transaction != root->fs_info->running_transaction);
1643        WARN_ON(trans->transid != root->fs_info->generation);
1644
1645        parent_nritems = btrfs_header_nritems(parent);
1646        blocksize = root->nodesize;
1647        end_slot = parent_nritems - 1;
1648
1649        if (parent_nritems <= 1)
1650                return 0;
1651
1652        btrfs_set_lock_blocking(parent);
1653
1654        for (i = start_slot; i <= end_slot; i++) {
1655                int close = 1;
1656
1657                btrfs_node_key(parent, &disk_key, i);
1658                if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1659                        continue;
1660
1661                progress_passed = 1;
1662                blocknr = btrfs_node_blockptr(parent, i);
1663                gen = btrfs_node_ptr_generation(parent, i);
1664                if (last_block == 0)
1665                        last_block = blocknr;
1666
1667                if (i > 0) {
1668                        other = btrfs_node_blockptr(parent, i - 1);
1669                        close = close_blocks(blocknr, other, blocksize);
1670                }
1671                if (!close && i < end_slot) {
1672                        other = btrfs_node_blockptr(parent, i + 1);
1673                        close = close_blocks(blocknr, other, blocksize);
1674                }
1675                if (close) {
1676                        last_block = blocknr;
1677                        continue;
1678                }
1679
1680                cur = btrfs_find_tree_block(root->fs_info, blocknr);
1681                if (cur)
1682                        uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1683                else
1684                        uptodate = 0;
1685                if (!cur || !uptodate) {
1686                        if (!cur) {
1687                                cur = read_tree_block(root, blocknr, gen);
1688                                if (!cur || !extent_buffer_uptodate(cur)) {
1689                                        free_extent_buffer(cur);
1690                                        return -EIO;
1691                                }
1692                        } else if (!uptodate) {
1693                                err = btrfs_read_buffer(cur, gen);
1694                                if (err) {
1695                                        free_extent_buffer(cur);
1696                                        return err;
1697                                }
1698                        }
1699                }
1700                if (search_start == 0)
1701                        search_start = last_block;
1702
1703                btrfs_tree_lock(cur);
1704                btrfs_set_lock_blocking(cur);
1705                err = __btrfs_cow_block(trans, root, cur, parent, i,
1706                                        &cur, search_start,
1707                                        min(16 * blocksize,
1708                                            (end_slot - i) * blocksize));
1709                if (err) {
1710                        btrfs_tree_unlock(cur);
1711                        free_extent_buffer(cur);
1712                        break;
1713                }
1714                search_start = cur->start;
1715                last_block = cur->start;
1716                *last_ret = search_start;
1717                btrfs_tree_unlock(cur);
1718                free_extent_buffer(cur);
1719        }
1720        return err;
1721}
1722
1723/*
1724 * The leaf data grows from end-to-front in the node.
1725 * this returns the address of the start of the last item,
1726 * which is the stop of the leaf data stack
1727 */
1728static inline unsigned int leaf_data_end(struct btrfs_root *root,
1729                                         struct extent_buffer *leaf)
1730{
1731        u32 nr = btrfs_header_nritems(leaf);
1732        if (nr == 0)
1733                return BTRFS_LEAF_DATA_SIZE(root);
1734        return btrfs_item_offset_nr(leaf, nr - 1);
1735}
1736
1737
1738/*
1739 * search for key in the extent_buffer.  The items start at offset p,
1740 * and they are item_size apart.  There are 'max' items in p.
1741 *
1742 * the slot in the array is returned via slot, and it points to
1743 * the place where you would insert key if it is not found in
1744 * the array.
1745 *
1746 * slot may point to max if the key is bigger than all of the keys
1747 */
1748static noinline int generic_bin_search(struct extent_buffer *eb,
1749                                       unsigned long p,
1750                                       int item_size, struct btrfs_key *key,
1751                                       int max, int *slot)
1752{
1753        int low = 0;
1754        int high = max;
1755        int mid;
1756        int ret;
1757        struct btrfs_disk_key *tmp = NULL;
1758        struct btrfs_disk_key unaligned;
1759        unsigned long offset;
1760        char *kaddr = NULL;
1761        unsigned long map_start = 0;
1762        unsigned long map_len = 0;
1763        int err;
1764
1765        while (low < high) {
1766                mid = (low + high) / 2;
1767                offset = p + mid * item_size;
1768
1769                if (!kaddr || offset < map_start ||
1770                    (offset + sizeof(struct btrfs_disk_key)) >
1771                    map_start + map_len) {
1772
1773                        err = map_private_extent_buffer(eb, offset,
1774                                                sizeof(struct btrfs_disk_key),
1775                                                &kaddr, &map_start, &map_len);
1776
1777                        if (!err) {
1778                                tmp = (struct btrfs_disk_key *)(kaddr + offset -
1779                                                        map_start);
1780                        } else {
1781                                read_extent_buffer(eb, &unaligned,
1782                                                   offset, sizeof(unaligned));
1783                                tmp = &unaligned;
1784                        }
1785
1786                } else {
1787                        tmp = (struct btrfs_disk_key *)(kaddr + offset -
1788                                                        map_start);
1789                }
1790                ret = comp_keys(tmp, key);
1791
1792                if (ret < 0)
1793                        low = mid + 1;
1794                else if (ret > 0)
1795                        high = mid;
1796                else {
1797                        *slot = mid;
1798                        return 0;
1799                }
1800        }
1801        *slot = low;
1802        return 1;
1803}
1804
1805/*
1806 * simple bin_search frontend that does the right thing for
1807 * leaves vs nodes
1808 */
1809static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1810                      int level, int *slot)
1811{
1812        if (level == 0)
1813                return generic_bin_search(eb,
1814                                          offsetof(struct btrfs_leaf, items),
1815                                          sizeof(struct btrfs_item),
1816                                          key, btrfs_header_nritems(eb),
1817                                          slot);
1818        else
1819                return generic_bin_search(eb,
1820                                          offsetof(struct btrfs_node, ptrs),
1821                                          sizeof(struct btrfs_key_ptr),
1822                                          key, btrfs_header_nritems(eb),
1823                                          slot);
1824}
1825
/*
 * Public wrapper around bin_search(): look up @key in @eb at @level.
 * Returns 0 with *slot at the match when found, 1 with *slot at the
 * insertion point otherwise.
 */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		     int level, int *slot)
{
	return bin_search(eb, key, level, slot);
}
1831
/* add @size bytes to the root item's used-bytes accounting */
static void root_add_used(struct btrfs_root *root, u32 size)
{
	/* accounting_lock serializes the read-modify-write of root_item */
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}
1839
/* subtract @size bytes from the root item's used-bytes accounting */
static void root_sub_used(struct btrfs_root *root, u32 size)
{
	/* accounting_lock serializes the read-modify-write of root_item */
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}
1847
1848/* given a node and slot number, this reads the blocks it points to.  The
1849 * extent buffer is returned with a reference taken (but unlocked).
1850 * NULL is returned on error.
1851 */
1852static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1853                                   struct extent_buffer *parent, int slot)
1854{
1855        int level = btrfs_header_level(parent);
1856        struct extent_buffer *eb;
1857
1858        if (slot < 0)
1859                return NULL;
1860        if (slot >= btrfs_header_nritems(parent))
1861                return NULL;
1862
1863        BUG_ON(level == 0);
1864
1865        eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
1866                             btrfs_node_ptr_generation(parent, slot));
1867        if (eb && !extent_buffer_uptodate(eb)) {
1868                free_extent_buffer(eb);
1869                eb = NULL;
1870        }
1871
1872        return eb;
1873}
1874
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave an node completely empty later on.
 *
 * On entry path->nodes[level] must be write-locked and already COWed
 * in this transaction (checked by the WARN_ONs below).  Returns 0 on
 * success or a negative errno; the path may be repointed at a sibling.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 0;

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	/* remembered so we can sanity-check the path at the end */
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = read_node_slot(root, mid, 0);
		if (!child) {
			/* a one-item root must have a readable child */
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		tree_mod_log_set_root_pointer(root, child, 1);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		clean_tree_block(trans, root->fs_info, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		/*
		 * mid is still referenced below; presumably another ref
		 * (dropped by free_extent_buffer_stale) keeps it alive —
		 * NOTE(review): confirm against extent buffer refcounting
		 */
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	/* nothing to do while mid is more than a quarter full */
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
		return 0;

	/* lock and COW both siblings so we can shift items around */
	left = read_node_slot(root, parent, pslot - 1);
	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}
	right = read_node_slot(root, parent, pslot + 1);
	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		/* orig_slot is now relative to left + mid combined */
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, root, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, root, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			/* right is empty; delete it from the tree */
			clean_tree_block(trans, root->fs_info, right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			/* items moved; fix right's key in the parent */
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  pslot + 1, 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			goto enospc;
		}
		wret = balance_node_right(trans, root, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			/* right-balance had no room; push left instead */
			wret = push_node_left(trans, root, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		/* mid is now empty; remove it entirely */
		clean_tree_block(trans, root->fs_info, mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		tree_mod_log_set_node_key(root->fs_info, parent,
					  pslot, 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			/* the target slot migrated into left */
			extent_buffer_get(left);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			/* target stayed in mid; rebase the slot index */
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	/* shared cleanup path for all errors, not just ENOSPC */
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
2093
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 *
 * Returns 0 if room was made (path repointed at the block that now
 * holds the target slot), 1 if nothing could be pushed, or a negative
 * errno from push_node_left/balance_node_right.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/* at the root there are no siblings to push into */
	if (!parent)
		return 1;

	left = read_node_slot(root, parent, pslot - 1);

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			/* left is (nearly) full too; can't push */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, root,
						      left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* items moved; fix mid's first key in the parent */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  pslot, 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				/* target slot migrated into left */
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				/* target stayed in mid; rebase the slot */
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = read_node_slot(root, parent, pslot + 1);

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			/* right is (nearly) full too; can't push */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, root,
							  right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* items moved; fix right's first key in the parent */
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			tree_mod_log_set_node_key(root->fs_info, parent,
						  pslot + 1, 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				/* target slot migrated into right */
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}
2229
2230/*
2231 * readahead one full node of leaves, finding things that are close
2232 * to the block in 'slot', and triggering ra on them.
2233 */
2234static void reada_for_search(struct btrfs_root *root,
2235                             struct btrfs_path *path,
2236                             int level, int slot, u64 objectid)
2237{
2238        struct extent_buffer *node;
2239        struct btrfs_disk_key disk_key;
2240        u32 nritems;
2241        u64 search;
2242        u64 target;
2243        u64 nread = 0;
2244        u64 gen;
2245        int direction = path->reada;
2246        struct extent_buffer *eb;
2247        u32 nr;
2248        u32 blocksize;
2249        u32 nscan = 0;
2250
2251        if (level != 1)
2252                return;
2253
2254        if (!path->nodes[level])
2255                return;
2256
2257        node = path->nodes[level];
2258
2259        search = btrfs_node_blockptr(node, slot);
2260        blocksize = root->nodesize;
2261        eb = btrfs_find_tree_block(root->fs_info, search);
2262        if (eb) {
2263                free_extent_buffer(eb);
2264                return;
2265        }
2266
2267        target = search;
2268
2269        nritems = btrfs_header_nritems(node);
2270        nr = slot;
2271
2272        while (1) {
2273                if (direction < 0) {
2274                        if (nr == 0)
2275                                break;
2276                        nr--;
2277                } else if (direction > 0) {
2278                        nr++;
2279                        if (nr >= nritems)
2280                                break;
2281                }
2282                if (path->reada < 0 && objectid) {
2283                        btrfs_node_key(node, &disk_key, nr);
2284                        if (btrfs_disk_key_objectid(&disk_key) != objectid)
2285                                break;
2286                }
2287                search = btrfs_node_blockptr(node, nr);
2288                if ((search <= target && target - search <= 65536) ||
2289                    (search > target && search - target <= 65536)) {
2290                        gen = btrfs_node_ptr_generation(node, nr);
2291                        readahead_tree_block(root, search);
2292                        nread += blocksize;
2293                }
2294                nscan++;
2295                if ((nread > 65536 || nscan > 32))
2296                        break;
2297        }
2298}
2299
2300static noinline void reada_for_balance(struct btrfs_root *root,
2301                                       struct btrfs_path *path, int level)
2302{
2303        int slot;
2304        int nritems;
2305        struct extent_buffer *parent;
2306        struct extent_buffer *eb;
2307        u64 gen;
2308        u64 block1 = 0;
2309        u64 block2 = 0;
2310
2311        parent = path->nodes[level + 1];
2312        if (!parent)
2313                return;
2314
2315        nritems = btrfs_header_nritems(parent);
2316        slot = path->slots[level + 1];
2317
2318        if (slot > 0) {
2319                block1 = btrfs_node_blockptr(parent, slot - 1);
2320                gen = btrfs_node_ptr_generation(parent, slot - 1);
2321                eb = btrfs_find_tree_block(root->fs_info, block1);
2322                /*
2323                 * if we get -eagain from btrfs_buffer_uptodate, we
2324                 * don't want to return eagain here.  That will loop
2325                 * forever
2326                 */
2327                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2328                        block1 = 0;
2329                free_extent_buffer(eb);
2330        }
2331        if (slot + 1 < nritems) {
2332                block2 = btrfs_node_blockptr(parent, slot + 1);
2333                gen = btrfs_node_ptr_generation(parent, slot + 1);
2334                eb = btrfs_find_tree_block(root->fs_info, block2);
2335                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2336                        block2 = 0;
2337                free_extent_buffer(eb);
2338        }
2339
2340        if (block1)
2341                readahead_tree_block(root, block1);
2342        if (block2)
2343                readahead_tree_block(root, block2);
2344}
2345
2346
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	/* levels <= skip_level must stay locked */
	int skip_level = level;
	/* once set, stop raising skip_level for higher slots */
	int no_skips = 0;
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		/* slot 0: parent key may still need updating, keep it */
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		/* keep_locks: also hold the lock on last-slot levels */
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			/* lower the write-lock watermark past what we freed */
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
2402
2403/*
2404 * This releases any locks held in the path starting at level and
2405 * going all the way up to the root.
2406 *
2407 * btrfs_search_slot will keep the lock held on higher nodes in a few
2408 * corner cases, such as COW of the block at slot zero in the node.  This
2409 * ignores those rules, and it should only be called when there are no
2410 * more updates to be done higher up in the tree.
2411 */
2412noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2413{
2414        int i;
2415
2416        if (path->keep_locks)
2417                return;
2418
2419        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2420                if (!path->nodes[i])
2421                        continue;
2422                if (!path->locks[i])
2423                        continue;
2424                btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2425                path->locks[i] = 0;
2426        }
2427}
2428
2429/*
2430 * helper function for btrfs_search_slot.  The goal is to find a block
2431 * in cache without setting the path to blocking.  If we find the block
2432 * we return zero and the path is unchanged.
2433 *
2434 * If we can't find the block, we set the path blocking and do some
2435 * reada.  -EAGAIN is returned and the search must be repeated.
2436 */
2437static int
2438read_block_for_search(struct btrfs_trans_handle *trans,
2439                       struct btrfs_root *root, struct btrfs_path *p,
2440                       struct extent_buffer **eb_ret, int level, int slot,
2441                       struct btrfs_key *key, u64 time_seq)
2442{
2443        u64 blocknr;
2444        u64 gen;
2445        struct extent_buffer *b = *eb_ret;
2446        struct extent_buffer *tmp;
2447        int ret;
2448
2449        blocknr = btrfs_node_blockptr(b, slot);
2450        gen = btrfs_node_ptr_generation(b, slot);
2451
2452        tmp = btrfs_find_tree_block(root->fs_info, blocknr);
2453        if (tmp) {
2454                /* first we do an atomic uptodate check */
2455                if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2456                        *eb_ret = tmp;
2457                        return 0;
2458                }
2459
2460                /* the pages were up to date, but we failed
2461                 * the generation number check.  Do a full
2462                 * read for the generation number that is correct.
2463                 * We must do this without dropping locks so
2464                 * we can trust our generation number
2465                 */
2466                btrfs_set_path_blocking(p);
2467
2468                /* now we're allowed to do a blocking uptodate check */
2469                ret = btrfs_read_buffer(tmp, gen);
2470                if (!ret) {
2471                        *eb_ret = tmp;
2472                        return 0;
2473                }
2474                free_extent_buffer(tmp);
2475                btrfs_release_path(p);
2476                return -EIO;
2477        }
2478
2479        /*
2480         * reduce lock contention at high levels
2481         * of the btree by dropping locks before
2482         * we read.  Don't release the lock on the current
2483         * level because we need to walk this node to figure
2484         * out which blocks to read.
2485         */
2486        btrfs_unlock_up_safe(p, level + 1);
2487        btrfs_set_path_blocking(p);
2488
2489        free_extent_buffer(tmp);
2490        if (p->reada)
2491                reada_for_search(root, p, level, slot, key->objectid);
2492
2493        btrfs_release_path(p);
2494
2495        ret = -EAGAIN;
2496        tmp = read_tree_block(root, blocknr, 0);
2497        if (tmp) {
2498                /*
2499                 * If the read above didn't mark this buffer up to date,
2500                 * it will never end up being up to date.  Set ret to EIO now
2501                 * and give up so that our caller doesn't loop forever
2502                 * on our EAGAINs.
2503                 */
2504                if (!btrfs_buffer_uptodate(tmp, 0, 0))
2505                        ret = -EIO;
2506                free_extent_buffer(tmp);
2507        }
2508        return ret;
2509}
2510
2511/*
2512 * helper function for btrfs_search_slot.  This does all of the checks
2513 * for node-level blocks and does any balancing required based on
2514 * the ins_len.
2515 *
2516 * If no extra work was required, zero is returned.  If we had to
2517 * drop the path, -EAGAIN is returned and btrfs_search_slot must
2518 * start over
2519 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	int ret;
	/* inserting (or splitting for split_item): split any nearly-full node */
	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
		int sret;

		/*
		 * splitting modifies the parent too, so we need write locks
		 * up through level + 1; drop the path and restart the search
		 * with the stronger lock requirement
		 */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(root, p, level);
		sret = split_node(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		/* split_node returns 0 or a negative error, never > 0 */
		BUG_ON(sret > 0);
		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
		/* deleting: merge/balance a less-than-half-full node */
		int sret;

		/* balancing also touches the parent; same restart dance */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(root, p, level);
		sret = balance_level(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
		/* balance_level may have freed our node entirely; restart */
		if (!b) {
			btrfs_release_path(p);
			goto again;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return 0;

again:
	ret = -EAGAIN;
done:
	return ret;
}
2581
/*
 * Debug-only sanity check: verify that 'key' really equals the first key
 * stored in 'b'.  Compiles to nothing unless CONFIG_BTRFS_ASSERT is set.
 */
static void key_search_validate(struct extent_buffer *b,
				struct btrfs_key *key,
				int level)
{
#ifdef CONFIG_BTRFS_ASSERT
	struct btrfs_disk_key disk_key;
	unsigned long key_off;

	btrfs_cpu_key_to_disk(&disk_key, key);

	/* slot 0's key sits at different offsets in leaves vs. nodes */
	if (level == 0)
		key_off = offsetof(struct btrfs_leaf, items[0].key);
	else
		key_off = offsetof(struct btrfs_node, ptrs[0].key);

	ASSERT(!memcmp_extent_buffer(b, &disk_key, key_off, sizeof(disk_key)));
#endif
}
2601
/*
 * Find the slot for 'key' in 'b'.  Once an exact match was seen at some
 * level (*prev_cmp == 0), every deeper level is pinned to slot 0 and the
 * binary search can be skipped entirely.
 */
static int key_search(struct extent_buffer *b, struct btrfs_key *key,
		      int level, int *prev_cmp, int *slot)
{
	if (*prev_cmp == 0) {
		/* cached exact match: verify (debug builds) and take slot 0 */
		key_search_validate(b, key, level);
		*slot = 0;
		return 0;
	}

	*prev_cmp = bin_search(b, key, level, slot);
	return *prev_cmp;
}
2615
/*
 * Look up the first item of the given type/objectid whose offset is at
 * least ioff, and copy its cpu key into *found_key.
 *
 * Returns 0 if such an item exists, 1 if none was found (the offset is
 * deliberately not compared, so ioff acts only as a search lower bound),
 * or < 0 on error.
 */
int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		u64 iobjectid, u64 ioff, u8 key_type,
		struct btrfs_key *found_key)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *eb;

	ASSERT(path);
	ASSERT(found_key);

	key.type = key_type;
	key.objectid = iobjectid;
	key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	eb = path->nodes[0];
	/*
	 * an inexact match may leave the path one past the last slot of
	 * this leaf; the candidate item, if any, is first in the next leaf
	 */
	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		eb = path->nodes[0];
	}

	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
	/* only type and objectid must match; offset is a lower bound */
	if (found_key->type != key.type ||
			found_key->objectid != key.objectid)
		return 1;

	return 0;
}
2650
2651/*
2652 * look for key in the tree.  path is filled in with nodes along the way
2653 * if key is found, we return zero and you can find the item in the leaf
2654 * level of the path (level 0)
2655 *
2656 * If the key isn't found, the path points to the slot where it should
2657 * be inserted, and 1 is returned.  If there are other errors during the
2658 * search a negative error number is returned.
2659 *
2660 * if ins_len > 0, nodes and leaves will be split as we walk down the
2661 * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2662 * possible)
2663 */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *key, struct btrfs_path *p, int
		      ins_len, int cow)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	int root_lock;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	/* any modification (ins_len != 0) requires COW of the touched blocks */
	BUG_ON(!cow && ins_len);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers.  Make sure we keep write
		 * locks for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	if (!cow)
		write_lock_level = -1;

	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

again:
	prev_cmp = -1;
	/*
	 * we try very hard to do read locks on the root
	 */
	root_lock = BTRFS_READ_LOCK;
	level = 0;
	if (p->search_commit_root) {
		/*
		 * the commit roots are read only
		 * so we always do read locks
		 */
		if (p->need_commit_sem)
			down_read(&root->fs_info->commit_root_sem);
		b = root->commit_root;
		extent_buffer_get(b);
		level = btrfs_header_level(b);
		if (p->need_commit_sem)
			up_read(&root->fs_info->commit_root_sem);
		if (!p->skip_locking)
			btrfs_tree_read_lock(b);
	} else {
		if (p->skip_locking) {
			b = btrfs_root_node(root);
			level = btrfs_header_level(b);
		} else {
			/* we don't know the level of the root node
			 * until we actually have it read locked
			 */
			b = btrfs_read_lock_root_node(root);
			level = btrfs_header_level(b);
			if (level <= write_lock_level) {
				/* whoops, must trade for write lock */
				btrfs_tree_read_unlock(b);
				free_extent_buffer(b);
				b = btrfs_lock_root_node(root);
				root_lock = BTRFS_WRITE_LOCK;

				/* the level might have changed, check again */
				level = btrfs_header_level(b);
			}
		}
	}
	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;

	while (b) {
		level = btrfs_header_level(b);

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		if (cow) {
			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b))
				goto cow_done;

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			btrfs_set_path_blocking(p);
			err = btrfs_cow_block(trans, root, b,
					      p->nodes[level + 1],
					      p->slots[level + 1], &b);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		ret = key_search(b, key, level, &prev_cmp, &slot);

		if (level != 0) {
			int dec = 0;
			/* no exact match: step back to the slot covering key */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			err = setup_nodes_for_search(trans, root, p, b, level,
					     ins_len, &write_lock_level);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}
			/* splitting/balancing may have moved us */
			b = p->nodes[level];
			slot = p->slots[level];

			/*
			 * slot 0 is special, if we change the key
			 * we have to update the parent pointer
			 * which means we must have a write lock
			 * on the parent
			 */
			if (slot == 0 && ins_len &&
			    write_lock_level < level + 1) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			unlock_up(p, level, lowest_unlock,
				  min_write_lock_level, &write_lock_level);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(trans, root, p,
						    &b, level, slot, key, 0);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			if (!p->skip_locking) {
				level = btrfs_header_level(b);
				if (level <= write_lock_level) {
					/* try the fast non-blocking lock first */
					err = btrfs_try_tree_write_lock(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_WRITE_LOCK);
					}
					p->locks[level] = BTRFS_WRITE_LOCK;
				} else {
					err = btrfs_tree_read_lock_atomic(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_read_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_READ_LOCK);
					}
					p->locks[level] = BTRFS_READ_LOCK;
				}
				p->nodes[level] = b;
			}
		} else {
			/* level 0: we've reached the leaf */
			p->slots[level] = slot;
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(root, b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);
				btrfs_clear_path_blocking(p, NULL, 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, &write_lock_level);
			goto done;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);
	return ret;
}
2932
2933/*
2934 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2935 * current state of the tree together with the operations recorded in the tree
2936 * modification log to search for the key in a previous version of this tree, as
2937 * denoted by the time_seq parameter.
2938 *
2939 * Naturally, there is no support for insert, delete or cow operations.
2940 *
2941 * The resulting path and return value will be set up as if we called
2942 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2943 */
int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;
	int prev_cmp = -1;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	/* the commit root is already a frozen snapshot; no rewinding needed */
	if (p->search_commit_root) {
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = get_old_root(root, time_seq);
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		level = btrfs_header_level(b);
		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		/*
		 * Since we can unwind eb's we want to do a real search every
		 * time.
		 */
		prev_cmp = -1;
		ret = key_search(b, key, level, &prev_cmp, &slot);

		if (level != 0) {
			int dec = 0;
			/* no exact match: step back to the slot covering key */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(NULL, root, p, &b, level,
						    slot, key, time_seq);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			level = btrfs_header_level(b);
			err = btrfs_tree_read_lock_atomic(b);
			if (!err) {
				btrfs_set_path_blocking(p);
				btrfs_tree_read_lock(b);
				btrfs_clear_path_blocking(p, b,
							  BTRFS_READ_LOCK);
			}
			/* replay the tree mod log to see b as of time_seq */
			b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
			if (!b) {
				ret = -ENOMEM;
				goto done;
			}
			p->locks[level] = BTRFS_READ_LOCK;
			p->nodes[level] = b;
		} else {
			/* leaf reached: record the slot and stop */
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}
	}
	ret = 1;
done:
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}
3043
3044/*
3045 * helper to use instead of search slot if no exact match is needed but
3046 * instead the next or previous item should be returned.
3047 * When find_higher is true, the next higher item is returned, the next lower
3048 * otherwise.
3049 * When return_any and find_higher are both true, and no higher item is found,
3050 * return the next lower instead.
3051 * When return_any is true and find_higher is false, and no lower item is found,
3052 * return the next higher instead.
3053 * It returns 0 if any item is found, 1 if none is found (tree empty), and
3054 * < 0 on error
3055 */
int btrfs_search_slot_for_read(struct btrfs_root *root,
			       struct btrfs_key *key, struct btrfs_path *p,
			       int find_higher, int return_any)
{
	int ret;
	struct extent_buffer *leaf;

again:
	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
	/* exact match (0) or error (< 0): nothing more to adjust */
	if (ret <= 0)
		return ret;
	/*
	 * a return value of 1 means the path is at the position where the
	 * item should be inserted. Normally this is the next bigger item,
	 * but in case the previous item is the last in a leaf, path points
	 * to the first free slot in the previous leaf, i.e. at an invalid
	 * item.
	 */
	leaf = p->nodes[0];

	if (find_higher) {
		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, p);
			if (ret <= 0)
				return ret;
			if (!return_any)
				return 1;
			/*
			 * no higher item found, return the next
			 * lower instead
			 */
			return_any = 0;
			find_higher = 0;
			btrfs_release_path(p);
			goto again;
		}
	} else {
		if (p->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, p);
			if (ret < 0)
				return ret;
			if (!ret) {
				leaf = p->nodes[0];
				/*
				 * prev_leaf may leave us on the first free
				 * slot; step back onto the last valid item
				 */
				if (p->slots[0] == btrfs_header_nritems(leaf))
					p->slots[0]--;
				return 0;
			}
			if (!return_any)
				return 1;
			/*
			 * no lower item found, return the next
			 * higher instead
			 */
			return_any = 0;
			find_higher = 1;
			btrfs_release_path(p);
			goto again;
		} else {
			/* step back to the next lower item in this leaf */
			--p->slots[0];
		}
	}
	return 0;
}
3119
3120/*
3121 * adjust the pointers going up the tree, starting at level
3122 * making sure the right key of each node points to 'key'.
3123 * This is used after shifting pointers to the left, so it stops
3124 * fixing up pointers when a given leaf/node is not in slot 0 of the
3125 * higher levels
3126 *
3127 */
3128static void fixup_low_keys(struct btrfs_fs_info *fs_info,
3129                           struct btrfs_path *path,
3130                           struct btrfs_disk_key *key, int level)
3131{
3132        int i;
3133        struct extent_buffer *t;
3134
3135        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3136                int tslot = path->slots[i];
3137                if (!path->nodes[i])
3138                        break;
3139                t = path->nodes[i];
3140                tree_mod_log_set_node_key(fs_info, t, tslot, 1);
3141                btrfs_set_node_key(t, key, tslot);
3142                btrfs_mark_buffer_dirty(path->nodes[i]);
3143                if (tslot != 0)
3144                        break;
3145        }
3146}
3147
3148/*
3149 * update item key.
3150 *
3151 * This function isn't completely safe. It's the caller's responsibility
3152 * that the new key won't break the order
3153 */
void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     struct btrfs_key *new_key)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *eb;
	int slot;

	eb = path->nodes[0];
	slot = path->slots[0];
	/* the new key must still sort strictly between its neighbours */
	if (slot > 0) {
		btrfs_item_key(eb, &disk_key, slot - 1);
		BUG_ON(comp_keys(&disk_key, new_key) >= 0);
	}
	if (slot < btrfs_header_nritems(eb) - 1) {
		btrfs_item_key(eb, &disk_key, slot + 1);
		BUG_ON(comp_keys(&disk_key, new_key) <= 0);
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(eb, &disk_key, slot);
	btrfs_mark_buffer_dirty(eb);
	/* slot 0's key is duplicated in the parent nodes; fix those up too */
	if (slot == 0)
		fixup_low_keys(fs_info, path, &disk_key, 1);
}
3179
/*
 * try to push data from one node into the next node left in the
 * tree.
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
 */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* free pointer slots available in the destination (left) node */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
	/* both buffers must already be CoWed in this transaction */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	/* unless we're emptying src, don't bother rebalancing tiny nodes */
	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/*
	 * record the pointer moves in the tree mod log before touching the
	 * buffers so mod-log readers observe a consistent history
	 */
	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
				   push_items);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}
	/* append the first push_items ptrs of src after dst's existing ones */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * don't call tree_mod_log_eb_move here, key removal was already
		 * fully logged by tree_mod_log_eb_copy above.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3251
/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will  only push up to 1/2 the contents of the left node over
 */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	/* both buffers must already be CoWed in this transaction */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* free pointer slots available in the destination (right) node */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
	if (push_items <= 0)
		return 1;

	/* don't bother rebalancing tiny nodes */
	if (src_nritems < 4)
		return 1;

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* shift dst's existing ptrs right to open up push_items slots at 0 */
	tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));

	/* log, then copy the tail of src into the hole at the front of dst */
	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
				   src_nritems - push_items, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3317
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
 *
 * returns zero on success or < 0 on failure.
 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;

	/* new level must be empty and sit directly above the current root */
	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/* the first key of the old root becomes the one key of the new root */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				   &lower_key, level, root->node->start, 0);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, root->nodesize);

	/* initialize the header of the freshly allocated root block */
	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_nritems(c, 1);
	btrfs_set_header_level(c, level);
	btrfs_set_header_bytenr(c, c->start);
	btrfs_set_header_generation(c, trans->transid);
	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(c, root->root_key.objectid);

	write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
			    BTRFS_FSID_SIZE);

	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
			    btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE);

	/* the single pointer in the new root points at the old root */
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	/* the old root should have been CoWed in this transaction already */
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	/* publish the new root; readers follow root->node under RCU */
	old = root->node;
	tree_mod_log_set_root_pointer(root, c, 0);
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* the path takes its own reference to c at the new top level */
	extent_buffer_get(c);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
	path->slots[level] = 0;
	return 0;
}
3388
/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
 *
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
 */
static void insert_ptr(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *path,
		       struct btrfs_disk_key *key, u64 bytenr,
		       int slot, int level)
{
	struct extent_buffer *lower;
	int nritems;
	int ret;

	BUG_ON(!path->nodes[level]);
	btrfs_assert_tree_locked(path->nodes[level]);
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
	BUG_ON(slot > nritems);
	/* the caller must guarantee there is room for one more pointer */
	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
	if (slot != nritems) {
		/* shift the existing pointers right to open up 'slot' */
		if (level)
			tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
					     slot, nritems - slot);
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
	}
	if (level) {
		/* leaf (level 0) changes are not tracked in the mod log */
		ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
					      MOD_LOG_KEY_ADD, GFP_NOFS);
		BUG_ON(ret < 0);
	}
	btrfs_set_node_key(lower, key, slot);
	btrfs_set_node_blockptr(lower, slot, bytenr);
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
}
3432
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
 *
 * returns 0 on success and < 0 on failure
 */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, lets make a new one
		 *
		 * tree mod log: We don't log_removal old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with tree_mod_log_eb_copy. We're holding a
		 * tree lock on the buffer, which is why we cannot race with
		 * other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/* try to avoid the split by pushing into our neighbors */
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	/* split point: slots [mid, c_nritems) move to the new right block */
	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
			&disk_key, level, c->start, 0);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, root->nodesize);

	/* initialize the header of the new right sibling */
	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_level(split, btrfs_header_level(c));
	btrfs_set_header_bytenr(split, split->start);
	btrfs_set_header_generation(split, trans->transid);
	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(split, root->root_key.objectid);
	write_extent_buffer(split, root->fs_info->fsid,
			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
			    btrfs_header_chunk_tree_uuid(split),
			    BTRFS_UUID_SIZE);

	/* log the pointer moves before performing the actual copy */
	ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
				   mid, c_nritems - mid);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* link the new sibling into the parent, right after c */
	insert_ptr(trans, root, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	/* fix the path to reference whichever half now holds our slot */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}
3534
3535/*
3536 * how many bytes are required to store the items in a leaf.  start
3537 * and nr indicate which items in the leaf to check.  This totals up the
3538 * space used both by the item structs and the item data
3539 */
3540static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3541{
3542        struct btrfs_item *start_item;
3543        struct btrfs_item *end_item;
3544        struct btrfs_map_token token;
3545        int data_len;
3546        int nritems = btrfs_header_nritems(l);
3547        int end = min(nritems, start + nr) - 1;
3548
3549        if (!nr)
3550                return 0;
3551        btrfs_init_map_token(&token);
3552        start_item = btrfs_item_nr(start);
3553        end_item = btrfs_item_nr(end);
3554        data_len = btrfs_token_item_offset(l, start_item, &token) +
3555                btrfs_token_item_size(l, start_item, &token);
3556        data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3557        data_len += sizeof(struct btrfs_item) * nr;
3558        WARN_ON(data_len < 0);
3559        return data_len;
3560}
3561
3562/*
3563 * The space between the end of the leaf items and
3564 * the start of the leaf data.  IOW, how much room
3565 * the leaf has left for both items and data
3566 */
3567noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3568                                   struct extent_buffer *leaf)
3569{
3570        int nritems = btrfs_header_nritems(leaf);
3571        int ret;
3572        ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3573        if (ret < 0) {
3574                btrfs_crit(root->fs_info,
3575                        "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3576                       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3577                       leaf_space_used(leaf, 0, nritems), nritems);
3578        }
3579        return ret;
3580}
3581
/*
 * min slot controls the lowest index we're willing to push to the
 * right.  We'll push up to and including min_slot, but no lower
 */
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	btrfs_init_map_token(&token);

	/* unless emptying, never push below min_slot and always keep slot 0 */
	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/* insertion point past the last item: reserve room for the new item */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	i = left_nritems - 1;
	/* walk from the last item downward, counting how many fit in right */
	while (i >= nr) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, left);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* total data bytes occupied by the items being pushed */
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(root, left);

	/* make room in the right data area */
	data_end = leaf_data_end(root, right);
	memmove_extent_buffer(right,
			      btrfs_leaf_data(right) + data_end - push_space,
			      btrfs_leaf_data(right) + data_end,
			      BTRFS_LEAF_DATA_SIZE(root) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
		     btrfs_leaf_data(left) + leaf_data_end(root, left),
		     push_space);

	/* shift right's item headers to open room at the front */
	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(root);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);
		push_space -= btrfs_token_item_size(right, item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		clean_tree_block(trans, root->fs_info, left);

	btrfs_mark_buffer_dirty(right);

	/* right's first key changed, reflect that in the parent node */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root->fs_info, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3724
/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
 *
 * this will push starting from min_slot to the end of the leaf.  It won't
 * push any slot lower than min_slot
 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	/* no parent node means no sibling to push into */
	if (!path->nodes[1])
		return 1;

	slot = path->slots[1];
	upper = path->nodes[1];
	/* no right sibling under the same parent */
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	right = read_node_slot(root, upper, slot + 1);
	if (right == NULL)
		return 1;

	btrfs_tree_lock(right);
	btrfs_set_lock_blocking(right);

	free_space = btrfs_leaf_free_space(root, right);
	if (free_space < data_size)
		goto out_unlock;

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right);
	if (ret)
		goto out_unlock;

	free_space = btrfs_leaf_free_space(root, right);
	if (free_space < data_size)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

	if (path->slots[0] == left_nritems && !empty) {
		/* Key greater than all keys in the leaf, right neighbor has
		 * enough room for it and we're not emptying our leaf to delete
		 * it, therefore use right neighbor to insert the new item and
		 * no need to touch/dirty our left leaft. */
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->nodes[0] = right;
		path->slots[0] = 0;
		path->slots[1]++;
		return 0;
	}

	/* __push_leaf_right drops the lock and ref on 'right' in all cases */
	return __push_leaf_right(trans, root, path, min_data_size, empty,
				right, free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3803
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
 * items
 */
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	/* unless emptying, always keep at least one item in 'right' */
	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	/* walk from the first item upward, counting how many fit in left */
	for (i = 0; i < nr; i++) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, right);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	WARN_ON(!empty && push_items == btrfs_header_nritems(right));

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	/* total data bytes occupied by the items being pushed */
	push_space = BTRFS_LEAF_DATA_SIZE(root) -
		     btrfs_item_offset_nr(right, push_items - 1);

	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
		     leaf_data_end(root, left) - push_space,
		     btrfs_leaf_data(right) +
		     btrfs_item_offset_nr(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	/* rebase the offsets of the items that just arrived in left */
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(left, item, &token);
		btrfs_set_token_item_offset(left, item,
		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
		      &token);
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems)
		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);

	if (push_items < right_nritems) {
		/* slide the remaining data and item headers to the front */
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(root, right);
		memmove_extent_buffer(right, btrfs_leaf_data(right) +
				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
				      btrfs_leaf_data(right) +
				      leaf_data_end(root, right), push_space);

		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	/* recompute the data offsets of everything left in right */
	push_space = BTRFS_LEAF_DATA_SIZE(root);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);

		push_space = push_space - btrfs_token_item_size(right,
								item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		clean_tree_block(trans, root->fs_info, right);

	/* right's first key changed, fix the keys on the path above us */
	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(root->fs_info, path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
3956
3957/*
3958 * push some data in the path leaf to the left, trying to free up at
3959 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3960 *
3961 * max_slot can put a limit on how far into the leaf we'll push items.  The
3962 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3963 * items
3964 */
3965static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3966                          *root, struct btrfs_path *path, int min_data_size,
3967                          int data_size, int empty, u32 max_slot)
3968{
3969        struct extent_buffer *right = path->nodes[0];
3970        struct extent_buffer *left;
3971        int slot;
3972        int free_space;
3973        u32 right_nritems;
3974        int ret = 0;
3975
3976        slot = path->slots[1];
3977        if (slot == 0)
3978                return 1;
3979        if (!path->nodes[1])
3980                return 1;
3981
3982        right_nritems = btrfs_header_nritems(right);
3983        if (right_nritems == 0)
3984                return 1;
3985
3986        btrfs_assert_tree_locked(path->nodes[1]);
3987
3988        left = read_node_slot(root, path->nodes[1], slot - 1);
3989        if (left == NULL)
3990                return 1;
3991
3992        btrfs_tree_lock(left);
3993        btrfs_set_lock_blocking(left);
3994
3995        free_space = btrfs_leaf_free_space(root, left);
3996        if (free_space < data_size) {
3997                ret = 1;
3998                goto out;
3999        }
4000
4001        /* cow and double check */
4002        ret = btrfs_cow_block(trans, root, left,
4003                              path->nodes[1], slot - 1, &left);
4004        if (ret) {
4005                /* we hit -ENOSPC, but it isn't fatal here */
4006                if (ret == -ENOSPC)
4007                        ret = 1;
4008                goto out;
4009        }
4010
4011        free_space = btrfs_leaf_free_space(root, left);
4012        if (free_space < data_size) {
4013                ret = 1;
4014                goto out;
4015        }
4016
4017        return __push_leaf_left(trans, root, path, min_data_size,
4018                               empty, left, free_space, right_nritems,
4019                               max_slot);
4020out:
4021        btrfs_tree_unlock(left);
4022        free_extent_buffer(left);
4023        return ret;
4024}
4025
/*
 * Copy the items [mid, nritems) of leaf 'l' into the freshly allocated
 * leaf 'right', link 'right' into the parent, and fix up the path so it
 * points at whichever leaf now contains the target slot.
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	/* the right leaf receives items [mid, nritems) of 'l' */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);

	/* copy the item headers ... */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* ... and the item data, packed against the end of the new leaf */
	copy_extent_buffer(right, l,
		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
		     data_copy_size, btrfs_leaf_data(l) +
		     leaf_data_end(root, l), data_copy_size);

	/*
	 * the copied data moved towards the end of the new leaf by this
	 * many bytes; every item offset must be shifted by the same amount
	 */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
		      btrfs_item_end_nr(l, mid);

	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(i);
		u32 ioff;

		ioff = btrfs_token_item_offset(right, item, &token);
		btrfs_set_token_item_offset(right, item,
					    ioff + rt_data_off, &token);
	}

	btrfs_set_header_nritems(l, mid);
	/* link the new leaf into the parent node */
	btrfs_item_key(right, &disk_key, 0);
	insert_ptr(trans, root, path, &disk_key, right->start,
		   path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/* point the path at whichever leaf now holds the target slot */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}
4092
4093/*
4094 * double splits happen when we need to insert a big item in the middle
4095 * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4096 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4097 *          A                 B                 C
4098 *
4099 * We avoid this by trying to push the items on either side of our target
4100 * into the adjacent leaves.  If all goes well we can avoid the double split
4101 * completely.
4102 */
4103static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4104                                          struct btrfs_root *root,
4105                                          struct btrfs_path *path,
4106                                          int data_size)
4107{
4108        int ret;
4109        int progress = 0;
4110        int slot;
4111        u32 nritems;
4112        int space_needed = data_size;
4113
4114        slot = path->slots[0];
4115        if (slot < btrfs_header_nritems(path->nodes[0]))
4116                space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);
4117
4118        /*
4119         * try to push all the items after our slot into the
4120         * right leaf
4121         */
4122        ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4123        if (ret < 0)
4124                return ret;
4125
4126        if (ret == 0)
4127                progress++;
4128
4129        nritems = btrfs_header_nritems(path->nodes[0]);
4130        /*
4131         * our goal is to get our slot at the start or end of a leaf.  If
4132         * we've done so we're done
4133         */
4134        if (path->slots[0] == 0 || path->slots[0] == nritems)
4135                return 0;
4136
4137        if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4138                return 0;
4139
4140        /* try to push all the items before our slot into the next leaf */
4141        slot = path->slots[0];
4142        ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4143        if (ret < 0)
4144                return ret;
4145
4146        if (ret == 0)
4147                progress++;
4148
4149        if (progress)
4150                return 0;
4151        return 1;
4152}
4153
4154/*
4155 * split the path's leaf in two, making sure there is at least data_size
4156 * available for the resulting leaf level of the path.
4157 *
4158 * returns 0 if all went well and < 0 on failure.
4159 */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int wret;
	int split; /* 0: new leaf left empty, 1: normal split, 2: double split */
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	/* an extended item must still fit into an otherwise empty leaf */
	if (extend && data_size + btrfs_item_size_nr(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size && path->nodes[1]) {
		int space_needed = data_size;

		if (slot < btrfs_header_nritems(l))
			space_needed -= btrfs_leaf_free_space(root, l);

		wret = push_leaf_right(trans, root, path, space_needed,
				       space_needed, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			wret = push_leaf_left(trans, root, path, space_needed,
					      space_needed, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(root, l) >= data_size)
			return 0;
	}

	/* splitting the root leaf: grow the tree by one level first */
	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	/*
	 * pick the split point 'mid' so that the half which will hold the
	 * target slot has room for data_size bytes; fall back to splitting
	 * at the slot itself, to a double split, or to handing the insert
	 * an empty new leaf (split == 0)
	 */
	if (mid <= slot) {
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(root)) {
			if (slot >= nritems) {
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(root)) {
			if (!extend && data_size && slot == 0) {
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	}

	/* first key of the new leaf: the insert key, or the key at mid */
	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
			&disk_key, 0, l->start, 0);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, root->nodesize);

	/* initialize the header of the new (right) leaf */
	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_bytenr(right, right->start);
	btrfs_set_header_generation(right, trans->transid);
	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(right, root->root_key.objectid);
	btrfs_set_header_level(right, 0);
	write_extent_buffer(right, fs_info->fsid,
			    btrfs_header_fsid(), BTRFS_FSID_SIZE);

	write_extent_buffer(right, fs_info->chunk_tree_uuid,
			    btrfs_header_chunk_tree_uuid(right),
			    BTRFS_UUID_SIZE);

	if (split == 0) {
		/* no items move: the empty new leaf takes the insert */
		if (mid <= slot) {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, root, path, &disk_key, right->start,
				   path->slots[1] + 1, 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, root, path, &disk_key, right->start,
					  path->slots[1], 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			if (path->slots[1] == 0)
				fixup_low_keys(fs_info, path, &disk_key, 1);
		}
		btrfs_mark_buffer_dirty(right);
		return ret;
	}

	copy_for_split(trans, root, path, l, right, slot, mid, nritems);

	if (split == 2) {
		/* at most one extra split may ever be required */
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return 0;

push_for_double:
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
		return 0;
	goto again;
}
4325
/*
 * Make sure the leaf at path->slots[0] has room for ins_len more bytes,
 * splitting it if necessary.  The path may be released and re-searched.
 *
 * Returns 0 when there is room, -EAGAIN when the item moved or changed
 * while the path was dropped (the caller should retry), or another
 * negative errno on error.
 */
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path, int ins_len)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	u64 extent_len = 0;
	u32 item_size;
	int ret;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	/* only file extent and csum items are handled here */
	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
	       key.type != BTRFS_EXTENT_CSUM_KEY);

	if (btrfs_leaf_free_space(root, leaf) >= ins_len)
		return 0;

	/* remember enough about the item to detect changes after re-search */
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
	}
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->search_for_split = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	path->search_for_split = 0;
	/* ret > 0 means the item vanished while the path was dropped */
	if (ret > 0)
		ret = -EAGAIN;
	if (ret < 0)
		goto err;

	ret = -EAGAIN;
	leaf = path->nodes[0];
	/* if our item isn't there, return now */
	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
		goto err;

	/* the leaf has changed, it now has room.  return now */
	if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	btrfs_set_path_blocking(path);
	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}
4392
/*
 * Split the item at path->slots[0] in place: the first split_offset bytes
 * stay in the original item, the remainder moves to a new item keyed by
 * new_key, inserted right after the original in the same leaf.
 *
 * The leaf must already have room for one more struct btrfs_item header
 * (see setup_leaf_for_split()).  Returns 0 or -ENOMEM.
 */
static noinline int split_item(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path,
			       struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* stash a copy of the item data; it is rewritten below in two pieces */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(slot);

	/* the new item owns the tail: [split_offset, item_size) ... */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	/* ... and the original keeps the head: [0, split_offset) */
	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
	kfree(buf);
	return 0;
}
4463
4464/*
4465 * This function splits a single item into two items,
4466 * giving 'new_key' to the new item and splitting the
4467 * old one at split_offset (from the start of the item).
4468 *
4469 * The path may be released by this operation.  After
4470 * the split, the path is pointing to the old item.  The
4471 * new item is going to be in the same node as the old one.
4472 *
 * Note: the item being split must be small enough to live alone on
4474 * a tree block with room for one extra struct btrfs_item
4475 *
4476 * This allows us to split the item in place, keeping a lock on the
4477 * leaf the entire time.
4478 */
4479int btrfs_split_item(struct btrfs_trans_handle *trans,
4480                     struct btrfs_root *root,
4481                     struct btrfs_path *path,
4482                     struct btrfs_key *new_key,
4483                     unsigned long split_offset)
4484{
4485        int ret;
4486        ret = setup_leaf_for_split(trans, root, path,
4487                                   sizeof(struct btrfs_item));
4488        if (ret)
4489                return ret;
4490
4491        ret = split_item(trans, root, path, new_key, split_offset);
4492        return ret;
4493}
4494
4495/*
 * This function duplicates an item, giving 'new_key' to the new item.
4497 * It guarantees both items live in the same tree leaf and the new item
4498 * is contiguous with the original item.
4499 *
4500 * This allows us to split file extent in place, keeping a lock on the
4501 * leaf the entire time.
4502 */
4503int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4504                         struct btrfs_root *root,
4505                         struct btrfs_path *path,
4506                         struct btrfs_key *new_key)
4507{
4508        struct extent_buffer *leaf;
4509        int ret;
4510        u32 item_size;
4511
4512        leaf = path->nodes[0];
4513        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4514        ret = setup_leaf_for_split(trans, root, path,
4515                                   item_size + sizeof(struct btrfs_item));
4516        if (ret)
4517                return ret;
4518
4519        path->slots[0]++;
4520        setup_items_for_insert(root, path, new_key, &item_size,
4521                               item_size, item_size +
4522                               sizeof(struct btrfs_item), 1);
4523        leaf = path->nodes[0];
4524        memcpy_extent_buffer(leaf,
4525                             btrfs_item_ptr_offset(leaf, path->slots[0]),
4526                             btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4527                             item_size);
4528        return 0;
4529}
4530
4531/*
4532 * make the item pointed to by the path smaller.  new_size indicates
4533 * how small to make it, and from_end tells us if we just chop bytes
4534 * off the end of the item or if we shift the item to chop bytes off
4535 * the front.
4536 */
void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
			 u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	/* already the requested size: nothing to do */
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	old_data_start = btrfs_item_offset_nr(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff + size_diff, &token);
	}

	/* shift the data */
	if (from_end) {
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		/*
		 * inline file extents keep a fixed-size header at the front
		 * of the item; move it down before the shift so the header
		 * survives and only leading payload bytes are dropped
		 */
		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start - data_end);

		/* chopping from the front moves the key offset forward */
		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(root->fs_info, path, &disk_key, 1);
	}

	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
}
4632
4633/*
4634 * make the item pointed to by the path bigger, data_size is the added size.
4635 */
void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
		       u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	/* the caller must guarantee the leaf has data_size bytes free */
	if (btrfs_leaf_free_space(root, leaf) < data_size) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_end_nr(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(root, leaf);
		btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
		       slot, nritems);
		BUG_ON(1);
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff - data_size, &token);
	}

	/* shift the data */
	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
		      data_end - data_size, btrfs_leaf_data(leaf) +
		      data_end, old_data - data_end);

	/* grow the item in place; the added bytes are not initialized here */
	data_end = old_data;
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
}
4700
4701/*
4702 * this is a helper for btrfs_insert_empty_items, the main goal here is
4703 * to save stack depth by doing the bulk of the work in a function
4704 * that doesn't call btrfs_search_slot
4705 */
4706void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4707                            struct btrfs_key *cpu_key, u32 *data_size,
4708                            u32 total_data, u32 total_size, int nr)
4709{
4710        struct btrfs_item *item;
4711        int i;
4712        u32 nritems;
4713        unsigned int data_end;
4714        struct btrfs_disk_key disk_key;
4715        struct extent_buffer *leaf;
4716        int slot;
4717        struct btrfs_map_token token;
4718
4719        if (path->slots[0] == 0) {
4720                btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4721                fixup_low_keys(root->fs_info, path, &disk_key, 1);
4722        }
4723        btrfs_unlock_up_safe(path, 1);
4724
4725        btrfs_init_map_token(&token);
4726
4727        leaf = path->nodes[0];
4728        slot = path->slots[0];
4729
4730        nritems = btrfs_header_nritems(leaf);
4731        data_end = leaf_data_end(root, leaf);
4732
4733        if (btrfs_leaf_free_space(root, leaf) < total_size) {
4734                btrfs_print_leaf(root, leaf);
4735                btrfs_crit(root->fs_info, "not enough freespace need %u have %d",
4736                       total_size, btrfs_leaf_free_space(root, leaf));
4737                BUG();
4738        }
4739
4740        if (slot != nritems) {
4741                unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4742
4743                if (old_data < data_end) {
4744                        btrfs_print_leaf(root, leaf);
4745                        btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d",
4746                               slot, old_data, data_end);
4747                        BUG_ON(1);
4748                }
4749                /*
4750                 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4751                 */
4752                /* first correct the data pointers */
4753                for (i = slot; i < nritems; i++) {
4754                        u32 ioff;
4755
4756                        item = btrfs_item_nr( i);
4757                        ioff = btrfs_token_item_offset(leaf, item, &token);
4758                        btrfs_set_token_item_offset(leaf, item,
4759                                                    ioff - total_data, &token);
4760                }
4761                /* shift the items */
4762                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4763                              btrfs_item_nr_offset(slot),
4764                              (nritems - slot) * sizeof(struct btrfs_item));
4765
4766                /* shift the data */
4767                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4768                              data_end - total_data, btrfs_leaf_data(leaf) +
4769                              data_end, old_data - data_end);
4770                data_end = old_data;
4771        }
4772
4773        /* setup the item for the new data */
4774        for (i = 0; i < nr; i++) {
4775                btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4776                btrfs_set_item_key(leaf, &disk_key, slot + i);
4777                item = btrfs_item_nr(slot + i);
4778                btrfs_set_token_item_offset(leaf, item,
4779                                            data_end - data_size[i], &token);
4780                data_end -= data_size[i];
4781                btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4782        }
4783
4784        btrfs_set_header_nritems(leaf, nritems + nr);
4785        btrfs_mark_buffer_dirty(leaf);
4786
4787        if (btrfs_leaf_free_space(root, leaf) < 0) {
4788                btrfs_print_leaf(root, leaf);
4789                BUG();
4790        }
4791}
4792
4793/*
4794 * Given a key and some data, insert items into the tree.
4795 * This does all the path init required, making room in the tree if needed.
4796 */
4797int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4798                            struct btrfs_root *root,
4799                            struct btrfs_path *path,
4800                            struct btrfs_key *cpu_key, u32 *data_size,
4801                            int nr)
4802{
4803        int ret = 0;
4804        int slot;
4805        int i;
4806        u32 total_size = 0;
4807        u32 total_data = 0;
4808
4809        for (i = 0; i < nr; i++)
4810                total_data += data_size[i];
4811
4812        total_size = total_data + (nr * sizeof(struct btrfs_item));
4813        ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4814        if (ret == 0)
4815                return -EEXIST;
4816        if (ret < 0)
4817                return ret;
4818
4819        slot = path->slots[0];
4820        BUG_ON(slot < 0);
4821
4822        setup_items_for_insert(root, path, cpu_key, data_size,
4823                               total_data, total_size, nr);
4824        return 0;
4825}
4826
4827/*
4828 * Given a key and some data, insert an item into the tree.
4829 * This does all the path init required, making room in the tree if needed.
4830 */
4831int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4832                      *root, struct btrfs_key *cpu_key, void *data, u32
4833                      data_size)
4834{
4835        int ret = 0;
4836        struct btrfs_path *path;
4837        struct extent_buffer *leaf;
4838        unsigned long ptr;
4839
4840        path = btrfs_alloc_path();
4841        if (!path)
4842                return -ENOMEM;
4843        ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4844        if (!ret) {
4845                leaf = path->nodes[0];
4846                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4847                write_extent_buffer(leaf, data, ptr, data_size);
4848                btrfs_mark_buffer_dirty(leaf);
4849        }
4850        btrfs_free_path(path);
4851        return ret;
4852}
4853
4854/*
4855 * delete the pointer from a given node.
4856 *
4857 * the tree should have been previously balanced so the deletion does not
4858 * empty a node.
4859 */
4860static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4861                    int level, int slot)
4862{
4863        struct extent_buffer *parent = path->nodes[level];
4864        u32 nritems;
4865        int ret;
4866
4867        nritems = btrfs_header_nritems(parent);
4868        if (slot != nritems - 1) {
4869                if (level)
4870                        tree_mod_log_eb_move(root->fs_info, parent, slot,
4871                                             slot + 1, nritems - slot - 1);
4872                memmove_extent_buffer(parent,
4873                              btrfs_node_key_ptr_offset(slot),
4874                              btrfs_node_key_ptr_offset(slot + 1),
4875                              sizeof(struct btrfs_key_ptr) *
4876                              (nritems - slot - 1));
4877        } else if (level) {
4878                ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4879                                              MOD_LOG_KEY_REMOVE, GFP_NOFS);
4880                BUG_ON(ret < 0);
4881        }
4882
4883        nritems--;
4884        btrfs_set_header_nritems(parent, nritems);
4885        if (nritems == 0 && parent == root->node) {
4886                BUG_ON(btrfs_header_level(root->node) != 1);
4887                /* just turn the root into a leaf and break */
4888                btrfs_set_header_level(root->node, 0);
4889        } else if (slot == 0) {
4890                struct btrfs_disk_key disk_key;
4891
4892                btrfs_node_key(parent, &disk_key, 0);
4893                fixup_low_keys(root->fs_info, path, &disk_key, level + 1);
4894        }
4895        btrfs_mark_buffer_dirty(parent);
4896}
4897
4898/*
4899 * a helper function to delete the leaf pointed to by path->slots[1] and
4900 * path->nodes[1].
4901 *
4902 * This deletes the pointer in path->nodes[1] and frees the leaf
4903 * block extent.  zero is returned if it all worked out, < 0 otherwise.
4904 *
4905 * The path must have already been setup for deleting the leaf, including
4906 * all the proper balancing.  path->nodes[1] must be locked.
4907 */
4908static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4909                                    struct btrfs_root *root,
4910                                    struct btrfs_path *path,
4911                                    struct extent_buffer *leaf)
4912{
4913        WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4914        del_ptr(root, path, 1, path->slots[1]);
4915
4916        /*
4917         * btrfs_free_extent is expensive, we want to make sure we
4918         * aren't holding any locks when we call it
4919         */
4920        btrfs_unlock_up_safe(path, 0);
4921
4922        root_sub_used(root, leaf->len);
4923
4924        extent_buffer_get(leaf);
4925        btrfs_free_tree_block(trans, root, leaf, 0, 1);
4926        free_extent_buffer_stale(leaf);
4927}
4928/*
4929 * delete the item at the leaf level in path.  If that empties
4930 * the leaf, remove it from the tree
4931 */
4932int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4933                    struct btrfs_path *path, int slot, int nr)
4934{
4935        struct extent_buffer *leaf;
4936        struct btrfs_item *item;
4937        int last_off;
4938        int dsize = 0;
4939        int ret = 0;
4940        int wret;
4941        int i;
4942        u32 nritems;
4943        struct btrfs_map_token token;
4944
4945        btrfs_init_map_token(&token);
4946
4947        leaf = path->nodes[0];
4948        last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4949
4950        for (i = 0; i < nr; i++)
4951                dsize += btrfs_item_size_nr(leaf, slot + i);
4952
4953        nritems = btrfs_header_nritems(leaf);
4954
4955        if (slot + nr != nritems) {
4956                int data_end = leaf_data_end(root, leaf);
4957
4958                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4959                              data_end + dsize,
4960                              btrfs_leaf_data(leaf) + data_end,
4961                              last_off - data_end);
4962
4963                for (i = slot + nr; i < nritems; i++) {
4964                        u32 ioff;
4965
4966                        item = btrfs_item_nr(i);
4967                        ioff = btrfs_token_item_offset(leaf, item, &token);
4968                        btrfs_set_token_item_offset(leaf, item,
4969                                                    ioff + dsize, &token);
4970                }
4971
4972                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4973                              btrfs_item_nr_offset(slot + nr),
4974                              sizeof(struct btrfs_item) *
4975                              (nritems - slot - nr));
4976        }
4977        btrfs_set_header_nritems(leaf, nritems - nr);
4978        nritems -= nr;
4979
4980        /* delete the leaf if we've emptied it */
4981        if (nritems == 0) {
4982                if (leaf == root->node) {
4983                        btrfs_set_header_level(leaf, 0);
4984                } else {
4985                        btrfs_set_path_blocking(path);
4986                        clean_tree_block(trans, root->fs_info, leaf);
4987                        btrfs_del_leaf(trans, root, path, leaf);
4988                }
4989        } else {
4990                int used = leaf_space_used(leaf, 0, nritems);
4991                if (slot == 0) {
4992                        struct btrfs_disk_key disk_key;
4993
4994                        btrfs_item_key(leaf, &disk_key, 0);
4995                        fixup_low_keys(root->fs_info, path, &disk_key, 1);
4996                }
4997
4998                /* delete the leaf if it is mostly empty */
4999                if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
5000                        /* push_leaf_left fixes the path.
5001                         * make sure the path still points to our leaf
5002                         * for possible call to del_ptr below
5003                         */
5004                        slot = path->slots[1];
5005                        extent_buffer_get(leaf);
5006
5007                        btrfs_set_path_blocking(path);
5008                        wret = push_leaf_left(trans, root, path, 1, 1,
5009                                              1, (u32)-1);
5010                        if (wret < 0 && wret != -ENOSPC)
5011                                ret = wret;
5012
5013                        if (path->nodes[0] == leaf &&
5014                            btrfs_header_nritems(leaf)) {
5015                                wret = push_leaf_right(trans, root, path, 1,
5016                                                       1, 1, 0);
5017                                if (wret < 0 && wret != -ENOSPC)
5018                                        ret = wret;
5019                        }
5020
5021                        if (btrfs_header_nritems(leaf) == 0) {
5022                                path->slots[1] = slot;
5023                                btrfs_del_leaf(trans, root, path, leaf);
5024                                free_extent_buffer(leaf);
5025                                ret = 0;
5026                        } else {
5027                                /* if we're still in the path, make sure
5028                                 * we're dirty.  Otherwise, one of the
5029                                 * push_leaf functions must have already
5030                                 * dirtied this buffer
5031                                 */
5032                                if (path->nodes[0] == leaf)
5033                                        btrfs_mark_buffer_dirty(leaf);
5034                                free_extent_buffer(leaf);
5035                        }
5036                } else {
5037                        btrfs_mark_buffer_dirty(leaf);
5038                }
5039        }
5040        return ret;
5041}
5042
5043/*
5044 * search the tree again to find a leaf with lesser keys
5045 * returns 0 if it found something or 1 if there are no lesser leaves.
5046 * returns < 0 on io errors.
5047 *
5048 * This may release the path, and so you may lose any locks held at the
5049 * time you call it.
5050 */
5051int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5052{
5053        struct btrfs_key key;
5054        struct btrfs_disk_key found_key;
5055        int ret;
5056
5057        btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5058
5059        if (key.offset > 0) {
5060                key.offset--;
5061        } else if (key.type > 0) {
5062                key.type--;
5063                key.offset = (u64)-1;
5064        } else if (key.objectid > 0) {
5065                key.objectid--;
5066                key.type = (u8)-1;
5067                key.offset = (u64)-1;
5068        } else {
5069                return 1;
5070        }
5071
5072        btrfs_release_path(path);
5073        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5074        if (ret < 0)
5075                return ret;
5076        btrfs_item_key(path->nodes[0], &found_key, 0);
5077        ret = comp_keys(&found_key, &key);
5078        /*
5079         * We might have had an item with the previous key in the tree right
5080         * before we released our path. And after we released our path, that
5081         * item might have been pushed to the first slot (0) of the leaf we
5082         * were holding due to a tree balance. Alternatively, an item with the
5083         * previous key can exist as the only element of a leaf (big fat item).
5084         * Therefore account for these 2 cases, so that our callers (like
5085         * btrfs_previous_item) don't miss an existing item with a key matching
5086         * the previous key we computed above.
5087         */
5088        if (ret <= 0)
5089                return 0;
5090        return 1;
5091}
5092
5093/*
5094 * A helper function to walk down the tree starting at min_key, and looking
5095 * for nodes or leaves that are have a minimum transaction id.
5096 * This is used by the btree defrag code, and tree logging
5097 *
5098 * This does not cow, but it does stuff the starting key it finds back
5099 * into min_key, so you can call btrfs_search_slot with cow=1 on the
5100 * key and get a writable path.
5101 *
5102 * This does lock as it descends, and path->keep_locks should be set
5103 * to 1 by the caller.
5104 *
5105 * This honors path->lowest_level to prevent descent past a given level
5106 * of the tree.
5107 *
5108 * min_trans indicates the oldest transaction that you are interested
5109 * in walking through.  Any nodes or leaves older than min_trans are
5110 * skipped over (without reading them).
5111 *
5112 * returns zero if something useful was found, < 0 on error and 1 if there
5113 * was nothing in the tree that matched the search criteria.
5114 */
5115int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5116                         struct btrfs_path *path,
5117                         u64 min_trans)
5118{
5119        struct extent_buffer *cur;
5120        struct btrfs_key found_key;
5121        int slot;
5122        int sret;
5123        u32 nritems;
5124        int level;
5125        int ret = 1;
5126        int keep_locks = path->keep_locks;
5127
5128        path->keep_locks = 1;
5129again:
5130        cur = btrfs_read_lock_root_node(root);
5131        level = btrfs_header_level(cur);
5132        WARN_ON(path->nodes[level]);
5133        path->nodes[level] = cur;
5134        path->locks[level] = BTRFS_READ_LOCK;
5135
5136        if (btrfs_header_generation(cur) < min_trans) {
5137                ret = 1;
5138                goto out;
5139        }
5140        while (1) {
5141                nritems = btrfs_header_nritems(cur);
5142                level = btrfs_header_level(cur);
5143                sret = bin_search(cur, min_key, level, &slot);
5144
5145                /* at the lowest level, we're done, setup the path and exit */
5146                if (level == path->lowest_level) {
5147                        if (slot >= nritems)
5148                                goto find_next_key;
5149                        ret = 0;
5150                        path->slots[level] = slot;
5151                        btrfs_item_key_to_cpu(cur, &found_key, slot);
5152                        goto out;
5153                }
5154                if (sret && slot > 0)
5155                        slot--;
5156                /*
5157                 * check this node pointer against the min_trans parameters.
5158                 * If it is too old, old, skip to the next one.
5159                 */
5160                while (slot < nritems) {
5161                        u64 gen;
5162
5163                        gen = btrfs_node_ptr_generation(cur, slot);
5164                        if (gen < min_trans) {
5165                                slot++;
5166                                continue;
5167                        }
5168                        break;
5169                }
5170find_next_key:
5171                /*
5172                 * we didn't find a candidate key in this node, walk forward
5173                 * and find another one
5174                 */
5175                if (slot >= nritems) {
5176                        path->slots[level] = slot;
5177                        btrfs_set_path_blocking(path);
5178                        sret = btrfs_find_next_key(root, path, min_key, level,
5179                                                  min_trans);
5180                        if (sret == 0) {
5181                                btrfs_release_path(path);
5182                                goto again;
5183                        } else {
5184                                goto out;
5185                        }
5186                }
5187                /* save our key for returning back */
5188                btrfs_node_key_to_cpu(cur, &found_key, slot);
5189                path->slots[level] = slot;
5190                if (level == path->lowest_level) {
5191                        ret = 0;
5192                        goto out;
5193                }
5194                btrfs_set_path_blocking(path);
5195                cur = read_node_slot(root, cur, slot);
5196                BUG_ON(!cur); /* -ENOMEM */
5197
5198                btrfs_tree_read_lock(cur);
5199
5200                path->locks[level - 1] = BTRFS_READ_LOCK;
5201                path->nodes[level - 1] = cur;
5202                unlock_up(path, level, 1, 0, NULL);
5203                btrfs_clear_path_blocking(path, NULL, 0);
5204        }
5205out:
5206        path->keep_locks = keep_locks;
5207        if (ret == 0) {
5208                btrfs_unlock_up_safe(path, path->lowest_level + 1);
5209                btrfs_set_path_blocking(path);
5210                memcpy(min_key, &found_key, sizeof(found_key));
5211        }
5212        return ret;
5213}
5214
5215static void tree_move_down(struct btrfs_root *root,
5216                           struct btrfs_path *path,
5217                           int *level, int root_level)
5218{
5219        BUG_ON(*level == 0);
5220        path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5221                                        path->slots[*level]);
5222        path->slots[*level - 1] = 0;
5223        (*level)--;
5224}
5225
5226static int tree_move_next_or_upnext(struct btrfs_root *root,
5227                                    struct btrfs_path *path,
5228                                    int *level, int root_level)
5229{
5230        int ret = 0;
5231        int nritems;
5232        nritems = btrfs_header_nritems(path->nodes[*level]);
5233
5234        path->slots[*level]++;
5235
5236        while (path->slots[*level] >= nritems) {
5237                if (*level == root_level)
5238                        return -1;
5239
5240                /* move upnext */
5241                path->slots[*level] = 0;
5242                free_extent_buffer(path->nodes[*level]);
5243                path->nodes[*level] = NULL;
5244                (*level)++;
5245                path->slots[*level]++;
5246
5247                nritems = btrfs_header_nritems(path->nodes[*level]);
5248                ret = 1;
5249        }
5250        return ret;
5251}
5252
5253/*
5254 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5255 * or down.
5256 */
5257static int tree_advance(struct btrfs_root *root,
5258                        struct btrfs_path *path,
5259                        int *level, int root_level,
5260                        int allow_down,
5261                        struct btrfs_key *key)
5262{
5263        int ret;
5264
5265        if (*level == 0 || !allow_down) {
5266                ret = tree_move_next_or_upnext(root, path, level, root_level);
5267        } else {
5268                tree_move_down(root, path, level, root_level);
5269                ret = 0;
5270        }
5271        if (ret >= 0) {
5272                if (*level == 0)
5273                        btrfs_item_key_to_cpu(path->nodes[*level], key,
5274                                        path->slots[*level]);
5275                else
5276                        btrfs_node_key_to_cpu(path->nodes[*level], key,
5277                                        path->slots[*level]);
5278        }
5279        return ret;
5280}
5281
5282static int tree_compare_item(struct btrfs_root *left_root,
5283                             struct btrfs_path *left_path,
5284                             struct btrfs_path *right_path,
5285                             char *tmp_buf)
5286{
5287        int cmp;
5288        int len1, len2;
5289        unsigned long off1, off2;
5290
5291        len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5292        len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5293        if (len1 != len2)
5294                return 1;
5295
5296        off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5297        off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5298                                right_path->slots[0]);
5299
5300        read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5301
5302        cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5303        if (cmp)
5304                return 1;
5305        return 0;
5306}
5307
5308#define ADVANCE 1
5309#define ADVANCE_ONLY_NEXT -1
5310
5311/*
5312 * This function compares two trees and calls the provided callback for
5313 * every changed/new/deleted item it finds.
5314 * If shared tree blocks are encountered, whole subtrees are skipped, making
5315 * the compare pretty fast on snapshotted subvolumes.
5316 *
5317 * This currently works on commit roots only. As commit roots are read only,
5318 * we don't do any locking. The commit roots are protected with transactions.
5319 * Transactions are ended and rejoined when a commit is tried in between.
5320 *
5321 * This function checks for modifications done to the trees while comparing.
5322 * If it detects a change, it aborts immediately.
5323 */
5324int btrfs_compare_trees(struct btrfs_root *left_root,
5325                        struct btrfs_root *right_root,
5326                        btrfs_changed_cb_t changed_cb, void *ctx)
5327{
5328        int ret;
5329        int cmp;
5330        struct btrfs_path *left_path = NULL;
5331        struct btrfs_path *right_path = NULL;
5332        struct btrfs_key left_key;
5333        struct btrfs_key right_key;
5334        char *tmp_buf = NULL;
5335        int left_root_level;
5336        int right_root_level;
5337        int left_level;
5338        int right_level;
5339        int left_end_reached;
5340        int right_end_reached;
5341        int advance_left;
5342        int advance_right;
5343        u64 left_blockptr;
5344        u64 right_blockptr;
5345        u64 left_gen;
5346        u64 right_gen;
5347
5348        left_path = btrfs_alloc_path();
5349        if (!left_path) {
5350                ret = -ENOMEM;
5351                goto out;
5352        }
5353        right_path = btrfs_alloc_path();
5354        if (!right_path) {
5355                ret = -ENOMEM;
5356                goto out;
5357        }
5358
5359        tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
5360        if (!tmp_buf) {
5361                ret = -ENOMEM;
5362                goto out;
5363        }
5364
5365        left_path->search_commit_root = 1;
5366        left_path->skip_locking = 1;
5367        right_path->search_commit_root = 1;
5368        right_path->skip_locking = 1;
5369
5370        /*
5371         * Strategy: Go to the first items of both trees. Then do
5372         *
5373         * If both trees are at level 0
5374         *   Compare keys of current items
5375         *     If left < right treat left item as new, advance left tree
5376         *       and repeat
5377         *     If left > right treat right item as deleted, advance right tree
5378         *       and repeat
5379         *     If left == right do deep compare of items, treat as changed if
5380         *       needed, advance both trees and repeat
5381         * If both trees are at the same level but not at level 0
5382         *   Compare keys of current nodes/leafs
5383         *     If left < right advance left tree and repeat
5384         *     If left > right advance right tree and repeat
5385         *     If left == right compare blockptrs of the next nodes/leafs
5386         *       If they match advance both trees but stay at the same level
5387         *         and repeat
5388         *       If they don't match advance both trees while allowing to go
5389         *         deeper and repeat
5390         * If tree levels are different
5391         *   Advance the tree that needs it and repeat
5392         *
5393         * Advancing a tree means:
5394         *   If we are at level 0, try to go to the next slot. If that's not
5395         *   possible, go one level up and repeat. Stop when we found a level
5396         *   where we could go to the next slot. We may at this point be on a
5397         *   node or a leaf.
5398         *
5399         *   If we are not at level 0 and not on shared tree blocks, go one
5400         *   level deeper.
5401         *
5402         *   If we are not at level 0 and on shared tree blocks, go one slot to
5403         *   the right if possible or go up and right.
5404         */
5405
5406        down_read(&left_root->fs_info->commit_root_sem);
5407        left_level = btrfs_header_level(left_root->commit_root);
5408        left_root_level = left_level;
5409        left_path->nodes[left_level] = left_root->commit_root;
5410        extent_buffer_get(left_path->nodes[left_level]);
5411
5412        right_level = btrfs_header_level(right_root->commit_root);
5413        right_root_level = right_level;
5414        right_path->nodes[right_level] = right_root->commit_root;
5415        extent_buffer_get(right_path->nodes[right_level]);
5416        up_read(&left_root->fs_info->commit_root_sem);
5417
5418        if (left_level == 0)
5419                btrfs_item_key_to_cpu(left_path->nodes[left_level],
5420                                &left_key, left_path->slots[left_level]);
5421        else
5422                btrfs_node_key_to_cpu(left_path->nodes[left_level],
5423                                &left_key, left_path->slots[left_level]);
5424        if (right_level == 0)
5425                btrfs_item_key_to_cpu(right_path->nodes[right_level],
5426                                &right_key, right_path->slots[right_level]);
5427        else
5428                btrfs_node_key_to_cpu(right_path->nodes[right_level],
5429                                &right_key, right_path->slots[right_level]);
5430
5431        left_end_reached = right_end_reached = 0;
5432        advance_left = advance_right = 0;
5433
5434        while (1) {
5435                if (advance_left && !left_end_reached) {
5436                        ret = tree_advance(left_root, left_path, &left_level,
5437                                        left_root_level,
5438                                        advance_left != ADVANCE_ONLY_NEXT,
5439                                        &left_key);
5440                        if (ret < 0)
5441                                left_end_reached = ADVANCE;
5442                        advance_left = 0;
5443                }
5444                if (advance_right && !right_end_reached) {
5445                        ret = tree_advance(right_root, right_path, &right_level,
5446                                        right_root_level,
5447                                        advance_right != ADVANCE_ONLY_NEXT,
5448                                        &right_key);
5449                        if (ret < 0)
5450                                right_end_reached = ADVANCE;
5451                        advance_right = 0;
5452                }
5453
5454                if (left_end_reached && right_end_reached) {
5455                        ret = 0;
5456                        goto out;
5457                } else if (left_end_reached) {
5458                        if (right_level == 0) {
5459                                ret = changed_cb(left_root, right_root,
5460                                                left_path, right_path,
5461                                                &right_key,
5462                                                BTRFS_COMPARE_TREE_DELETED,
5463                                                ctx);
5464                                if (ret < 0)
5465                                        goto out;
5466                        }
5467                        advance_right = ADVANCE;
5468                        continue;
5469                } else if (right_end_reached) {
5470                        if (left_level == 0) {
5471                                ret = changed_cb(left_root, right_root,
5472                                                left_path, right_path,
5473                                                &left_key,
5474                                                BTRFS_COMPARE_TREE_NEW,
5475                                                ctx);
5476                                if (ret < 0)
5477                                        goto out;
5478                        }
5479                        advance_left = ADVANCE;
5480                        continue;
5481                }
5482
5483                if (left_level == 0 && right_level == 0) {
5484                        cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5485                        if (cmp < 0) {
5486                                ret = changed_cb(left_root, right_root,
5487                                                left_path, right_path,
5488                                                &left_key,
5489                                                BTRFS_COMPARE_TREE_NEW,
5490                                                ctx);
5491                                if (ret < 0)
5492                                        goto out;
5493                                advance_left = ADVANCE;
5494                        } else if (cmp > 0) {
5495                                ret = changed_cb(left_root, right_root,
5496                                                left_path, right_path,
5497                                                &right_key,
5498                                                BTRFS_COMPARE_TREE_DELETED,
5499                                                ctx);
5500                                if (ret < 0)
5501                                        goto out;
5502                                advance_right = ADVANCE;
5503                        } else {
5504                                enum btrfs_compare_tree_result result;
5505
5506                                WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5507                                ret = tree_compare_item(left_root, left_path,
5508                                                right_path, tmp_buf);
5509                                if (ret)
5510                                        result = BTRFS_COMPARE_TREE_CHANGED;
5511                                else
5512                                        result = BTRFS_COMPARE_TREE_SAME;
5513                                ret = changed_cb(left_root, right_root,
5514                                                 left_path, right_path,
5515                                                 &left_key, result, ctx);
5516                                if (ret < 0)
5517                                        goto out;
5518                                advance_left = ADVANCE;
5519                                advance_right = ADVANCE;
5520                        }
5521                } else if (left_level == right_level) {
5522                        cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5523                        if (cmp < 0) {
5524                                advance_left = ADVANCE;
5525                        } else if (cmp > 0) {
5526                                advance_right = ADVANCE;
5527                        } else {
5528                                left_blockptr = btrfs_node_blockptr(
5529                                                left_path->nodes[left_level],
5530                                                left_path->slots[left_level]);
5531                                right_blockptr = btrfs_node_blockptr(
5532                                                right_path->nodes[right_level],
5533                                                right_path->slots[right_level]);
5534                                left_gen = btrfs_node_ptr_generation(
5535                                                left_path->nodes[left_level],
5536                                                left_path->slots[left_level]);
5537                                right_gen = btrfs_node_ptr_generation(
5538                                                right_path->nodes[right_level],
5539                                                right_path->slots[right_level]);
5540                                if (left_blockptr == right_blockptr &&
5541                                    left_gen == right_gen) {
5542                                        /*
5543                                         * As we're on a shared block, don't
5544                                         * allow to go deeper.
5545                                         */
5546                                        advance_left = ADVANCE_ONLY_NEXT;
5547                                        advance_right = ADVANCE_ONLY_NEXT;
5548                                } else {
5549                                        advance_left = ADVANCE;
5550                                        advance_right = ADVANCE;
5551                                }
5552                        }
5553                } else if (left_level < right_level) {
5554                        advance_right = ADVANCE;
5555                } else {
5556                        advance_left = ADVANCE;
5557                }
5558        }
5559
5560out:
5561        btrfs_free_path(left_path);
5562        btrfs_free_path(right_path);
5563        kfree(tmp_buf);
5564        return ret;
5565}
5566
/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	/* without keep_locks the earlier search dropped the locks we rely on */
	WARN_ON(!path->keep_locks);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		/* start at the slot after the one the path currently points to */
		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			/* no parent to climb into: nothing higher in the tree */
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			/*
			 * If the parent is locked we can simply climb one
			 * level and keep scanning there.
			 */
			if (path->locks[level + 1]) {
				level++;
				continue;
			}

			/*
			 * Parent is unlocked, so the cached path may be
			 * stale.  Remember the last key of this node and
			 * redo the search down to this level to get a
			 * fresh, locked path before continuing.
			 */
			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			/* exact match: advance past the key we already saw */
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			/* skip subtrees older than the caller's minimum transid */
			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
5642
/*
 * search the tree again to find a leaf with greater keys
 * returns 0 if it found something or 1 if there are no greater leaves.
 * returns < 0 on io errors.
 */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	/* time_seq == 0 means "current tree", no time-travel via tree mod log */
	return btrfs_next_old_leaf(root, path, 0);
}
5652
/*
 * Walk to the leaf that follows the one the path currently points at,
 * optionally as the tree looked at @time_seq (via btrfs_search_old_slot).
 *
 * Returns 0 when the path now points at the next leaf, 1 when there is
 * no next leaf, and < 0 on error.  The path is re-searched from the key
 * of the current last item, so callers must not rely on previously held
 * locks surviving this call.
 */
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;
	int next_rw_lock = 0;	/* lock type held on 'next', 0 = none */

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	/* remember the last key of the current leaf so we can re-search */
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	next_rw_lock = 0;
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->leave_spinning = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leafs
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later(esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check,
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	/* climb until we find a node with a slot to the right of our path */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		/* drop the buffer held from a previous loop iteration */
		if (next) {
			btrfs_tree_unlock_rw(next, next_rw_lock);
			free_extent_buffer(next);
		}

		next = c;
		next_rw_lock = path->locks[level];
		ret = read_block_for_search(NULL, root, path, &next, level,
					    slot, &key, 0);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret) {
				/* try-lock failed: block and take it the slow way */
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
				btrfs_clear_path_blocking(path, next,
							  BTRFS_READ_LOCK);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
		break;
	}
	path->slots[level] = slot;
	/* walk back down the leftmost edge of 'next' to level 0 */
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock_rw(c, path->locks[level]);

		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = next_rw_lock;
		if (!level)
			break;

		ret = read_block_for_search(NULL, root, path, &next, level,
					    0, &key, 0);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
				btrfs_clear_path_blocking(path, next,
							  BTRFS_READ_LOCK);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
	}
	ret = 0;
done:
	/* drop locks above the lowest level; callers only need the leaf */
	unlock_up(path, 0, 1, 0, NULL);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}
5825
5826/*
5827 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5828 * searching until it gets past min_objectid or finds an item of 'type'
5829 *
5830 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5831 */
5832int btrfs_previous_item(struct btrfs_root *root,
5833                        struct btrfs_path *path, u64 min_objectid,
5834                        int type)
5835{
5836        struct btrfs_key found_key;
5837        struct extent_buffer *leaf;
5838        u32 nritems;
5839        int ret;
5840
5841        while (1) {
5842                if (path->slots[0] == 0) {
5843                        btrfs_set_path_blocking(path);
5844                        ret = btrfs_prev_leaf(root, path);
5845                        if (ret != 0)
5846                                return ret;
5847                } else {
5848                        path->slots[0]--;
5849                }
5850                leaf = path->nodes[0];
5851                nritems = btrfs_header_nritems(leaf);
5852                if (nritems == 0)
5853                        return 1;
5854                if (path->slots[0] == nritems)
5855                        path->slots[0]--;
5856
5857                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5858                if (found_key.objectid < min_objectid)
5859                        break;
5860                if (found_key.type == type)
5861                        return 0;
5862                if (found_key.objectid == min_objectid &&
5863                    found_key.type < type)
5864                        break;
5865        }
5866        return 1;
5867}
5868
5869/*
5870 * search in extent tree to find a previous Metadata/Data extent item with
5871 * min objecitd.
5872 *
5873 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5874 */
5875int btrfs_previous_extent_item(struct btrfs_root *root,
5876                        struct btrfs_path *path, u64 min_objectid)
5877{
5878        struct btrfs_key found_key;
5879        struct extent_buffer *leaf;
5880        u32 nritems;
5881        int ret;
5882
5883        while (1) {
5884                if (path->slots[0] == 0) {
5885                        btrfs_set_path_blocking(path);
5886                        ret = btrfs_prev_leaf(root, path);
5887                        if (ret != 0)
5888                                return ret;
5889                } else {
5890                        path->slots[0]--;
5891                }
5892                leaf = path->nodes[0];
5893                nritems = btrfs_header_nritems(leaf);
5894                if (nritems == 0)
5895                        return 1;
5896                if (path->slots[0] == nritems)
5897                        path->slots[0]--;
5898
5899                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5900                if (found_key.objectid < min_objectid)
5901                        break;
5902                if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5903                    found_key.type == BTRFS_METADATA_ITEM_KEY)
5904                        return 0;
5905                if (found_key.objectid == min_objectid &&
5906                    found_key.type < BTRFS_EXTENT_ITEM_KEY)
5907                        break;
5908        }
5909        return 1;
5910}
5911