linux/fs/btrfs/ctree.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 021110-1307, USA.
  17 */
  18
  19#include <linux/sched.h>
  20#include <linux/slab.h>
  21#include <linux/rbtree.h>
  22#include <linux/vmalloc.h>
  23#include "ctree.h"
  24#include "disk-io.h"
  25#include "transaction.h"
  26#include "print-tree.h"
  27#include "locking.h"
  28
  29static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
  30                      *root, struct btrfs_path *path, int level);
  31static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
  32                      *root, struct btrfs_key *ins_key,
  33                      struct btrfs_path *path, int data_size, int extend);
  34static int push_node_left(struct btrfs_trans_handle *trans,
  35                          struct btrfs_root *root, struct extent_buffer *dst,
  36                          struct extent_buffer *src, int empty);
  37static int balance_node_right(struct btrfs_trans_handle *trans,
  38                              struct btrfs_root *root,
  39                              struct extent_buffer *dst_buf,
  40                              struct extent_buffer *src_buf);
  41static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
  42                    int level, int slot);
  43static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
  44                                 struct extent_buffer *eb);
  45
  46struct btrfs_path *btrfs_alloc_path(void)
  47{
  48        struct btrfs_path *path;
  49        path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
  50        return path;
  51}
  52
  53/*
  54 * set all locked nodes in the path to blocking locks.  This should
  55 * be done before scheduling
  56 */
  57noinline void btrfs_set_path_blocking(struct btrfs_path *p)
  58{
  59        int i;
  60        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
  61                if (!p->nodes[i] || !p->locks[i])
  62                        continue;
  63                btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
  64                if (p->locks[i] == BTRFS_READ_LOCK)
  65                        p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
  66                else if (p->locks[i] == BTRFS_WRITE_LOCK)
  67                        p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
  68        }
  69}
  70
  71/*
  72 * reset all the locked nodes in the patch to spinning locks.
  73 *
  74 * held is used to keep lockdep happy, when lockdep is enabled
  75 * we set held to a blocking lock before we go around and
  76 * retake all the spinlocks in the path.  You can safely use NULL
  77 * for held
  78 */
  79noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
  80                                        struct extent_buffer *held, int held_rw)
  81{
  82        int i;
  83
  84        if (held) {
  85                btrfs_set_lock_blocking_rw(held, held_rw);
  86                if (held_rw == BTRFS_WRITE_LOCK)
  87                        held_rw = BTRFS_WRITE_LOCK_BLOCKING;
  88                else if (held_rw == BTRFS_READ_LOCK)
  89                        held_rw = BTRFS_READ_LOCK_BLOCKING;
  90        }
  91        btrfs_set_path_blocking(p);
  92
  93        for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
  94                if (p->nodes[i] && p->locks[i]) {
  95                        btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
  96                        if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
  97                                p->locks[i] = BTRFS_WRITE_LOCK;
  98                        else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
  99                                p->locks[i] = BTRFS_READ_LOCK;
 100                }
 101        }
 102
 103        if (held)
 104                btrfs_clear_lock_blocking_rw(held, held_rw);
 105}
 106
 107/* this also releases the path */
 108void btrfs_free_path(struct btrfs_path *p)
 109{
 110        if (!p)
 111                return;
 112        btrfs_release_path(p);
 113        kmem_cache_free(btrfs_path_cachep, p);
 114}
 115
 116/*
 117 * path release drops references on the extent buffers in the path
 118 * and it drops any locks held by this path
 119 *
 120 * It is safe to call this on paths that no locks or extent buffers held.
 121 */
 122noinline void btrfs_release_path(struct btrfs_path *p)
 123{
 124        int i;
 125
 126        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
 127                p->slots[i] = 0;
 128                if (!p->nodes[i])
 129                        continue;
 130                if (p->locks[i]) {
 131                        btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
 132                        p->locks[i] = 0;
 133                }
 134                free_extent_buffer(p->nodes[i]);
 135                p->nodes[i] = NULL;
 136        }
 137}
 138
 139/*
 140 * safely gets a reference on the root node of a tree.  A lock
 141 * is not taken, so a concurrent writer may put a different node
 142 * at the root of the tree.  See btrfs_lock_root_node for the
 143 * looping required.
 144 *
 145 * The extent buffer returned by this has a reference taken, so
 146 * it won't disappear.  It may stop being the root of the tree
 147 * at any time because there are no locks held.
 148 */
 149struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
 150{
 151        struct extent_buffer *eb;
 152
 153        while (1) {
 154                rcu_read_lock();
 155                eb = rcu_dereference(root->node);
 156
 157                /*
 158                 * RCU really hurts here, we could free up the root node because
 159                 * it was cow'ed but we may not get the new root node yet so do
 160                 * the inc_not_zero dance and if it doesn't work then
 161                 * synchronize_rcu and try again.
 162                 */
 163                if (atomic_inc_not_zero(&eb->refs)) {
 164                        rcu_read_unlock();
 165                        break;
 166                }
 167                rcu_read_unlock();
 168                synchronize_rcu();
 169        }
 170        return eb;
 171}
 172
 173/* loop around taking references on and locking the root node of the
 174 * tree until you end up with a lock on the root.  A locked buffer
 175 * is returned, with a reference held.
 176 */
 177struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 178{
 179        struct extent_buffer *eb;
 180
 181        while (1) {
 182                eb = btrfs_root_node(root);
 183                btrfs_tree_lock(eb);
 184                if (eb == root->node)
 185                        break;
 186                btrfs_tree_unlock(eb);
 187                free_extent_buffer(eb);
 188        }
 189        return eb;
 190}
 191
 192/* loop around taking references on and locking the root node of the
 193 * tree until you end up with a lock on the root.  A locked buffer
 194 * is returned, with a reference held.
 195 */
 196static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
 197{
 198        struct extent_buffer *eb;
 199
 200        while (1) {
 201                eb = btrfs_root_node(root);
 202                btrfs_tree_read_lock(eb);
 203                if (eb == root->node)
 204                        break;
 205                btrfs_tree_read_unlock(eb);
 206                free_extent_buffer(eb);
 207        }
 208        return eb;
 209}
 210
 211/* cowonly root (everything not a reference counted cow subvolume), just get
 212 * put onto a simple dirty list.  transaction.c walks this to make sure they
 213 * get properly updated on disk.
 214 */
 215static void add_root_to_dirty_list(struct btrfs_root *root)
 216{
 217        if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
 218            !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
 219                return;
 220
 221        spin_lock(&root->fs_info->trans_lock);
 222        if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
 223                /* Want the extent tree to be the last on the list */
 224                if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
 225                        list_move_tail(&root->dirty_list,
 226                                       &root->fs_info->dirty_cowonly_roots);
 227                else
 228                        list_move(&root->dirty_list,
 229                                  &root->fs_info->dirty_cowonly_roots);
 230        }
 231        spin_unlock(&root->fs_info->trans_lock);
 232}
 233
 234/*
 235 * used by snapshot creation to make a copy of a root for a tree with
 236 * a given objectid.  The buffer with the new root node is returned in
 237 * cow_ret, and this func returns zero on success or a negative error code.
 238 */
 239int btrfs_copy_root(struct btrfs_trans_handle *trans,
 240                      struct btrfs_root *root,
 241                      struct extent_buffer *buf,
 242                      struct extent_buffer **cow_ret, u64 new_root_objectid)
 243{
 244        struct extent_buffer *cow;
 245        int ret = 0;
 246        int level;
 247        struct btrfs_disk_key disk_key;
 248
 249        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 250                trans->transid != root->fs_info->running_transaction->transid);
 251        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 252                trans->transid != root->last_trans);
 253
 254        level = btrfs_header_level(buf);
 255        if (level == 0)
 256                btrfs_item_key(buf, &disk_key, 0);
 257        else
 258                btrfs_node_key(buf, &disk_key, 0);
 259
 260        cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
 261                        &disk_key, level, buf->start, 0);
 262        if (IS_ERR(cow))
 263                return PTR_ERR(cow);
 264
 265        copy_extent_buffer(cow, buf, 0, 0, cow->len);
 266        btrfs_set_header_bytenr(cow, cow->start);
 267        btrfs_set_header_generation(cow, trans->transid);
 268        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
 269        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
 270                                     BTRFS_HEADER_FLAG_RELOC);
 271        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
 272                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
 273        else
 274                btrfs_set_header_owner(cow, new_root_objectid);
 275
 276        write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
 277                            BTRFS_FSID_SIZE);
 278
 279        WARN_ON(btrfs_header_generation(buf) > trans->transid);
 280        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
 281                ret = btrfs_inc_ref(trans, root, cow, 1);
 282        else
 283                ret = btrfs_inc_ref(trans, root, cow, 0);
 284
 285        if (ret)
 286                return ret;
 287
 288        btrfs_mark_buffer_dirty(cow);
 289        *cow_ret = cow;
 290        return 0;
 291}
 292
 293enum mod_log_op {
 294        MOD_LOG_KEY_REPLACE,
 295        MOD_LOG_KEY_ADD,
 296        MOD_LOG_KEY_REMOVE,
 297        MOD_LOG_KEY_REMOVE_WHILE_FREEING,
 298        MOD_LOG_KEY_REMOVE_WHILE_MOVING,
 299        MOD_LOG_MOVE_KEYS,
 300        MOD_LOG_ROOT_REPLACE,
 301};
 302
 303struct tree_mod_move {
 304        int dst_slot;
 305        int nr_items;
 306};
 307
 308struct tree_mod_root {
 309        u64 logical;
 310        u8 level;
 311};
 312
 313struct tree_mod_elem {
 314        struct rb_node node;
 315        u64 logical;
 316        u64 seq;
 317        enum mod_log_op op;
 318
 319        /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
 320        int slot;
 321
 322        /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
 323        u64 generation;
 324
 325        /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
 326        struct btrfs_disk_key key;
 327        u64 blockptr;
 328
 329        /* this is used for op == MOD_LOG_MOVE_KEYS */
 330        struct tree_mod_move move;
 331
 332        /* this is used for op == MOD_LOG_ROOT_REPLACE */
 333        struct tree_mod_root old_root;
 334};
 335
 336static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
 337{
 338        read_lock(&fs_info->tree_mod_log_lock);
 339}
 340
 341static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
 342{
 343        read_unlock(&fs_info->tree_mod_log_lock);
 344}
 345
 346static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
 347{
 348        write_lock(&fs_info->tree_mod_log_lock);
 349}
 350
 351static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
 352{
 353        write_unlock(&fs_info->tree_mod_log_lock);
 354}
 355
 356/*
 357 * Pull a new tree mod seq number for our operation.
 358 */
 359static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
 360{
 361        return atomic64_inc_return(&fs_info->tree_mod_seq);
 362}
 363
 364/*
 365 * This adds a new blocker to the tree mod log's blocker list if the @elem
 366 * passed does not already have a sequence number set. So when a caller expects
 367 * to record tree modifications, it should ensure to set elem->seq to zero
 368 * before calling btrfs_get_tree_mod_seq.
 369 * Returns a fresh, unused tree log modification sequence number, even if no new
 370 * blocker was added.
 371 */
 372u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
 373                           struct seq_list *elem)
 374{
 375        tree_mod_log_write_lock(fs_info);
 376        spin_lock(&fs_info->tree_mod_seq_lock);
 377        if (!elem->seq) {
 378                elem->seq = btrfs_inc_tree_mod_seq(fs_info);
 379                list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
 380        }
 381        spin_unlock(&fs_info->tree_mod_seq_lock);
 382        tree_mod_log_write_unlock(fs_info);
 383
 384        return elem->seq;
 385}
 386
 387void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
 388                            struct seq_list *elem)
 389{
 390        struct rb_root *tm_root;
 391        struct rb_node *node;
 392        struct rb_node *next;
 393        struct seq_list *cur_elem;
 394        struct tree_mod_elem *tm;
 395        u64 min_seq = (u64)-1;
 396        u64 seq_putting = elem->seq;
 397
 398        if (!seq_putting)
 399                return;
 400
 401        spin_lock(&fs_info->tree_mod_seq_lock);
 402        list_del(&elem->list);
 403        elem->seq = 0;
 404
 405        list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
 406                if (cur_elem->seq < min_seq) {
 407                        if (seq_putting > cur_elem->seq) {
 408                                /*
 409                                 * blocker with lower sequence number exists, we
 410                                 * cannot remove anything from the log
 411                                 */
 412                                spin_unlock(&fs_info->tree_mod_seq_lock);
 413                                return;
 414                        }
 415                        min_seq = cur_elem->seq;
 416                }
 417        }
 418        spin_unlock(&fs_info->tree_mod_seq_lock);
 419
 420        /*
 421         * anything that's lower than the lowest existing (read: blocked)
 422         * sequence number can be removed from the tree.
 423         */
 424        tree_mod_log_write_lock(fs_info);
 425        tm_root = &fs_info->tree_mod_log;
 426        for (node = rb_first(tm_root); node; node = next) {
 427                next = rb_next(node);
 428                tm = container_of(node, struct tree_mod_elem, node);
 429                if (tm->seq > min_seq)
 430                        continue;
 431                rb_erase(node, tm_root);
 432                kfree(tm);
 433        }
 434        tree_mod_log_write_unlock(fs_info);
 435}
 436
 437/*
 438 * key order of the log:
 439 *       node/leaf start address -> sequence
 440 *
 441 * The 'start address' is the logical address of the *new* root node
 442 * for root replace operations, or the logical address of the affected
 443 * block for all other operations.
 444 *
 445 * Note: must be called with write lock (tree_mod_log_write_lock).
 446 */
 447static noinline int
 448__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
 449{
 450        struct rb_root *tm_root;
 451        struct rb_node **new;
 452        struct rb_node *parent = NULL;
 453        struct tree_mod_elem *cur;
 454
 455        BUG_ON(!tm);
 456
 457        tm->seq = btrfs_inc_tree_mod_seq(fs_info);
 458
 459        tm_root = &fs_info->tree_mod_log;
 460        new = &tm_root->rb_node;
 461        while (*new) {
 462                cur = container_of(*new, struct tree_mod_elem, node);
 463                parent = *new;
 464                if (cur->logical < tm->logical)
 465                        new = &((*new)->rb_left);
 466                else if (cur->logical > tm->logical)
 467                        new = &((*new)->rb_right);
 468                else if (cur->seq < tm->seq)
 469                        new = &((*new)->rb_left);
 470                else if (cur->seq > tm->seq)
 471                        new = &((*new)->rb_right);
 472                else
 473                        return -EEXIST;
 474        }
 475
 476        rb_link_node(&tm->node, parent, new);
 477        rb_insert_color(&tm->node, tm_root);
 478        return 0;
 479}
 480
 481/*
 482 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 483 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 484 * this until all tree mod log insertions are recorded in the rb tree and then
 485 * call tree_mod_log_write_unlock() to release.
 486 */
 487static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
 488                                    struct extent_buffer *eb) {
 489        smp_mb();
 490        if (list_empty(&(fs_info)->tree_mod_seq_list))
 491                return 1;
 492        if (eb && btrfs_header_level(eb) == 0)
 493                return 1;
 494
 495        tree_mod_log_write_lock(fs_info);
 496        if (list_empty(&(fs_info)->tree_mod_seq_list)) {
 497                tree_mod_log_write_unlock(fs_info);
 498                return 1;
 499        }
 500
 501        return 0;
 502}
 503
 504/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
 505static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
 506                                    struct extent_buffer *eb)
 507{
 508        smp_mb();
 509        if (list_empty(&(fs_info)->tree_mod_seq_list))
 510                return 0;
 511        if (eb && btrfs_header_level(eb) == 0)
 512                return 0;
 513
 514        return 1;
 515}
 516
 517static struct tree_mod_elem *
 518alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
 519                    enum mod_log_op op, gfp_t flags)
 520{
 521        struct tree_mod_elem *tm;
 522
 523        tm = kzalloc(sizeof(*tm), flags);
 524        if (!tm)
 525                return NULL;
 526
 527        tm->logical = eb->start;
 528        if (op != MOD_LOG_KEY_ADD) {
 529                btrfs_node_key(eb, &tm->key, slot);
 530                tm->blockptr = btrfs_node_blockptr(eb, slot);
 531        }
 532        tm->op = op;
 533        tm->slot = slot;
 534        tm->generation = btrfs_node_ptr_generation(eb, slot);
 535        RB_CLEAR_NODE(&tm->node);
 536
 537        return tm;
 538}
 539
 540static noinline int
 541tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
 542                        struct extent_buffer *eb, int slot,
 543                        enum mod_log_op op, gfp_t flags)
 544{
 545        struct tree_mod_elem *tm;
 546        int ret;
 547
 548        if (!tree_mod_need_log(fs_info, eb))
 549                return 0;
 550
 551        tm = alloc_tree_mod_elem(eb, slot, op, flags);
 552        if (!tm)
 553                return -ENOMEM;
 554
 555        if (tree_mod_dont_log(fs_info, eb)) {
 556                kfree(tm);
 557                return 0;
 558        }
 559
 560        ret = __tree_mod_log_insert(fs_info, tm);
 561        tree_mod_log_write_unlock(fs_info);
 562        if (ret)
 563                kfree(tm);
 564
 565        return ret;
 566}
 567
 568static noinline int
 569tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
 570                         struct extent_buffer *eb, int dst_slot, int src_slot,
 571                         int nr_items, gfp_t flags)
 572{
 573        struct tree_mod_elem *tm = NULL;
 574        struct tree_mod_elem **tm_list = NULL;
 575        int ret = 0;
 576        int i;
 577        int locked = 0;
 578
 579        if (!tree_mod_need_log(fs_info, eb))
 580                return 0;
 581
 582        tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
 583        if (!tm_list)
 584                return -ENOMEM;
 585
 586        tm = kzalloc(sizeof(*tm), flags);
 587        if (!tm) {
 588                ret = -ENOMEM;
 589                goto free_tms;
 590        }
 591
 592        tm->logical = eb->start;
 593        tm->slot = src_slot;
 594        tm->move.dst_slot = dst_slot;
 595        tm->move.nr_items = nr_items;
 596        tm->op = MOD_LOG_MOVE_KEYS;
 597
 598        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
 599                tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
 600                    MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
 601                if (!tm_list[i]) {
 602                        ret = -ENOMEM;
 603                        goto free_tms;
 604                }
 605        }
 606
 607        if (tree_mod_dont_log(fs_info, eb))
 608                goto free_tms;
 609        locked = 1;
 610
 611        /*
 612         * When we override something during the move, we log these removals.
 613         * This can only happen when we move towards the beginning of the
 614         * buffer, i.e. dst_slot < src_slot.
 615         */
 616        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
 617                ret = __tree_mod_log_insert(fs_info, tm_list[i]);
 618                if (ret)
 619                        goto free_tms;
 620        }
 621
 622        ret = __tree_mod_log_insert(fs_info, tm);
 623        if (ret)
 624                goto free_tms;
 625        tree_mod_log_write_unlock(fs_info);
 626        kfree(tm_list);
 627
 628        return 0;
 629free_tms:
 630        for (i = 0; i < nr_items; i++) {
 631                if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
 632                        rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
 633                kfree(tm_list[i]);
 634        }
 635        if (locked)
 636                tree_mod_log_write_unlock(fs_info);
 637        kfree(tm_list);
 638        kfree(tm);
 639
 640        return ret;
 641}
 642
 643static inline int
 644__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
 645                       struct tree_mod_elem **tm_list,
 646                       int nritems)
 647{
 648        int i, j;
 649        int ret;
 650
 651        for (i = nritems - 1; i >= 0; i--) {
 652                ret = __tree_mod_log_insert(fs_info, tm_list[i]);
 653                if (ret) {
 654                        for (j = nritems - 1; j > i; j--)
 655                                rb_erase(&tm_list[j]->node,
 656                                         &fs_info->tree_mod_log);
 657                        return ret;
 658                }
 659        }
 660
 661        return 0;
 662}
 663
 664static noinline int
 665tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 666                         struct extent_buffer *old_root,
 667                         struct extent_buffer *new_root, gfp_t flags,
 668                         int log_removal)
 669{
 670        struct tree_mod_elem *tm = NULL;
 671        struct tree_mod_elem **tm_list = NULL;
 672        int nritems = 0;
 673        int ret = 0;
 674        int i;
 675
 676        if (!tree_mod_need_log(fs_info, NULL))
 677                return 0;
 678
 679        if (log_removal && btrfs_header_level(old_root) > 0) {
 680                nritems = btrfs_header_nritems(old_root);
 681                tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
 682                                  flags);
 683                if (!tm_list) {
 684                        ret = -ENOMEM;
 685                        goto free_tms;
 686                }
 687                for (i = 0; i < nritems; i++) {
 688                        tm_list[i] = alloc_tree_mod_elem(old_root, i,
 689                            MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
 690                        if (!tm_list[i]) {
 691                                ret = -ENOMEM;
 692                                goto free_tms;
 693                        }
 694                }
 695        }
 696
 697        tm = kzalloc(sizeof(*tm), flags);
 698        if (!tm) {
 699                ret = -ENOMEM;
 700                goto free_tms;
 701        }
 702
 703        tm->logical = new_root->start;
 704        tm->old_root.logical = old_root->start;
 705        tm->old_root.level = btrfs_header_level(old_root);
 706        tm->generation = btrfs_header_generation(old_root);
 707        tm->op = MOD_LOG_ROOT_REPLACE;
 708
 709        if (tree_mod_dont_log(fs_info, NULL))
 710                goto free_tms;
 711
 712        if (tm_list)
 713                ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
 714        if (!ret)
 715                ret = __tree_mod_log_insert(fs_info, tm);
 716
 717        tree_mod_log_write_unlock(fs_info);
 718        if (ret)
 719                goto free_tms;
 720        kfree(tm_list);
 721
 722        return ret;
 723
 724free_tms:
 725        if (tm_list) {
 726                for (i = 0; i < nritems; i++)
 727                        kfree(tm_list[i]);
 728                kfree(tm_list);
 729        }
 730        kfree(tm);
 731
 732        return ret;
 733}
 734
 735static struct tree_mod_elem *
 736__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
 737                      int smallest)
 738{
 739        struct rb_root *tm_root;
 740        struct rb_node *node;
 741        struct tree_mod_elem *cur = NULL;
 742        struct tree_mod_elem *found = NULL;
 743
 744        tree_mod_log_read_lock(fs_info);
 745        tm_root = &fs_info->tree_mod_log;
 746        node = tm_root->rb_node;
 747        while (node) {
 748                cur = container_of(node, struct tree_mod_elem, node);
 749                if (cur->logical < start) {
 750                        node = node->rb_left;
 751                } else if (cur->logical > start) {
 752                        node = node->rb_right;
 753                } else if (cur->seq < min_seq) {
 754                        node = node->rb_left;
 755                } else if (!smallest) {
 756                        /* we want the node with the highest seq */
 757                        if (found)
 758                                BUG_ON(found->seq > cur->seq);
 759                        found = cur;
 760                        node = node->rb_left;
 761                } else if (cur->seq > min_seq) {
 762                        /* we want the node with the smallest seq */
 763                        if (found)
 764                                BUG_ON(found->seq < cur->seq);
 765                        found = cur;
 766                        node = node->rb_right;
 767                } else {
 768                        found = cur;
 769                        break;
 770                }
 771        }
 772        tree_mod_log_read_unlock(fs_info);
 773
 774        return found;
 775}
 776
 777/*
 778 * this returns the element from the log with the smallest time sequence
 779 * value that's in the log (the oldest log item). any element with a time
 780 * sequence lower than min_seq will be ignored.
 781 */
 782static struct tree_mod_elem *
 783tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
 784                           u64 min_seq)
 785{
 786        return __tree_mod_log_search(fs_info, start, min_seq, 1);
 787}
 788
 789/*
 790 * this returns the element from the log with the largest time sequence
 791 * value that's in the log (the most recent log item). any element with
 792 * a time sequence lower than min_seq will be ignored.
 793 */
 794static struct tree_mod_elem *
 795tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
 796{
 797        return __tree_mod_log_search(fs_info, start, min_seq, 0);
 798}
 799
 800static noinline int
 801tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 802                     struct extent_buffer *src, unsigned long dst_offset,
 803                     unsigned long src_offset, int nr_items)
 804{
 805        int ret = 0;
 806        struct tree_mod_elem **tm_list = NULL;
 807        struct tree_mod_elem **tm_list_add, **tm_list_rem;
 808        int i;
 809        int locked = 0;
 810
 811        if (!tree_mod_need_log(fs_info, NULL))
 812                return 0;
 813
 814        if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
 815                return 0;
 816
 817        tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
 818                          GFP_NOFS);
 819        if (!tm_list)
 820                return -ENOMEM;
 821
 822        tm_list_add = tm_list;
 823        tm_list_rem = tm_list + nr_items;
 824        for (i = 0; i < nr_items; i++) {
 825                tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
 826                    MOD_LOG_KEY_REMOVE, GFP_NOFS);
 827                if (!tm_list_rem[i]) {
 828                        ret = -ENOMEM;
 829                        goto free_tms;
 830                }
 831
 832                tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
 833                    MOD_LOG_KEY_ADD, GFP_NOFS);
 834                if (!tm_list_add[i]) {
 835                        ret = -ENOMEM;
 836                        goto free_tms;
 837                }
 838        }
 839
 840        if (tree_mod_dont_log(fs_info, NULL))
 841                goto free_tms;
 842        locked = 1;
 843
 844        for (i = 0; i < nr_items; i++) {
 845                ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
 846                if (ret)
 847                        goto free_tms;
 848                ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
 849                if (ret)
 850                        goto free_tms;
 851        }
 852
 853        tree_mod_log_write_unlock(fs_info);
 854        kfree(tm_list);
 855
 856        return 0;
 857
 858free_tms:
 859        for (i = 0; i < nr_items * 2; i++) {
 860                if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
 861                        rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
 862                kfree(tm_list[i]);
 863        }
 864        if (locked)
 865                tree_mod_log_write_unlock(fs_info);
 866        kfree(tm_list);
 867
 868        return ret;
 869}
 870
 871static inline void
 872tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 873                     int dst_offset, int src_offset, int nr_items)
 874{
 875        int ret;
 876        ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
 877                                       nr_items, GFP_NOFS);
 878        BUG_ON(ret < 0);
 879}
 880
 881static noinline void
 882tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
 883                          struct extent_buffer *eb, int slot, int atomic)
 884{
 885        int ret;
 886
 887        ret = tree_mod_log_insert_key(fs_info, eb, slot,
 888                                        MOD_LOG_KEY_REPLACE,
 889                                        atomic ? GFP_ATOMIC : GFP_NOFS);
 890        BUG_ON(ret < 0);
 891}
 892
 893static noinline int
 894tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
 895{
 896        struct tree_mod_elem **tm_list = NULL;
 897        int nritems = 0;
 898        int i;
 899        int ret = 0;
 900
 901        if (btrfs_header_level(eb) == 0)
 902                return 0;
 903
 904        if (!tree_mod_need_log(fs_info, NULL))
 905                return 0;
 906
 907        nritems = btrfs_header_nritems(eb);
 908        tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
 909        if (!tm_list)
 910                return -ENOMEM;
 911
 912        for (i = 0; i < nritems; i++) {
 913                tm_list[i] = alloc_tree_mod_elem(eb, i,
 914                    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
 915                if (!tm_list[i]) {
 916                        ret = -ENOMEM;
 917                        goto free_tms;
 918                }
 919        }
 920
 921        if (tree_mod_dont_log(fs_info, eb))
 922                goto free_tms;
 923
 924        ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
 925        tree_mod_log_write_unlock(fs_info);
 926        if (ret)
 927                goto free_tms;
 928        kfree(tm_list);
 929
 930        return 0;
 931
 932free_tms:
 933        for (i = 0; i < nritems; i++)
 934                kfree(tm_list[i]);
 935        kfree(tm_list);
 936
 937        return ret;
 938}
 939
 940static noinline void
 941tree_mod_log_set_root_pointer(struct btrfs_root *root,
 942                              struct extent_buffer *new_root_node,
 943                              int log_removal)
 944{
 945        int ret;
 946        ret = tree_mod_log_insert_root(root->fs_info, root->node,
 947                                       new_root_node, GFP_NOFS, log_removal);
 948        BUG_ON(ret < 0);
 949}
 950
 951/*
 952 * check if the tree block can be shared by multiple trees
 953 */
 954int btrfs_block_can_be_shared(struct btrfs_root *root,
 955                              struct extent_buffer *buf)
 956{
 957        /*
 958         * Tree blocks not in refernece counted trees and tree roots
 959         * are never shared. If a block was allocated after the last
 960         * snapshot and the block was not allocated by tree relocation,
 961         * we know the block is not shared.
 962         */
 963        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 964            buf != root->node && buf != root->commit_root &&
 965            (btrfs_header_generation(buf) <=
 966             btrfs_root_last_snapshot(&root->root_item) ||
 967             btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
 968                return 1;
 969#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 970        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 971            btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
 972                return 1;
 973#endif
 974        return 0;
 975}
 976
 977static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 978                                       struct btrfs_root *root,
 979                                       struct extent_buffer *buf,
 980                                       struct extent_buffer *cow,
 981                                       int *last_ref)
 982{
 983        u64 refs;
 984        u64 owner;
 985        u64 flags;
 986        u64 new_flags = 0;
 987        int ret;
 988
 989        /*
 990         * Backrefs update rules:
 991         *
 992         * Always use full backrefs for extent pointers in tree block
 993         * allocated by tree relocation.
 994         *
 995         * If a shared tree block is no longer referenced by its owner
 996         * tree (btrfs_header_owner(buf) == root->root_key.objectid),
 997         * use full backrefs for extent pointers in tree block.
 998         *
 999         * If a tree block is been relocating
1000         * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
1001         * use full backrefs for extent pointers in tree block.
1002         * The reason for this is some operations (such as drop tree)
1003         * are only allowed for blocks use full backrefs.
1004         */
1005
1006        if (btrfs_block_can_be_shared(root, buf)) {
1007                ret = btrfs_lookup_extent_info(trans, root, buf->start,
1008                                               btrfs_header_level(buf), 1,
1009                                               &refs, &flags);
1010                if (ret)
1011                        return ret;
1012                if (refs == 0) {
1013                        ret = -EROFS;
1014                        btrfs_std_error(root->fs_info, ret, NULL);
1015                        return ret;
1016                }
1017        } else {
1018                refs = 1;
1019                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1020                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1021                        flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
1022                else
1023                        flags = 0;
1024        }
1025
1026        owner = btrfs_header_owner(buf);
1027        BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
1028               !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
1029
1030        if (refs > 1) {
1031                if ((owner == root->root_key.objectid ||
1032                     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
1033                    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
1034                        ret = btrfs_inc_ref(trans, root, buf, 1);
1035                        BUG_ON(ret); /* -ENOMEM */
1036
1037                        if (root->root_key.objectid ==
1038                            BTRFS_TREE_RELOC_OBJECTID) {
1039                                ret = btrfs_dec_ref(trans, root, buf, 0);
1040                                BUG_ON(ret); /* -ENOMEM */
1041                                ret = btrfs_inc_ref(trans, root, cow, 1);
1042                                BUG_ON(ret); /* -ENOMEM */
1043                        }
1044                        new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
1045                } else {
1046
1047                        if (root->root_key.objectid ==
1048                            BTRFS_TREE_RELOC_OBJECTID)
1049                                ret = btrfs_inc_ref(trans, root, cow, 1);
1050                        else
1051                                ret = btrfs_inc_ref(trans, root, cow, 0);
1052                        BUG_ON(ret); /* -ENOMEM */
1053                }
1054                if (new_flags != 0) {
1055                        int level = btrfs_header_level(buf);
1056
1057                        ret = btrfs_set_disk_extent_flags(trans, root,
1058                                                          buf->start,
1059                                                          buf->len,
1060                                                          new_flags, level, 0);
1061                        if (ret)
1062                                return ret;
1063                }
1064        } else {
1065                if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
1066                        if (root->root_key.objectid ==
1067                            BTRFS_TREE_RELOC_OBJECTID)
1068                                ret = btrfs_inc_ref(trans, root, cow, 1);
1069                        else
1070                                ret = btrfs_inc_ref(trans, root, cow, 0);
1071                        BUG_ON(ret); /* -ENOMEM */
1072                        ret = btrfs_dec_ref(trans, root, buf, 1);
1073                        BUG_ON(ret); /* -ENOMEM */
1074                }
1075                clean_tree_block(trans, root->fs_info, buf);
1076                *last_ref = 1;
1077        }
1078        return 0;
1079}
1080
1081/*
1082 * does the dirty work in cow of a single block.  The parent block (if
1083 * supplied) is updated to point to the new cow copy.  The new buffer is marked
1084 * dirty and returned locked.  If you modify the block it needs to be marked
1085 * dirty again.
1086 *
1087 * search_start -- an allocation hint for the new block
1088 *
1089 * empty_size -- a hint that you plan on doing more cow.  This is the size in
1090 * bytes the allocator should try to find free next to the block it returns.
1091 * This is just a hint and may be ignored by the allocator.
1092 */
1093static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1094                             struct btrfs_root *root,
1095                             struct extent_buffer *buf,
1096                             struct extent_buffer *parent, int parent_slot,
1097                             struct extent_buffer **cow_ret,
1098                             u64 search_start, u64 empty_size)
1099{
1100        struct btrfs_disk_key disk_key;
1101        struct extent_buffer *cow;
1102        int level, ret;
1103        int last_ref = 0;
1104        int unlock_orig = 0;
1105        u64 parent_start;
1106
1107        if (*cow_ret == buf)
1108                unlock_orig = 1;
1109
1110        btrfs_assert_tree_locked(buf);
1111
1112        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
1113                trans->transid != root->fs_info->running_transaction->transid);
1114        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
1115                trans->transid != root->last_trans);
1116
1117        level = btrfs_header_level(buf);
1118
1119        if (level == 0)
1120                btrfs_item_key(buf, &disk_key, 0);
1121        else
1122                btrfs_node_key(buf, &disk_key, 0);
1123
1124        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1125                if (parent)
1126                        parent_start = parent->start;
1127                else
1128                        parent_start = 0;
1129        } else
1130                parent_start = 0;
1131
1132        cow = btrfs_alloc_tree_block(trans, root, parent_start,
1133                        root->root_key.objectid, &disk_key, level,
1134                        search_start, empty_size);
1135        if (IS_ERR(cow))
1136                return PTR_ERR(cow);
1137
1138        /* cow is set to blocking by btrfs_init_new_buffer */
1139
1140        copy_extent_buffer(cow, buf, 0, 0, cow->len);
1141        btrfs_set_header_bytenr(cow, cow->start);
1142        btrfs_set_header_generation(cow, trans->transid);
1143        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1144        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
1145                                     BTRFS_HEADER_FLAG_RELOC);
1146        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1147                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
1148        else
1149                btrfs_set_header_owner(cow, root->root_key.objectid);
1150
1151        write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
1152                            BTRFS_FSID_SIZE);
1153
1154        ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1155        if (ret) {
1156                btrfs_abort_transaction(trans, root, ret);
1157                return ret;
1158        }
1159
1160        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
1161                ret = btrfs_reloc_cow_block(trans, root, buf, cow);
1162                if (ret) {
1163                        btrfs_abort_transaction(trans, root, ret);
1164                        return ret;
1165                }
1166        }
1167
1168        if (buf == root->node) {
1169                WARN_ON(parent && parent != buf);
1170                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1171                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1172                        parent_start = buf->start;
1173                else
1174                        parent_start = 0;
1175
1176                extent_buffer_get(cow);
1177                tree_mod_log_set_root_pointer(root, cow, 1);
1178                rcu_assign_pointer(root->node, cow);
1179
1180                btrfs_free_tree_block(trans, root, buf, parent_start,
1181                                      last_ref);
1182                free_extent_buffer(buf);
1183                add_root_to_dirty_list(root);
1184        } else {
1185                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1186                        parent_start = parent->start;
1187                else
1188                        parent_start = 0;
1189
1190                WARN_ON(trans->transid != btrfs_header_generation(parent));
1191                tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
1192                                        MOD_LOG_KEY_REPLACE, GFP_NOFS);
1193                btrfs_set_node_blockptr(parent, parent_slot,
1194                                        cow->start);
1195                btrfs_set_node_ptr_generation(parent, parent_slot,
1196                                              trans->transid);
1197                btrfs_mark_buffer_dirty(parent);
1198                if (last_ref) {
1199                        ret = tree_mod_log_free_eb(root->fs_info, buf);
1200                        if (ret) {
1201                                btrfs_abort_transaction(trans, root, ret);
1202                                return ret;
1203                        }
1204                }
1205                btrfs_free_tree_block(trans, root, buf, parent_start,
1206                                      last_ref);
1207        }
1208        if (unlock_orig)
1209                btrfs_tree_unlock(buf);
1210        free_extent_buffer_stale(buf);
1211        btrfs_mark_buffer_dirty(cow);
1212        *cow_ret = cow;
1213        return 0;
1214}
1215
1216/*
1217 * returns the logical address of the oldest predecessor of the given root.
1218 * entries older than time_seq are ignored.
1219 */
1220static struct tree_mod_elem *
1221__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1222                           struct extent_buffer *eb_root, u64 time_seq)
1223{
1224        struct tree_mod_elem *tm;
1225        struct tree_mod_elem *found = NULL;
1226        u64 root_logical = eb_root->start;
1227        int looped = 0;
1228
1229        if (!time_seq)
1230                return NULL;
1231
1232        /*
1233         * the very last operation that's logged for a root is the
1234         * replacement operation (if it is replaced at all). this has
1235         * the logical address of the *new* root, making it the very
1236         * first operation that's logged for this root.
1237         */
1238        while (1) {
1239                tm = tree_mod_log_search_oldest(fs_info, root_logical,
1240                                                time_seq);
1241                if (!looped && !tm)
1242                        return NULL;
1243                /*
1244                 * if there are no tree operation for the oldest root, we simply
1245                 * return it. this should only happen if that (old) root is at
1246                 * level 0.
1247                 */
1248                if (!tm)
1249                        break;
1250
1251                /*
1252                 * if there's an operation that's not a root replacement, we
1253                 * found the oldest version of our root. normally, we'll find a
1254                 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1255                 */
1256                if (tm->op != MOD_LOG_ROOT_REPLACE)
1257                        break;
1258
1259                found = tm;
1260                root_logical = tm->old_root.logical;
1261                looped = 1;
1262        }
1263
1264        /* if there's no old root to return, return what we found instead */
1265        if (!found)
1266                found = tm;
1267
1268        return found;
1269}
1270
1271/*
1272 * tm is a pointer to the first operation to rewind within eb. then, all
1273 * previous operations will be rewinded (until we reach something older than
1274 * time_seq).
1275 */
1276static void
1277__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1278                      u64 time_seq, struct tree_mod_elem *first_tm)
1279{
1280        u32 n;
1281        struct rb_node *next;
1282        struct tree_mod_elem *tm = first_tm;
1283        unsigned long o_dst;
1284        unsigned long o_src;
1285        unsigned long p_size = sizeof(struct btrfs_key_ptr);
1286
1287        n = btrfs_header_nritems(eb);
1288        tree_mod_log_read_lock(fs_info);
1289        while (tm && tm->seq >= time_seq) {
1290                /*
1291                 * all the operations are recorded with the operator used for
1292                 * the modification. as we're going backwards, we do the
1293                 * opposite of each operation here.
1294                 */
1295                switch (tm->op) {
1296                case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1297                        BUG_ON(tm->slot < n);
1298                        /* Fallthrough */
1299                case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1300                case MOD_LOG_KEY_REMOVE:
1301                        btrfs_set_node_key(eb, &tm->key, tm->slot);
1302                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1303                        btrfs_set_node_ptr_generation(eb, tm->slot,
1304                                                      tm->generation);
1305                        n++;
1306                        break;
1307                case MOD_LOG_KEY_REPLACE:
1308                        BUG_ON(tm->slot >= n);
1309                        btrfs_set_node_key(eb, &tm->key, tm->slot);
1310                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1311                        btrfs_set_node_ptr_generation(eb, tm->slot,
1312                                                      tm->generation);
1313                        break;
1314                case MOD_LOG_KEY_ADD:
1315                        /* if a move operation is needed it's in the log */
1316                        n--;
1317                        break;
1318                case MOD_LOG_MOVE_KEYS:
1319                        o_dst = btrfs_node_key_ptr_offset(tm->slot);
1320                        o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1321                        memmove_extent_buffer(eb, o_dst, o_src,
1322                                              tm->move.nr_items * p_size);
1323                        break;
1324                case MOD_LOG_ROOT_REPLACE:
1325                        /*
1326                         * this operation is special. for roots, this must be
1327                         * handled explicitly before rewinding.
1328                         * for non-roots, this operation may exist if the node
1329                         * was a root: root A -> child B; then A gets empty and
1330                         * B is promoted to the new root. in the mod log, we'll
1331                         * have a root-replace operation for B, a tree block
1332                         * that is no root. we simply ignore that operation.
1333                         */
1334                        break;
1335                }
1336                next = rb_next(&tm->node);
1337                if (!next)
1338                        break;
1339                tm = container_of(next, struct tree_mod_elem, node);
1340                if (tm->logical != first_tm->logical)
1341                        break;
1342        }
1343        tree_mod_log_read_unlock(fs_info);
1344        btrfs_set_header_nritems(eb, n);
1345}
1346
1347/*
1348 * Called with eb read locked. If the buffer cannot be rewinded, the same buffer
1349 * is returned. If rewind operations happen, a fresh buffer is returned. The
1350 * returned buffer is always read-locked. If the returned buffer is not the
1351 * input buffer, the lock on the input buffer is released and the input buffer
1352 * is freed (its refcount is decremented).
1353 */
1354static struct extent_buffer *
1355tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1356                    struct extent_buffer *eb, u64 time_seq)
1357{
1358        struct extent_buffer *eb_rewin;
1359        struct tree_mod_elem *tm;
1360
1361        if (!time_seq)
1362                return eb;
1363
1364        if (btrfs_header_level(eb) == 0)
1365                return eb;
1366
1367        tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1368        if (!tm)
1369                return eb;
1370
1371        btrfs_set_path_blocking(path);
1372        btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1373
1374        if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1375                BUG_ON(tm->slot != 0);
1376                eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
1377                if (!eb_rewin) {
1378                        btrfs_tree_read_unlock_blocking(eb);
1379                        free_extent_buffer(eb);
1380                        return NULL;
1381                }
1382                btrfs_set_header_bytenr(eb_rewin, eb->start);
1383                btrfs_set_header_backref_rev(eb_rewin,
1384                                             btrfs_header_backref_rev(eb));
1385                btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1386                btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1387        } else {
1388                eb_rewin = btrfs_clone_extent_buffer(eb);
1389                if (!eb_rewin) {
1390                        btrfs_tree_read_unlock_blocking(eb);
1391                        free_extent_buffer(eb);
1392                        return NULL;
1393                }
1394        }
1395
1396        btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
1397        btrfs_tree_read_unlock_blocking(eb);
1398        free_extent_buffer(eb);
1399
1400        extent_buffer_get(eb_rewin);
1401        btrfs_tree_read_lock(eb_rewin);
1402        __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1403        WARN_ON(btrfs_header_nritems(eb_rewin) >
1404                BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
1405
1406        return eb_rewin;
1407}
1408
1409/*
1410 * get_old_root() rewinds the state of @root's root node to the given @time_seq
1411 * value. If there are no changes, the current root->root_node is returned. If
1412 * anything changed in between, there's a fresh buffer allocated on which the
1413 * rewind operations are done. In any case, the returned buffer is read locked.
1414 * Returns NULL on error (with no locks held).
1415 */
1416static inline struct extent_buffer *
1417get_old_root(struct btrfs_root *root, u64 time_seq)
1418{
1419        struct tree_mod_elem *tm;
1420        struct extent_buffer *eb = NULL;
1421        struct extent_buffer *eb_root;
1422        struct extent_buffer *old;
1423        struct tree_mod_root *old_root = NULL;
1424        u64 old_generation = 0;
1425        u64 logical;
1426
1427        eb_root = btrfs_read_lock_root_node(root);
1428        tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1429        if (!tm)
1430                return eb_root;
1431
1432        if (tm->op == MOD_LOG_ROOT_REPLACE) {
1433                old_root = &tm->old_root;
1434                old_generation = tm->generation;
1435                logical = old_root->logical;
1436        } else {
1437                logical = eb_root->start;
1438        }
1439
1440        tm = tree_mod_log_search(root->fs_info, logical, time_seq);
1441        if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1442                btrfs_tree_read_unlock(eb_root);
1443                free_extent_buffer(eb_root);
1444                old = read_tree_block(root, logical, 0);
1445                if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
1446                        if (!IS_ERR(old))
1447                                free_extent_buffer(old);
1448                        btrfs_warn(root->fs_info,
1449                                "failed to read tree block %llu from get_old_root", logical);
1450                } else {
1451                        eb = btrfs_clone_extent_buffer(old);
1452                        free_extent_buffer(old);
1453                }
1454        } else if (old_root) {
1455                btrfs_tree_read_unlock(eb_root);
1456                free_extent_buffer(eb_root);
1457                eb = alloc_dummy_extent_buffer(root->fs_info, logical);
1458        } else {
1459                btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
1460                eb = btrfs_clone_extent_buffer(eb_root);
1461                btrfs_tree_read_unlock_blocking(eb_root);
1462                free_extent_buffer(eb_root);
1463        }
1464
1465        if (!eb)
1466                return NULL;
1467        extent_buffer_get(eb);
1468        btrfs_tree_read_lock(eb);
1469        if (old_root) {
1470                btrfs_set_header_bytenr(eb, eb->start);
1471                btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1472                btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
1473                btrfs_set_header_level(eb, old_root->level);
1474                btrfs_set_header_generation(eb, old_generation);
1475        }
1476        if (tm)
1477                __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
1478        else
1479                WARN_ON(btrfs_header_level(eb) != 0);
1480        WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
1481
1482        return eb;
1483}
1484
1485int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1486{
1487        struct tree_mod_elem *tm;
1488        int level;
1489        struct extent_buffer *eb_root = btrfs_root_node(root);
1490
1491        tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1492        if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1493                level = tm->old_root.level;
1494        } else {
1495                level = btrfs_header_level(eb_root);
1496        }
1497        free_extent_buffer(eb_root);
1498
1499        return level;
1500}
1501
1502static inline int should_cow_block(struct btrfs_trans_handle *trans,
1503                                   struct btrfs_root *root,
1504                                   struct extent_buffer *buf)
1505{
1506        if (btrfs_test_is_dummy_root(root))
1507                return 0;
1508
1509        /* ensure we can see the force_cow */
1510        smp_rmb();
1511
1512        /*
1513         * We do not need to cow a block if
1514         * 1) this block is not created or changed in this transaction;
1515         * 2) this block does not belong to TREE_RELOC tree;
1516         * 3) the root is not forced COW.
1517         *
1518         * What is forced COW:
1519         *    when we create snapshot during commiting the transaction,
1520         *    after we've finished coping src root, we must COW the shared
1521         *    block to ensure the metadata consistency.
1522         */
1523        if (btrfs_header_generation(buf) == trans->transid &&
1524            !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1525            !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1526              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1527            !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1528                return 0;
1529        return 1;
1530}
1531
1532/*
1533 * cows a single block, see __btrfs_cow_block for the real work.
1534 * This version of it has extra checks so that a block isn't cow'd more than
1535 * once per transaction, as long as it hasn't been written yet
1536 */
1537noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1538                    struct btrfs_root *root, struct extent_buffer *buf,
1539                    struct extent_buffer *parent, int parent_slot,
1540                    struct extent_buffer **cow_ret)
1541{
1542        u64 search_start;
1543        int ret;
1544
1545        if (trans->transaction != root->fs_info->running_transaction)
1546                WARN(1, KERN_CRIT "trans %llu running %llu\n",
1547                       trans->transid,
1548                       root->fs_info->running_transaction->transid);
1549
1550        if (trans->transid != root->fs_info->generation)
1551                WARN(1, KERN_CRIT "trans %llu running %llu\n",
1552                       trans->transid, root->fs_info->generation);
1553
1554        if (!should_cow_block(trans, root, buf)) {
1555                *cow_ret = buf;
1556                return 0;
1557        }
1558
1559        search_start = buf->start & ~((u64)SZ_1G - 1);
1560
1561        if (parent)
1562                btrfs_set_lock_blocking(parent);
1563        btrfs_set_lock_blocking(buf);
1564
1565        ret = __btrfs_cow_block(trans, root, buf, parent,
1566                                 parent_slot, cow_ret, search_start, 0);
1567
1568        trace_btrfs_cow_block(root, buf, *cow_ret);
1569
1570        return ret;
1571}
1572
1573/*
1574 * helper function for defrag to decide if two blocks pointed to by a
1575 * node are actually close by
1576 */
1577static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1578{
1579        if (blocknr < other && other - (blocknr + blocksize) < 32768)
1580                return 1;
1581        if (blocknr > other && blocknr - (other + blocksize) < 32768)
1582                return 1;
1583        return 0;
1584}
1585
1586/*
1587 * compare two keys in a memcmp fashion
1588 */
1589static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
1590{
1591        struct btrfs_key k1;
1592
1593        btrfs_disk_key_to_cpu(&k1, disk);
1594
1595        return btrfs_comp_cpu_keys(&k1, k2);
1596}
1597
1598/*
1599 * same as comp_keys only with two btrfs_key's
1600 */
1601int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
1602{
1603        if (k1->objectid > k2->objectid)
1604                return 1;
1605        if (k1->objectid < k2->objectid)
1606                return -1;
1607        if (k1->type > k2->type)
1608                return 1;
1609        if (k1->type < k2->type)
1610                return -1;
1611        if (k1->offset > k2->offset)
1612                return 1;
1613        if (k1->offset < k2->offset)
1614                return -1;
1615        return 0;
1616}
1617
1618/*
1619 * this is used by the defrag code to go through all the
1620 * leaves pointed to by a node and reallocate them so that
1621 * disk order is close to key order
1622 */
1623int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1624                       struct btrfs_root *root, struct extent_buffer *parent,
1625                       int start_slot, u64 *last_ret,
1626                       struct btrfs_key *progress)
1627{
1628        struct extent_buffer *cur;
1629        u64 blocknr;
1630        u64 gen;
1631        u64 search_start = *last_ret;
1632        u64 last_block = 0;
1633        u64 other;
1634        u32 parent_nritems;
1635        int end_slot;
1636        int i;
1637        int err = 0;
1638        int parent_level;
1639        int uptodate;
1640        u32 blocksize;
1641        int progress_passed = 0;
1642        struct btrfs_disk_key disk_key;
1643
1644        parent_level = btrfs_header_level(parent);
1645
1646        WARN_ON(trans->transaction != root->fs_info->running_transaction);
1647        WARN_ON(trans->transid != root->fs_info->generation);
1648
1649        parent_nritems = btrfs_header_nritems(parent);
1650        blocksize = root->nodesize;
1651        end_slot = parent_nritems - 1;
1652
1653        if (parent_nritems <= 1)
1654                return 0;
1655
1656        btrfs_set_lock_blocking(parent);
1657
1658        for (i = start_slot; i <= end_slot; i++) {
1659                int close = 1;
1660
1661                btrfs_node_key(parent, &disk_key, i);
1662                if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1663                        continue;
1664
1665                progress_passed = 1;
1666                blocknr = btrfs_node_blockptr(parent, i);
1667                gen = btrfs_node_ptr_generation(parent, i);
1668                if (last_block == 0)
1669                        last_block = blocknr;
1670
1671                if (i > 0) {
1672                        other = btrfs_node_blockptr(parent, i - 1);
1673                        close = close_blocks(blocknr, other, blocksize);
1674                }
1675                if (!close && i < end_slot) {
1676                        other = btrfs_node_blockptr(parent, i + 1);
1677                        close = close_blocks(blocknr, other, blocksize);
1678                }
1679                if (close) {
1680                        last_block = blocknr;
1681                        continue;
1682                }
1683
1684                cur = btrfs_find_tree_block(root->fs_info, blocknr);
1685                if (cur)
1686                        uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1687                else
1688                        uptodate = 0;
1689                if (!cur || !uptodate) {
1690                        if (!cur) {
1691                                cur = read_tree_block(root, blocknr, gen);
1692                                if (IS_ERR(cur)) {
1693                                        return PTR_ERR(cur);
1694                                } else if (!extent_buffer_uptodate(cur)) {
1695                                        free_extent_buffer(cur);
1696                                        return -EIO;
1697                                }
1698                        } else if (!uptodate) {
1699                                err = btrfs_read_buffer(cur, gen);
1700                                if (err) {
1701                                        free_extent_buffer(cur);
1702                                        return err;
1703                                }
1704                        }
1705                }
1706                if (search_start == 0)
1707                        search_start = last_block;
1708
1709                btrfs_tree_lock(cur);
1710                btrfs_set_lock_blocking(cur);
1711                err = __btrfs_cow_block(trans, root, cur, parent, i,
1712                                        &cur, search_start,
1713                                        min(16 * blocksize,
1714                                            (end_slot - i) * blocksize));
1715                if (err) {
1716                        btrfs_tree_unlock(cur);
1717                        free_extent_buffer(cur);
1718                        break;
1719                }
1720                search_start = cur->start;
1721                last_block = cur->start;
1722                *last_ret = search_start;
1723                btrfs_tree_unlock(cur);
1724                free_extent_buffer(cur);
1725        }
1726        return err;
1727}
1728
1729/*
1730 * The leaf data grows from end-to-front in the node.
1731 * this returns the address of the start of the last item,
1732 * which is the stop of the leaf data stack
1733 */
1734static inline unsigned int leaf_data_end(struct btrfs_root *root,
1735                                         struct extent_buffer *leaf)
1736{
1737        u32 nr = btrfs_header_nritems(leaf);
1738        if (nr == 0)
1739                return BTRFS_LEAF_DATA_SIZE(root);
1740        return btrfs_item_offset_nr(leaf, nr - 1);
1741}
1742
1743
1744/*
1745 * search for key in the extent_buffer.  The items start at offset p,
1746 * and they are item_size apart.  There are 'max' items in p.
1747 *
1748 * the slot in the array is returned via slot, and it points to
1749 * the place where you would insert key if it is not found in
1750 * the array.
1751 *
1752 * slot may point to max if the key is bigger than all of the keys
1753 */
1754static noinline int generic_bin_search(struct extent_buffer *eb,
1755                                       unsigned long p,
1756                                       int item_size, struct btrfs_key *key,
1757                                       int max, int *slot)
1758{
1759        int low = 0;
1760        int high = max;
1761        int mid;
1762        int ret;
1763        struct btrfs_disk_key *tmp = NULL;
1764        struct btrfs_disk_key unaligned;
1765        unsigned long offset;
1766        char *kaddr = NULL;
1767        unsigned long map_start = 0;
1768        unsigned long map_len = 0;
1769        int err;
1770
1771        while (low < high) {
1772                mid = (low + high) / 2;
1773                offset = p + mid * item_size;
1774
1775                if (!kaddr || offset < map_start ||
1776                    (offset + sizeof(struct btrfs_disk_key)) >
1777                    map_start + map_len) {
1778
1779                        err = map_private_extent_buffer(eb, offset,
1780                                                sizeof(struct btrfs_disk_key),
1781                                                &kaddr, &map_start, &map_len);
1782
1783                        if (!err) {
1784                                tmp = (struct btrfs_disk_key *)(kaddr + offset -
1785                                                        map_start);
1786                        } else {
1787                                read_extent_buffer(eb, &unaligned,
1788                                                   offset, sizeof(unaligned));
1789                                tmp = &unaligned;
1790                        }
1791
1792                } else {
1793                        tmp = (struct btrfs_disk_key *)(kaddr + offset -
1794                                                        map_start);
1795                }
1796                ret = comp_keys(tmp, key);
1797
1798                if (ret < 0)
1799                        low = mid + 1;
1800                else if (ret > 0)
1801                        high = mid;
1802                else {
1803                        *slot = mid;
1804                        return 0;
1805                }
1806        }
1807        *slot = low;
1808        return 1;
1809}
1810
1811/*
1812 * simple bin_search frontend that does the right thing for
1813 * leaves vs nodes
1814 */
1815static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1816                      int level, int *slot)
1817{
1818        if (level == 0)
1819                return generic_bin_search(eb,
1820                                          offsetof(struct btrfs_leaf, items),
1821                                          sizeof(struct btrfs_item),
1822                                          key, btrfs_header_nritems(eb),
1823                                          slot);
1824        else
1825                return generic_bin_search(eb,
1826                                          offsetof(struct btrfs_node, ptrs),
1827                                          sizeof(struct btrfs_key_ptr),
1828                                          key, btrfs_header_nritems(eb),
1829                                          slot);
1830}
1831
1832int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1833                     int level, int *slot)
1834{
1835        return bin_search(eb, key, level, slot);
1836}
1837
1838static void root_add_used(struct btrfs_root *root, u32 size)
1839{
1840        spin_lock(&root->accounting_lock);
1841        btrfs_set_root_used(&root->root_item,
1842                            btrfs_root_used(&root->root_item) + size);
1843        spin_unlock(&root->accounting_lock);
1844}
1845
1846static void root_sub_used(struct btrfs_root *root, u32 size)
1847{
1848        spin_lock(&root->accounting_lock);
1849        btrfs_set_root_used(&root->root_item,
1850                            btrfs_root_used(&root->root_item) - size);
1851        spin_unlock(&root->accounting_lock);
1852}
1853
1854/* given a node and slot number, this reads the blocks it points to.  The
1855 * extent buffer is returned with a reference taken (but unlocked).
1856 * NULL is returned on error.
1857 */
1858static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1859                                   struct extent_buffer *parent, int slot)
1860{
1861        int level = btrfs_header_level(parent);
1862        struct extent_buffer *eb;
1863
1864        if (slot < 0)
1865                return NULL;
1866        if (slot >= btrfs_header_nritems(parent))
1867                return NULL;
1868
1869        BUG_ON(level == 0);
1870
1871        eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
1872                             btrfs_node_ptr_generation(parent, slot));
1873        if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) {
1874                if (!IS_ERR(eb))
1875                        free_extent_buffer(eb);
1876                eb = NULL;
1877        }
1878
1879        return eb;
1880}
1881
1882/*
1883 * node level balancing, used to make sure nodes are in proper order for
1884 * item deletion.  We balance from the top down, so we have to make sure
1885 * that a deletion won't leave an node completely empty later on.
1886 */
1887static noinline int balance_level(struct btrfs_trans_handle *trans,
1888                         struct btrfs_root *root,
1889                         struct btrfs_path *path, int level)
1890{
1891        struct extent_buffer *right = NULL;
1892        struct extent_buffer *mid;
1893        struct extent_buffer *left = NULL;
1894        struct extent_buffer *parent = NULL;
1895        int ret = 0;
1896        int wret;
1897        int pslot;
1898        int orig_slot = path->slots[level];
1899        u64 orig_ptr;
1900
1901        if (level == 0)
1902                return 0;
1903
1904        mid = path->nodes[level];
1905
1906        WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1907                path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1908        WARN_ON(btrfs_header_generation(mid) != trans->transid);
1909
1910        orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1911
1912        if (level < BTRFS_MAX_LEVEL - 1) {
1913                parent = path->nodes[level + 1];
1914                pslot = path->slots[level + 1];
1915        }
1916
1917        /*
1918         * deal with the case where there is only one pointer in the root
1919         * by promoting the node below to a root
1920         */
1921        if (!parent) {
1922                struct extent_buffer *child;
1923
1924                if (btrfs_header_nritems(mid) != 1)
1925                        return 0;
1926
1927                /* promote the child to a root */
1928                child = read_node_slot(root, mid, 0);
1929                if (!child) {
1930                        ret = -EROFS;
1931                        btrfs_std_error(root->fs_info, ret, NULL);
1932                        goto enospc;
1933                }
1934
1935                btrfs_tree_lock(child);
1936                btrfs_set_lock_blocking(child);
1937                ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1938                if (ret) {
1939                        btrfs_tree_unlock(child);
1940                        free_extent_buffer(child);
1941                        goto enospc;
1942                }
1943
1944                tree_mod_log_set_root_pointer(root, child, 1);
1945                rcu_assign_pointer(root->node, child);
1946
1947                add_root_to_dirty_list(root);
1948                btrfs_tree_unlock(child);
1949
1950                path->locks[level] = 0;
1951                path->nodes[level] = NULL;
1952                clean_tree_block(trans, root->fs_info, mid);
1953                btrfs_tree_unlock(mid);
1954                /* once for the path */
1955                free_extent_buffer(mid);
1956
1957                root_sub_used(root, mid->len);
1958                btrfs_free_tree_block(trans, root, mid, 0, 1);
1959                /* once for the root ptr */
1960                free_extent_buffer_stale(mid);
1961                return 0;
1962        }
1963        if (btrfs_header_nritems(mid) >
1964            BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1965                return 0;
1966
1967        left = read_node_slot(root, parent, pslot - 1);
1968        if (left) {
1969                btrfs_tree_lock(left);
1970                btrfs_set_lock_blocking(left);
1971                wret = btrfs_cow_block(trans, root, left,
1972                                       parent, pslot - 1, &left);
1973                if (wret) {
1974                        ret = wret;
1975                        goto enospc;
1976                }
1977        }
1978        right = read_node_slot(root, parent, pslot + 1);
1979        if (right) {
1980                btrfs_tree_lock(right);
1981                btrfs_set_lock_blocking(right);
1982                wret = btrfs_cow_block(trans, root, right,
1983                                       parent, pslot + 1, &right);
1984                if (wret) {
1985                        ret = wret;
1986                        goto enospc;
1987                }
1988        }
1989
1990        /* first, try to make some room in the middle buffer */
1991        if (left) {
1992                orig_slot += btrfs_header_nritems(left);
1993                wret = push_node_left(trans, root, left, mid, 1);
1994                if (wret < 0)
1995                        ret = wret;
1996        }
1997
1998        /*
1999         * then try to empty the right most buffer into the middle
2000         */
2001        if (right) {
2002                wret = push_node_left(trans, root, mid, right, 1);
2003                if (wret < 0 && wret != -ENOSPC)
2004                        ret = wret;
2005                if (btrfs_header_nritems(right) == 0) {
2006                        clean_tree_block(trans, root->fs_info, right);
2007                        btrfs_tree_unlock(right);
2008                        del_ptr(root, path, level + 1, pslot + 1);
2009                        root_sub_used(root, right->len);
2010                        btrfs_free_tree_block(trans, root, right, 0, 1);
2011                        free_extent_buffer_stale(right);
2012                        right = NULL;
2013                } else {
2014                        struct btrfs_disk_key right_key;
2015                        btrfs_node_key(right, &right_key, 0);
2016                        tree_mod_log_set_node_key(root->fs_info, parent,
2017                                                  pslot + 1, 0);
2018                        btrfs_set_node_key(parent, &right_key, pslot + 1);
2019                        btrfs_mark_buffer_dirty(parent);
2020                }
2021        }
2022        if (btrfs_header_nritems(mid) == 1) {
2023                /*
2024                 * we're not allowed to leave a node with one item in the
2025                 * tree during a delete.  A deletion from lower in the tree
2026                 * could try to delete the only pointer in this node.
2027                 * So, pull some keys from the left.
2028                 * There has to be a left pointer at this point because
2029                 * otherwise we would have pulled some pointers from the
2030                 * right
2031                 */
2032                if (!left) {
2033                        ret = -EROFS;
2034                        btrfs_std_error(root->fs_info, ret, NULL);
2035                        goto enospc;
2036                }
2037                wret = balance_node_right(trans, root, mid, left);
2038                if (wret < 0) {
2039                        ret = wret;
2040                        goto enospc;
2041                }
2042                if (wret == 1) {
2043                        wret = push_node_left(trans, root, left, mid, 1);
2044                        if (wret < 0)
2045                                ret = wret;
2046                }
2047                BUG_ON(wret == 1);
2048        }
2049        if (btrfs_header_nritems(mid) == 0) {
2050                clean_tree_block(trans, root->fs_info, mid);
2051                btrfs_tree_unlock(mid);
2052                del_ptr(root, path, level + 1, pslot);
2053                root_sub_used(root, mid->len);
2054                btrfs_free_tree_block(trans, root, mid, 0, 1);
2055                free_extent_buffer_stale(mid);
2056                mid = NULL;
2057        } else {
2058                /* update the parent key to reflect our changes */
2059                struct btrfs_disk_key mid_key;
2060                btrfs_node_key(mid, &mid_key, 0);
2061                tree_mod_log_set_node_key(root->fs_info, parent,
2062                                          pslot, 0);
2063                btrfs_set_node_key(parent, &mid_key, pslot);
2064                btrfs_mark_buffer_dirty(parent);
2065        }
2066
2067        /* update the path */
2068        if (left) {
2069                if (btrfs_header_nritems(left) > orig_slot) {
2070                        extent_buffer_get(left);
2071                        /* left was locked after cow */
2072                        path->nodes[level] = left;
2073                        path->slots[level + 1] -= 1;
2074                        path->slots[level] = orig_slot;
2075                        if (mid) {
2076                                btrfs_tree_unlock(mid);
2077                                free_extent_buffer(mid);
2078                        }
2079                } else {
2080                        orig_slot -= btrfs_header_nritems(left);
2081                        path->slots[level] = orig_slot;
2082                }
2083        }
2084        /* double check we haven't messed things up */
2085        if (orig_ptr !=
2086            btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2087                BUG();
2088enospc:
2089        if (right) {
2090                btrfs_tree_unlock(right);
2091                free_extent_buffer(right);
2092        }
2093        if (left) {
2094                if (path->nodes[level] != left)
2095                        btrfs_tree_unlock(left);
2096                free_extent_buffer(left);
2097        }
2098        return ret;
2099}
2100
2101/* Node balancing for insertion.  Here we only split or push nodes around
2102 * when they are completely full.  This is also done top down, so we
2103 * have to be pessimistic.
2104 */
2105static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2106                                          struct btrfs_root *root,
2107                                          struct btrfs_path *path, int level)
2108{
2109        struct extent_buffer *right = NULL;
2110        struct extent_buffer *mid;
2111        struct extent_buffer *left = NULL;
2112        struct extent_buffer *parent = NULL;
2113        int ret = 0;
2114        int wret;
2115        int pslot;
2116        int orig_slot = path->slots[level];
2117
2118        if (level == 0)
2119                return 1;
2120
2121        mid = path->nodes[level];
2122        WARN_ON(btrfs_header_generation(mid) != trans->transid);
2123
2124        if (level < BTRFS_MAX_LEVEL - 1) {
2125                parent = path->nodes[level + 1];
2126                pslot = path->slots[level + 1];
2127        }
2128
2129        if (!parent)
2130                return 1;
2131
2132        left = read_node_slot(root, parent, pslot - 1);
2133
2134        /* first, try to make some room in the middle buffer */
2135        if (left) {
2136                u32 left_nr;
2137
2138                btrfs_tree_lock(left);
2139                btrfs_set_lock_blocking(left);
2140
2141                left_nr = btrfs_header_nritems(left);
2142                if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2143                        wret = 1;
2144                } else {
2145                        ret = btrfs_cow_block(trans, root, left, parent,
2146                                              pslot - 1, &left);
2147                        if (ret)
2148                                wret = 1;
2149                        else {
2150                                wret = push_node_left(trans, root,
2151                                                      left, mid, 0);
2152                        }
2153                }
2154                if (wret < 0)
2155                        ret = wret;
2156                if (wret == 0) {
2157                        struct btrfs_disk_key disk_key;
2158                        orig_slot += left_nr;
2159                        btrfs_node_key(mid, &disk_key, 0);
2160                        tree_mod_log_set_node_key(root->fs_info, parent,
2161                                                  pslot, 0);
2162                        btrfs_set_node_key(parent, &disk_key, pslot);
2163                        btrfs_mark_buffer_dirty(parent);
2164                        if (btrfs_header_nritems(left) > orig_slot) {
2165                                path->nodes[level] = left;
2166                                path->slots[level + 1] -= 1;
2167                                path->slots[level] = orig_slot;
2168                                btrfs_tree_unlock(mid);
2169                                free_extent_buffer(mid);
2170                        } else {
2171                                orig_slot -=
2172                                        btrfs_header_nritems(left);
2173                                path->slots[level] = orig_slot;
2174                                btrfs_tree_unlock(left);
2175                                free_extent_buffer(left);
2176                        }
2177                        return 0;
2178                }
2179                btrfs_tree_unlock(left);
2180                free_extent_buffer(left);
2181        }
2182        right = read_node_slot(root, parent, pslot + 1);
2183
2184        /*
2185         * then try to empty the right most buffer into the middle
2186         */
2187        if (right) {
2188                u32 right_nr;
2189
2190                btrfs_tree_lock(right);
2191                btrfs_set_lock_blocking(right);
2192
2193                right_nr = btrfs_header_nritems(right);
2194                if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2195                        wret = 1;
2196                } else {
2197                        ret = btrfs_cow_block(trans, root, right,
2198                                              parent, pslot + 1,
2199                                              &right);
2200                        if (ret)
2201                                wret = 1;
2202                        else {
2203                                wret = balance_node_right(trans, root,
2204                                                          right, mid);
2205                        }
2206                }
2207                if (wret < 0)
2208                        ret = wret;
2209                if (wret == 0) {
2210                        struct btrfs_disk_key disk_key;
2211
2212                        btrfs_node_key(right, &disk_key, 0);
2213                        tree_mod_log_set_node_key(root->fs_info, parent,
2214                                                  pslot + 1, 0);
2215                        btrfs_set_node_key(parent, &disk_key, pslot + 1);
2216                        btrfs_mark_buffer_dirty(parent);
2217
2218                        if (btrfs_header_nritems(mid) <= orig_slot) {
2219                                path->nodes[level] = right;
2220                                path->slots[level + 1] += 1;
2221                                path->slots[level] = orig_slot -
2222                                        btrfs_header_nritems(mid);
2223                                btrfs_tree_unlock(mid);
2224                                free_extent_buffer(mid);
2225                        } else {
2226                                btrfs_tree_unlock(right);
2227                                free_extent_buffer(right);
2228                        }
2229                        return 0;
2230                }
2231                btrfs_tree_unlock(right);
2232                free_extent_buffer(right);
2233        }
2234        return 1;
2235}
2236
2237/*
2238 * readahead one full node of leaves, finding things that are close
2239 * to the block in 'slot', and triggering ra on them.
2240 */
2241static void reada_for_search(struct btrfs_root *root,
2242                             struct btrfs_path *path,
2243                             int level, int slot, u64 objectid)
2244{
2245        struct extent_buffer *node;
2246        struct btrfs_disk_key disk_key;
2247        u32 nritems;
2248        u64 search;
2249        u64 target;
2250        u64 nread = 0;
2251        u64 gen;
2252        struct extent_buffer *eb;
2253        u32 nr;
2254        u32 blocksize;
2255        u32 nscan = 0;
2256
2257        if (level != 1)
2258                return;
2259
2260        if (!path->nodes[level])
2261                return;
2262
2263        node = path->nodes[level];
2264
2265        search = btrfs_node_blockptr(node, slot);
2266        blocksize = root->nodesize;
2267        eb = btrfs_find_tree_block(root->fs_info, search);
2268        if (eb) {
2269                free_extent_buffer(eb);
2270                return;
2271        }
2272
2273        target = search;
2274
2275        nritems = btrfs_header_nritems(node);
2276        nr = slot;
2277
2278        while (1) {
2279                if (path->reada == READA_BACK) {
2280                        if (nr == 0)
2281                                break;
2282                        nr--;
2283                } else if (path->reada == READA_FORWARD) {
2284                        nr++;
2285                        if (nr >= nritems)
2286                                break;
2287                }
2288                if (path->reada == READA_BACK && objectid) {
2289                        btrfs_node_key(node, &disk_key, nr);
2290                        if (btrfs_disk_key_objectid(&disk_key) != objectid)
2291                                break;
2292                }
2293                search = btrfs_node_blockptr(node, nr);
2294                if ((search <= target && target - search <= 65536) ||
2295                    (search > target && search - target <= 65536)) {
2296                        gen = btrfs_node_ptr_generation(node, nr);
2297                        readahead_tree_block(root, search);
2298                        nread += blocksize;
2299                }
2300                nscan++;
2301                if ((nread > 65536 || nscan > 32))
2302                        break;
2303        }
2304}
2305
2306static noinline void reada_for_balance(struct btrfs_root *root,
2307                                       struct btrfs_path *path, int level)
2308{
2309        int slot;
2310        int nritems;
2311        struct extent_buffer *parent;
2312        struct extent_buffer *eb;
2313        u64 gen;
2314        u64 block1 = 0;
2315        u64 block2 = 0;
2316
2317        parent = path->nodes[level + 1];
2318        if (!parent)
2319                return;
2320
2321        nritems = btrfs_header_nritems(parent);
2322        slot = path->slots[level + 1];
2323
2324        if (slot > 0) {
2325                block1 = btrfs_node_blockptr(parent, slot - 1);
2326                gen = btrfs_node_ptr_generation(parent, slot - 1);
2327                eb = btrfs_find_tree_block(root->fs_info, block1);
2328                /*
2329                 * if we get -eagain from btrfs_buffer_uptodate, we
2330                 * don't want to return eagain here.  That will loop
2331                 * forever
2332                 */
2333                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2334                        block1 = 0;
2335                free_extent_buffer(eb);
2336        }
2337        if (slot + 1 < nritems) {
2338                block2 = btrfs_node_blockptr(parent, slot + 1);
2339                gen = btrfs_node_ptr_generation(parent, slot + 1);
2340                eb = btrfs_find_tree_block(root->fs_info, block2);
2341                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2342                        block2 = 0;
2343                free_extent_buffer(eb);
2344        }
2345
2346        if (block1)
2347                readahead_tree_block(root, block1);
2348        if (block2)
2349                readahead_tree_block(root, block2);
2350}
2351
2352
2353/*
2354 * when we walk down the tree, it is usually safe to unlock the higher layers
2355 * in the tree.  The exceptions are when our path goes through slot 0, because
2356 * operations on the tree might require changing key pointers higher up in the
2357 * tree.
2358 *
2359 * callers might also have set path->keep_locks, which tells this code to keep
2360 * the lock if the path points to the last slot in the block.  This is part of
2361 * walking through the tree, and selecting the next slot in the higher block.
2362 *
2363 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2364 * if lowest_unlock is 1, level 0 won't be unlocked
2365 */
2366static noinline void unlock_up(struct btrfs_path *path, int level,
2367                               int lowest_unlock, int min_write_lock_level,
2368                               int *write_lock_level)
2369{
2370        int i;
2371        int skip_level = level;
2372        int no_skips = 0;
2373        struct extent_buffer *t;
2374
2375        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2376                if (!path->nodes[i])
2377                        break;
2378                if (!path->locks[i])
2379                        break;
2380                if (!no_skips && path->slots[i] == 0) {
2381                        skip_level = i + 1;
2382                        continue;
2383                }
2384                if (!no_skips && path->keep_locks) {
2385                        u32 nritems;
2386                        t = path->nodes[i];
2387                        nritems = btrfs_header_nritems(t);
2388                        if (nritems < 1 || path->slots[i] >= nritems - 1) {
2389                                skip_level = i + 1;
2390                                continue;
2391                        }
2392                }
2393                if (skip_level < i && i >= lowest_unlock)
2394                        no_skips = 1;
2395
2396                t = path->nodes[i];
2397                if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
2398                        btrfs_tree_unlock_rw(t, path->locks[i]);
2399                        path->locks[i] = 0;
2400                        if (write_lock_level &&
2401                            i > min_write_lock_level &&
2402                            i <= *write_lock_level) {
2403                                *write_lock_level = i - 1;
2404                        }
2405                }
2406        }
2407}
2408
2409/*
2410 * This releases any locks held in the path starting at level and
2411 * going all the way up to the root.
2412 *
2413 * btrfs_search_slot will keep the lock held on higher nodes in a few
2414 * corner cases, such as COW of the block at slot zero in the node.  This
2415 * ignores those rules, and it should only be called when there are no
2416 * more updates to be done higher up in the tree.
2417 */
2418noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2419{
2420        int i;
2421
2422        if (path->keep_locks)
2423                return;
2424
2425        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2426                if (!path->nodes[i])
2427                        continue;
2428                if (!path->locks[i])
2429                        continue;
2430                btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2431                path->locks[i] = 0;
2432        }
2433}
2434
2435/*
2436 * helper function for btrfs_search_slot.  The goal is to find a block
2437 * in cache without setting the path to blocking.  If we find the block
2438 * we return zero and the path is unchanged.
2439 *
2440 * If we can't find the block, we set the path blocking and do some
2441 * reada.  -EAGAIN is returned and the search must be repeated.
2442 */
2443static int
2444read_block_for_search(struct btrfs_trans_handle *trans,
2445                       struct btrfs_root *root, struct btrfs_path *p,
2446                       struct extent_buffer **eb_ret, int level, int slot,
2447                       struct btrfs_key *key, u64 time_seq)
2448{
2449        u64 blocknr;
2450        u64 gen;
2451        struct extent_buffer *b = *eb_ret;
2452        struct extent_buffer *tmp;
2453        int ret;
2454
2455        blocknr = btrfs_node_blockptr(b, slot);
2456        gen = btrfs_node_ptr_generation(b, slot);
2457
2458        tmp = btrfs_find_tree_block(root->fs_info, blocknr);
2459        if (tmp) {
2460                /* first we do an atomic uptodate check */
2461                if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2462                        *eb_ret = tmp;
2463                        return 0;
2464                }
2465
2466                /* the pages were up to date, but we failed
2467                 * the generation number check.  Do a full
2468                 * read for the generation number that is correct.
2469                 * We must do this without dropping locks so
2470                 * we can trust our generation number
2471                 */
2472                btrfs_set_path_blocking(p);
2473
2474                /* now we're allowed to do a blocking uptodate check */
2475                ret = btrfs_read_buffer(tmp, gen);
2476                if (!ret) {
2477                        *eb_ret = tmp;
2478                        return 0;
2479                }
2480                free_extent_buffer(tmp);
2481                btrfs_release_path(p);
2482                return -EIO;
2483        }
2484
2485        /*
2486         * reduce lock contention at high levels
2487         * of the btree by dropping locks before
2488         * we read.  Don't release the lock on the current
2489         * level because we need to walk this node to figure
2490         * out which blocks to read.
2491         */
2492        btrfs_unlock_up_safe(p, level + 1);
2493        btrfs_set_path_blocking(p);
2494
2495        free_extent_buffer(tmp);
2496        if (p->reada != READA_NONE)
2497                reada_for_search(root, p, level, slot, key->objectid);
2498
2499        btrfs_release_path(p);
2500
2501        ret = -EAGAIN;
2502        tmp = read_tree_block(root, blocknr, 0);
2503        if (!IS_ERR(tmp)) {
2504                /*
2505                 * If the read above didn't mark this buffer up to date,
2506                 * it will never end up being up to date.  Set ret to EIO now
2507                 * and give up so that our caller doesn't loop forever
2508                 * on our EAGAINs.
2509                 */
2510                if (!btrfs_buffer_uptodate(tmp, 0, 0))
2511                        ret = -EIO;
2512                free_extent_buffer(tmp);
2513        }
2514        return ret;
2515}
2516
2517/*
2518 * helper function for btrfs_search_slot.  This does all of the checks
2519 * for node-level blocks and does any balancing required based on
2520 * the ins_len.
2521 *
2522 * If no extra work was required, zero is returned.  If we had to
2523 * drop the path, -EAGAIN is returned and btrfs_search_slot must
2524 * start over
2525 */
2526static int
2527setup_nodes_for_search(struct btrfs_trans_handle *trans,
2528                       struct btrfs_root *root, struct btrfs_path *p,
2529                       struct extent_buffer *b, int level, int ins_len,
2530                       int *write_lock_level)
2531{
2532        int ret;
2533        if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2534            BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
2535                int sret;
2536
2537                if (*write_lock_level < level + 1) {
2538                        *write_lock_level = level + 1;
2539                        btrfs_release_path(p);
2540                        goto again;
2541                }
2542
2543                btrfs_set_path_blocking(p);
2544                reada_for_balance(root, p, level);
2545                sret = split_node(trans, root, p, level);
2546                btrfs_clear_path_blocking(p, NULL, 0);
2547
2548                BUG_ON(sret > 0);
2549                if (sret) {
2550                        ret = sret;
2551                        goto done;
2552                }
2553                b = p->nodes[level];
2554        } else if (ins_len < 0 && btrfs_header_nritems(b) <
2555                   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
2556                int sret;
2557
2558                if (*write_lock_level < level + 1) {
2559                        *write_lock_level = level + 1;
2560                        btrfs_release_path(p);
2561                        goto again;
2562                }
2563
2564                btrfs_set_path_blocking(p);
2565                reada_for_balance(root, p, level);
2566                sret = balance_level(trans, root, p, level);
2567                btrfs_clear_path_blocking(p, NULL, 0);
2568
2569                if (sret) {
2570                        ret = sret;
2571                        goto done;
2572                }
2573                b = p->nodes[level];
2574                if (!b) {
2575                        btrfs_release_path(p);
2576                        goto again;
2577                }
2578                BUG_ON(btrfs_header_nritems(b) == 1);
2579        }
2580        return 0;
2581
2582again:
2583        ret = -EAGAIN;
2584done:
2585        return ret;
2586}
2587
2588static void key_search_validate(struct extent_buffer *b,
2589                                struct btrfs_key *key,
2590                                int level)
2591{
2592#ifdef CONFIG_BTRFS_ASSERT
2593        struct btrfs_disk_key disk_key;
2594
2595        btrfs_cpu_key_to_disk(&disk_key, key);
2596
2597        if (level == 0)
2598                ASSERT(!memcmp_extent_buffer(b, &disk_key,
2599                    offsetof(struct btrfs_leaf, items[0].key),
2600                    sizeof(disk_key)));
2601        else
2602                ASSERT(!memcmp_extent_buffer(b, &disk_key,
2603                    offsetof(struct btrfs_node, ptrs[0].key),
2604                    sizeof(disk_key)));
2605#endif
2606}
2607
2608static int key_search(struct extent_buffer *b, struct btrfs_key *key,
2609                      int level, int *prev_cmp, int *slot)
2610{
2611        if (*prev_cmp != 0) {
2612                *prev_cmp = bin_search(b, key, level, slot);
2613                return *prev_cmp;
2614        }
2615
2616        key_search_validate(b, key, level);
2617        *slot = 0;
2618
2619        return 0;
2620}
2621
2622int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2623                u64 iobjectid, u64 ioff, u8 key_type,
2624                struct btrfs_key *found_key)
2625{
2626        int ret;
2627        struct btrfs_key key;
2628        struct extent_buffer *eb;
2629
2630        ASSERT(path);
2631        ASSERT(found_key);
2632
2633        key.type = key_type;
2634        key.objectid = iobjectid;
2635        key.offset = ioff;
2636
2637        ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2638        if (ret < 0)
2639                return ret;
2640
2641        eb = path->nodes[0];
2642        if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2643                ret = btrfs_next_leaf(fs_root, path);
2644                if (ret)
2645                        return ret;
2646                eb = path->nodes[0];
2647        }
2648
2649        btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2650        if (found_key->type != key.type ||
2651                        found_key->objectid != key.objectid)
2652                return 1;
2653
2654        return 0;
2655}
2656
2657/*
2658 * look for key in the tree.  path is filled in with nodes along the way
2659 * if key is found, we return zero and you can find the item in the leaf
2660 * level of the path (level 0)
2661 *
2662 * If the key isn't found, the path points to the slot where it should
2663 * be inserted, and 1 is returned.  If there are other errors during the
2664 * search a negative error number is returned.
2665 *
2666 * if ins_len > 0, nodes and leaves will be split as we walk down the
2667 * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2668 * possible)
2669 */
2670int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2671                      *root, struct btrfs_key *key, struct btrfs_path *p, int
2672                      ins_len, int cow)
2673{
2674        struct extent_buffer *b;
2675        int slot;
2676        int ret;
2677        int err;
2678        int level;
2679        int lowest_unlock = 1;
2680        int root_lock;
2681        /* everything at write_lock_level or lower must be write locked */
2682        int write_lock_level = 0;
2683        u8 lowest_level = 0;
2684        int min_write_lock_level;
2685        int prev_cmp;
2686
2687        lowest_level = p->lowest_level;
2688        WARN_ON(lowest_level && ins_len > 0);
2689        WARN_ON(p->nodes[0] != NULL);
2690        BUG_ON(!cow && ins_len);
2691
2692        if (ins_len < 0) {
2693                lowest_unlock = 2;
2694
2695                /* when we are removing items, we might have to go up to level
2696                 * two as we update tree pointers  Make sure we keep write
2697                 * for those levels as well
2698                 */
2699                write_lock_level = 2;
2700        } else if (ins_len > 0) {
2701                /*
2702                 * for inserting items, make sure we have a write lock on
2703                 * level 1 so we can update keys
2704                 */
2705                write_lock_level = 1;
2706        }
2707
2708        if (!cow)
2709                write_lock_level = -1;
2710
2711        if (cow && (p->keep_locks || p->lowest_level))
2712                write_lock_level = BTRFS_MAX_LEVEL;
2713
2714        min_write_lock_level = write_lock_level;
2715
2716again:
2717        prev_cmp = -1;
2718        /*
2719         * we try very hard to do read locks on the root
2720         */
2721        root_lock = BTRFS_READ_LOCK;
2722        level = 0;
2723        if (p->search_commit_root) {
2724                /*
2725                 * the commit roots are read only
2726                 * so we always do read locks
2727                 */
2728                if (p->need_commit_sem)
2729                        down_read(&root->fs_info->commit_root_sem);
2730                b = root->commit_root;
2731                extent_buffer_get(b);
2732                level = btrfs_header_level(b);
2733                if (p->need_commit_sem)
2734                        up_read(&root->fs_info->commit_root_sem);
2735                if (!p->skip_locking)
2736                        btrfs_tree_read_lock(b);
2737        } else {
2738                if (p->skip_locking) {
2739                        b = btrfs_root_node(root);
2740                        level = btrfs_header_level(b);
2741                } else {
2742                        /* we don't know the level of the root node
2743                         * until we actually have it read locked
2744                         */
2745                        b = btrfs_read_lock_root_node(root);
2746                        level = btrfs_header_level(b);
2747                        if (level <= write_lock_level) {
2748                                /* whoops, must trade for write lock */
2749                                btrfs_tree_read_unlock(b);
2750                                free_extent_buffer(b);
2751                                b = btrfs_lock_root_node(root);
2752                                root_lock = BTRFS_WRITE_LOCK;
2753
2754                                /* the level might have changed, check again */
2755                                level = btrfs_header_level(b);
2756                        }
2757                }
2758        }
2759        p->nodes[level] = b;
2760        if (!p->skip_locking)
2761                p->locks[level] = root_lock;
2762
2763        while (b) {
2764                level = btrfs_header_level(b);
2765
2766                /*
2767                 * setup the path here so we can release it under lock
2768                 * contention with the cow code
2769                 */
2770                if (cow) {
2771                        /*
2772                         * if we don't really need to cow this block
2773                         * then we don't want to set the path blocking,
2774                         * so we test it here
2775                         */
2776                        if (!should_cow_block(trans, root, b))
2777                                goto cow_done;
2778
2779                        /*
2780                         * must have write locks on this node and the
2781                         * parent
2782                         */
2783                        if (level > write_lock_level ||
2784                            (level + 1 > write_lock_level &&
2785                            level + 1 < BTRFS_MAX_LEVEL &&
2786                            p->nodes[level + 1])) {
2787                                write_lock_level = level + 1;
2788                                btrfs_release_path(p);
2789                                goto again;
2790                        }
2791
2792                        btrfs_set_path_blocking(p);
2793                        err = btrfs_cow_block(trans, root, b,
2794                                              p->nodes[level + 1],
2795                                              p->slots[level + 1], &b);
2796                        if (err) {
2797                                ret = err;
2798                                goto done;
2799                        }
2800                }
2801cow_done:
2802                p->nodes[level] = b;
2803                btrfs_clear_path_blocking(p, NULL, 0);
2804
2805                /*
2806                 * we have a lock on b and as long as we aren't changing
2807                 * the tree, there is no way to for the items in b to change.
2808                 * It is safe to drop the lock on our parent before we
2809                 * go through the expensive btree search on b.
2810                 *
2811                 * If we're inserting or deleting (ins_len != 0), then we might
2812                 * be changing slot zero, which may require changing the parent.
2813                 * So, we can't drop the lock until after we know which slot
2814                 * we're operating on.
2815                 */
2816                if (!ins_len && !p->keep_locks) {
2817                        int u = level + 1;
2818
2819                        if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2820                                btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2821                                p->locks[u] = 0;
2822                        }
2823                }
2824
2825                ret = key_search(b, key, level, &prev_cmp, &slot);
2826
2827                if (level != 0) {
2828                        int dec = 0;
2829                        if (ret && slot > 0) {
2830                                dec = 1;
2831                                slot -= 1;
2832                        }
2833                        p->slots[level] = slot;
2834                        err = setup_nodes_for_search(trans, root, p, b, level,
2835                                             ins_len, &write_lock_level);
2836                        if (err == -EAGAIN)
2837                                goto again;
2838                        if (err) {
2839                                ret = err;
2840                                goto done;
2841                        }
2842                        b = p->nodes[level];
2843                        slot = p->slots[level];
2844
2845                        /*
2846                         * slot 0 is special, if we change the key
2847                         * we have to update the parent pointer
2848                         * which means we must have a write lock
2849                         * on the parent
2850                         */
2851                        if (slot == 0 && ins_len &&
2852                            write_lock_level < level + 1) {
2853                                write_lock_level = level + 1;
2854                                btrfs_release_path(p);
2855                                goto again;
2856                        }
2857
2858                        unlock_up(p, level, lowest_unlock,
2859                                  min_write_lock_level, &write_lock_level);
2860
2861                        if (level == lowest_level) {
2862                                if (dec)
2863                                        p->slots[level]++;
2864                                goto done;
2865                        }
2866
2867                        err = read_block_for_search(trans, root, p,
2868                                                    &b, level, slot, key, 0);
2869                        if (err == -EAGAIN)
2870                                goto again;
2871                        if (err) {
2872                                ret = err;
2873                                goto done;
2874                        }
2875
2876                        if (!p->skip_locking) {
2877                                level = btrfs_header_level(b);
2878                                if (level <= write_lock_level) {
2879                                        err = btrfs_try_tree_write_lock(b);
2880                                        if (!err) {
2881                                                btrfs_set_path_blocking(p);
2882                                                btrfs_tree_lock(b);
2883                                                btrfs_clear_path_blocking(p, b,
2884                                                                  BTRFS_WRITE_LOCK);
2885                                        }
2886                                        p->locks[level] = BTRFS_WRITE_LOCK;
2887                                } else {
2888                                        err = btrfs_tree_read_lock_atomic(b);
2889                                        if (!err) {
2890                                                btrfs_set_path_blocking(p);
2891                                                btrfs_tree_read_lock(b);
2892                                                btrfs_clear_path_blocking(p, b,
2893                                                                  BTRFS_READ_LOCK);
2894                                        }
2895                                        p->locks[level] = BTRFS_READ_LOCK;
2896                                }
2897                                p->nodes[level] = b;
2898                        }
2899                } else {
2900                        p->slots[level] = slot;
2901                        if (ins_len > 0 &&
2902                            btrfs_leaf_free_space(root, b) < ins_len) {
2903                                if (write_lock_level < 1) {
2904                                        write_lock_level = 1;
2905                                        btrfs_release_path(p);
2906                                        goto again;
2907                                }
2908
2909                                btrfs_set_path_blocking(p);
2910                                err = split_leaf(trans, root, key,
2911                                                 p, ins_len, ret == 0);
2912                                btrfs_clear_path_blocking(p, NULL, 0);
2913
2914                                BUG_ON(err > 0);
2915                                if (err) {
2916                                        ret = err;
2917                                        goto done;
2918                                }
2919                        }
2920                        if (!p->search_for_split)
2921                                unlock_up(p, level, lowest_unlock,
2922                                          min_write_lock_level, &write_lock_level);
2923                        goto done;
2924                }
2925        }
2926        ret = 1;
2927done:
2928        /*
2929         * we don't really know what they plan on doing with the path
2930         * from here on, so for now just mark it as blocking
2931         */
2932        if (!p->leave_spinning)
2933                btrfs_set_path_blocking(p);
2934        if (ret < 0 && !p->skip_release_on_error)
2935                btrfs_release_path(p);
2936        return ret;
2937}
2938
2939/*
2940 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2941 * current state of the tree together with the operations recorded in the tree
2942 * modification log to search for the key in a previous version of this tree, as
2943 * denoted by the time_seq parameter.
2944 *
2945 * Naturally, there is no support for insert, delete or cow operations.
2946 *
2947 * The resulting path and return value will be set up as if we called
2948 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2949 */
2950int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2951                          struct btrfs_path *p, u64 time_seq)
2952{
2953        struct extent_buffer *b;
2954        int slot;
2955        int ret;
2956        int err;
2957        int level;
2958        int lowest_unlock = 1;
2959        u8 lowest_level = 0;
2960        int prev_cmp = -1;
2961
2962        lowest_level = p->lowest_level;
2963        WARN_ON(p->nodes[0] != NULL);
2964
2965        if (p->search_commit_root) {
2966                BUG_ON(time_seq);
2967                return btrfs_search_slot(NULL, root, key, p, 0, 0);
2968        }
2969
2970again:
2971        b = get_old_root(root, time_seq);
2972        level = btrfs_header_level(b);
2973        p->locks[level] = BTRFS_READ_LOCK;
2974
2975        while (b) {
2976                level = btrfs_header_level(b);
2977                p->nodes[level] = b;
2978                btrfs_clear_path_blocking(p, NULL, 0);
2979
2980                /*
2981                 * we have a lock on b and as long as we aren't changing
2982                 * the tree, there is no way to for the items in b to change.
2983                 * It is safe to drop the lock on our parent before we
2984                 * go through the expensive btree search on b.
2985                 */
2986                btrfs_unlock_up_safe(p, level + 1);
2987
2988                /*
2989                 * Since we can unwind eb's we want to do a real search every
2990                 * time.
2991                 */
2992                prev_cmp = -1;
2993                ret = key_search(b, key, level, &prev_cmp, &slot);
2994
2995                if (level != 0) {
2996                        int dec = 0;
2997                        if (ret && slot > 0) {
2998                                dec = 1;
2999                                slot -= 1;
3000                        }
3001                        p->slots[level] = slot;
3002                        unlock_up(p, level, lowest_unlock, 0, NULL);
3003
3004                        if (level == lowest_level) {
3005                                if (dec)
3006                                        p->slots[level]++;
3007                                goto done;
3008                        }
3009
3010                        err = read_block_for_search(NULL, root, p, &b, level,
3011                                                    slot, key, time_seq);
3012                        if (err == -EAGAIN)
3013                                goto again;
3014                        if (err) {
3015                                ret = err;
3016                                goto done;
3017                        }
3018
3019                        level = btrfs_header_level(b);
3020                        err = btrfs_tree_read_lock_atomic(b);
3021                        if (!err) {
3022                                btrfs_set_path_blocking(p);
3023                                btrfs_tree_read_lock(b);
3024                                btrfs_clear_path_blocking(p, b,
3025                                                          BTRFS_READ_LOCK);
3026                        }
3027                        b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
3028                        if (!b) {
3029                                ret = -ENOMEM;
3030                                goto done;
3031                        }
3032                        p->locks[level] = BTRFS_READ_LOCK;
3033                        p->nodes[level] = b;
3034                } else {
3035                        p->slots[level] = slot;
3036                        unlock_up(p, level, lowest_unlock, 0, NULL);
3037                        goto done;
3038                }
3039        }
3040        ret = 1;
3041done:
3042        if (!p->leave_spinning)
3043                btrfs_set_path_blocking(p);
3044        if (ret < 0)
3045                btrfs_release_path(p);
3046
3047        return ret;
3048}
3049
3050/*
3051 * helper to use instead of search slot if no exact match is needed but
3052 * instead the next or previous item should be returned.
3053 * When find_higher is true, the next higher item is returned, the next lower
3054 * otherwise.
3055 * When return_any and find_higher are both true, and no higher item is found,
3056 * return the next lower instead.
3057 * When return_any is true and find_higher is false, and no lower item is found,
3058 * return the next higher instead.
3059 * It returns 0 if any item is found, 1 if none is found (tree empty), and
3060 * < 0 on error
3061 */
3062int btrfs_search_slot_for_read(struct btrfs_root *root,
3063                               struct btrfs_key *key, struct btrfs_path *p,
3064                               int find_higher, int return_any)
3065{
3066        int ret;
3067        struct extent_buffer *leaf;
3068
3069again:
3070        ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3071        if (ret <= 0)
3072                return ret;
3073        /*
3074         * a return value of 1 means the path is at the position where the
3075         * item should be inserted. Normally this is the next bigger item,
3076         * but in case the previous item is the last in a leaf, path points
3077         * to the first free slot in the previous leaf, i.e. at an invalid
3078         * item.
3079         */
3080        leaf = p->nodes[0];
3081
3082        if (find_higher) {
3083                if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3084                        ret = btrfs_next_leaf(root, p);
3085                        if (ret <= 0)
3086                                return ret;
3087                        if (!return_any)
3088                                return 1;
3089                        /*
3090                         * no higher item found, return the next
3091                         * lower instead
3092                         */
3093                        return_any = 0;
3094                        find_higher = 0;
3095                        btrfs_release_path(p);
3096                        goto again;
3097                }
3098        } else {
3099                if (p->slots[0] == 0) {
3100                        ret = btrfs_prev_leaf(root, p);
3101                        if (ret < 0)
3102                                return ret;
3103                        if (!ret) {
3104                                leaf = p->nodes[0];
3105                                if (p->slots[0] == btrfs_header_nritems(leaf))
3106                                        p->slots[0]--;
3107                                return 0;
3108                        }
3109                        if (!return_any)
3110                                return 1;
3111                        /*
3112                         * no lower item found, return the next
3113                         * higher instead
3114                         */
3115                        return_any = 0;
3116                        find_higher = 1;
3117                        btrfs_release_path(p);
3118                        goto again;
3119                } else {
3120                        --p->slots[0];
3121                }
3122        }
3123        return 0;
3124}
3125
3126/*
3127 * adjust the pointers going up the tree, starting at level
3128 * making sure the right key of each node is points to 'key'.
3129 * This is used after shifting pointers to the left, so it stops
3130 * fixing up pointers when a given leaf/node is not in slot 0 of the
3131 * higher levels
3132 *
3133 */
3134static void fixup_low_keys(struct btrfs_fs_info *fs_info,
3135                           struct btrfs_path *path,
3136                           struct btrfs_disk_key *key, int level)
3137{
3138        int i;
3139        struct extent_buffer *t;
3140
3141        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3142                int tslot = path->slots[i];
3143                if (!path->nodes[i])
3144                        break;
3145                t = path->nodes[i];
3146                tree_mod_log_set_node_key(fs_info, t, tslot, 1);
3147                btrfs_set_node_key(t, key, tslot);
3148                btrfs_mark_buffer_dirty(path->nodes[i]);
3149                if (tslot != 0)
3150                        break;
3151        }
3152}
3153
3154/*
3155 * update item key.
3156 *
3157 * This function isn't completely safe. It's the caller's responsibility
3158 * that the new key won't break the order
3159 */
3160void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3161                             struct btrfs_path *path,
3162                             struct btrfs_key *new_key)
3163{
3164        struct btrfs_disk_key disk_key;
3165        struct extent_buffer *eb;
3166        int slot;
3167
3168        eb = path->nodes[0];
3169        slot = path->slots[0];
3170        if (slot > 0) {
3171                btrfs_item_key(eb, &disk_key, slot - 1);
3172                BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3173        }
3174        if (slot < btrfs_header_nritems(eb) - 1) {
3175                btrfs_item_key(eb, &disk_key, slot + 1);
3176                BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3177        }
3178
3179        btrfs_cpu_key_to_disk(&disk_key, new_key);
3180        btrfs_set_item_key(eb, &disk_key, slot);
3181        btrfs_mark_buffer_dirty(eb);
3182        if (slot == 0)
3183                fixup_low_keys(fs_info, path, &disk_key, 1);
3184}
3185
3186/*
3187 * try to push data from one node into the next node left in the
3188 * tree.
3189 *
3190 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3191 * error, and > 0 if there was no room in the left hand block.
3192 */
3193static int push_node_left(struct btrfs_trans_handle *trans,
3194                          struct btrfs_root *root, struct extent_buffer *dst,
3195                          struct extent_buffer *src, int empty)
3196{
3197        int push_items = 0;
3198        int src_nritems;
3199        int dst_nritems;
3200        int ret = 0;
3201
3202        src_nritems = btrfs_header_nritems(src);
3203        dst_nritems = btrfs_header_nritems(dst);
3204        push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3205        WARN_ON(btrfs_header_generation(src) != trans->transid);
3206        WARN_ON(btrfs_header_generation(dst) != trans->transid);
3207
3208        if (!empty && src_nritems <= 8)
3209                return 1;
3210
3211        if (push_items <= 0)
3212                return 1;
3213
3214        if (empty) {
3215                push_items = min(src_nritems, push_items);
3216                if (push_items < src_nritems) {
3217                        /* leave at least 8 pointers in the node if
3218                         * we aren't going to empty it
3219                         */
3220                        if (src_nritems - push_items < 8) {
3221                                if (push_items <= 8)
3222                                        return 1;
3223                                push_items -= 8;
3224                        }
3225                }
3226        } else
3227                push_items = min(src_nritems - 8, push_items);
3228
3229        ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
3230                                   push_items);
3231        if (ret) {
3232                btrfs_abort_transaction(trans, root, ret);
3233                return ret;
3234        }
3235        copy_extent_buffer(dst, src,
3236                           btrfs_node_key_ptr_offset(dst_nritems),
3237                           btrfs_node_key_ptr_offset(0),
3238                           push_items * sizeof(struct btrfs_key_ptr));
3239
3240        if (push_items < src_nritems) {
3241                /*
3242                 * don't call tree_mod_log_eb_move here, key removal was already
3243                 * fully logged by tree_mod_log_eb_copy above.
3244                 */
3245                memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3246                                      btrfs_node_key_ptr_offset(push_items),
3247                                      (src_nritems - push_items) *
3248                                      sizeof(struct btrfs_key_ptr));
3249        }
3250        btrfs_set_header_nritems(src, src_nritems - push_items);
3251        btrfs_set_header_nritems(dst, dst_nritems + push_items);
3252        btrfs_mark_buffer_dirty(src);
3253        btrfs_mark_buffer_dirty(dst);
3254
3255        return ret;
3256}
3257
3258/*
3259 * try to push data from one node into the next node right in the
3260 * tree.
3261 *
3262 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3263 * error, and > 0 if there was no room in the right hand block.
3264 *
3265 * this will  only push up to 1/2 the contents of the left node over
3266 */
3267static int balance_node_right(struct btrfs_trans_handle *trans,
3268                              struct btrfs_root *root,
3269                              struct extent_buffer *dst,
3270                              struct extent_buffer *src)
3271{
3272        int push_items = 0;
3273        int max_push;
3274        int src_nritems;
3275        int dst_nritems;
3276        int ret = 0;
3277
3278        WARN_ON(btrfs_header_generation(src) != trans->transid);
3279        WARN_ON(btrfs_header_generation(dst) != trans->transid);
3280
3281        src_nritems = btrfs_header_nritems(src);
3282        dst_nritems = btrfs_header_nritems(dst);
3283        push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3284        if (push_items <= 0)
3285                return 1;
3286
3287        if (src_nritems < 4)
3288                return 1;
3289
3290        max_push = src_nritems / 2 + 1;
3291        /* don't try to empty the node */
3292        if (max_push >= src_nritems)
3293                return 1;
3294
3295        if (max_push < push_items)
3296                push_items = max_push;
3297
3298        tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
3299        memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3300                                      btrfs_node_key_ptr_offset(0),
3301                                      (dst_nritems) *
3302                                      sizeof(struct btrfs_key_ptr));
3303
3304        ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
3305                                   src_nritems - push_items, push_items);
3306        if (ret) {
3307                btrfs_abort_transaction(trans, root, ret);
3308                return ret;
3309        }
3310        copy_extent_buffer(dst, src,
3311                           btrfs_node_key_ptr_offset(0),
3312                           btrfs_node_key_ptr_offset(src_nritems - push_items),
3313                           push_items * sizeof(struct btrfs_key_ptr));
3314
3315        btrfs_set_header_nritems(src, src_nritems - push_items);
3316        btrfs_set_header_nritems(dst, dst_nritems + push_items);
3317
3318        btrfs_mark_buffer_dirty(src);
3319        btrfs_mark_buffer_dirty(dst);
3320
3321        return ret;
3322}
3323
3324/*
3325 * helper function to insert a new root level in the tree.
3326 * A new node is allocated, and a single item is inserted to
3327 * point to the existing root
3328 *
3329 * returns zero on success or < 0 on failure.
3330 */
3331static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3332                           struct btrfs_root *root,
3333                           struct btrfs_path *path, int level)
3334{
3335        u64 lower_gen;
3336        struct extent_buffer *lower;
3337        struct extent_buffer *c;
3338        struct extent_buffer *old;
3339        struct btrfs_disk_key lower_key;
3340
3341        BUG_ON(path->nodes[level]);
3342        BUG_ON(path->nodes[level-1] != root->node);
3343
3344        lower = path->nodes[level-1];
3345        if (level == 1)
3346                btrfs_item_key(lower, &lower_key, 0);
3347        else
3348                btrfs_node_key(lower, &lower_key, 0);
3349
3350        c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3351                                   &lower_key, level, root->node->start, 0);
3352        if (IS_ERR(c))
3353                return PTR_ERR(c);
3354
3355        root_add_used(root, root->nodesize);
3356
3357        memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
3358        btrfs_set_header_nritems(c, 1);
3359        btrfs_set_header_level(c, level);
3360        btrfs_set_header_bytenr(c, c->start);
3361        btrfs_set_header_generation(c, trans->transid);
3362        btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
3363        btrfs_set_header_owner(c, root->root_key.objectid);
3364
3365        write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
3366                            BTRFS_FSID_SIZE);
3367
3368        write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
3369                            btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE);
3370
3371        btrfs_set_node_key(c, &lower_key, 0);
3372        btrfs_set_node_blockptr(c, 0, lower->start);
3373        lower_gen = btrfs_header_generation(lower);
3374        WARN_ON(lower_gen != trans->transid);
3375
3376        btrfs_set_node_ptr_generation(c, 0, lower_gen);
3377
3378        btrfs_mark_buffer_dirty(c);
3379
3380        old = root->node;
3381        tree_mod_log_set_root_pointer(root, c, 0);
3382        rcu_assign_pointer(root->node, c);
3383
3384        /* the super has an extra ref to root->node */
3385        free_extent_buffer(old);
3386
3387        add_root_to_dirty_list(root);
3388        extent_buffer_get(c);
3389        path->nodes[level] = c;
3390        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3391        path->slots[level] = 0;
3392        return 0;
3393}
3394
3395/*
3396 * worker function to insert a single pointer in a node.
3397 * the node should have enough room for the pointer already
3398 *
3399 * slot and level indicate where you want the key to go, and
3400 * blocknr is the block the key points to.
3401 */
3402static void insert_ptr(struct btrfs_trans_handle *trans,
3403                       struct btrfs_root *root, struct btrfs_path *path,
3404                       struct btrfs_disk_key *key, u64 bytenr,
3405                       int slot, int level)
3406{
3407        struct extent_buffer *lower;
3408        int nritems;
3409        int ret;
3410
3411        BUG_ON(!path->nodes[level]);
3412        btrfs_assert_tree_locked(path->nodes[level]);
3413        lower = path->nodes[level];
3414        nritems = btrfs_header_nritems(lower);
3415        BUG_ON(slot > nritems);
3416        BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
3417        if (slot != nritems) {
3418                if (level)
3419                        tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
3420                                             slot, nritems - slot);
3421                memmove_extent_buffer(lower,
3422                              btrfs_node_key_ptr_offset(slot + 1),
3423                              btrfs_node_key_ptr_offset(slot),
3424                              (nritems - slot) * sizeof(struct btrfs_key_ptr));
3425        }
3426        if (level) {
3427                ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
3428                                              MOD_LOG_KEY_ADD, GFP_NOFS);
3429                BUG_ON(ret < 0);
3430        }
3431        btrfs_set_node_key(lower, key, slot);
3432        btrfs_set_node_blockptr(lower, slot, bytenr);
3433        WARN_ON(trans->transid == 0);
3434        btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3435        btrfs_set_header_nritems(lower, nritems + 1);
3436        btrfs_mark_buffer_dirty(lower);
3437}
3438
3439/*
3440 * split the node at the specified level in path in two.
3441 * The path is corrected to point to the appropriate node after the split
3442 *
3443 * Before splitting this tries to make some room in the node by pushing
3444 * left and right, if either one works, it returns right away.
3445 *
3446 * returns 0 on success and < 0 on failure
3447 */
3448static noinline int split_node(struct btrfs_trans_handle *trans,
3449                               struct btrfs_root *root,
3450                               struct btrfs_path *path, int level)
3451{
3452        struct extent_buffer *c;
3453        struct extent_buffer *split;
3454        struct btrfs_disk_key disk_key;
3455        int mid;
3456        int ret;
3457        u32 c_nritems;
3458
3459        c = path->nodes[level];
3460        WARN_ON(btrfs_header_generation(c) != trans->transid);
3461        if (c == root->node) {
3462                /*
3463                 * trying to split the root, lets make a new one
3464                 *
3465                 * tree mod log: We don't log_removal old root in
3466                 * insert_new_root, because that root buffer will be kept as a
3467                 * normal node. We are going to log removal of half of the
3468                 * elements below with tree_mod_log_eb_copy. We're holding a
3469                 * tree lock on the buffer, which is why we cannot race with
3470                 * other tree_mod_log users.
3471                 */
3472                ret = insert_new_root(trans, root, path, level + 1);
3473                if (ret)
3474                        return ret;
3475        } else {
3476                ret = push_nodes_for_insert(trans, root, path, level);
3477                c = path->nodes[level];
3478                if (!ret && btrfs_header_nritems(c) <
3479                    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
3480                        return 0;
3481                if (ret < 0)
3482                        return ret;
3483        }
3484
3485        c_nritems = btrfs_header_nritems(c);
3486        mid = (c_nritems + 1) / 2;
3487        btrfs_node_key(c, &disk_key, mid);
3488
3489        split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3490                        &disk_key, level, c->start, 0);
3491        if (IS_ERR(split))
3492                return PTR_ERR(split);
3493
3494        root_add_used(root, root->nodesize);
3495
3496        memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
3497        btrfs_set_header_level(split, btrfs_header_level(c));
3498        btrfs_set_header_bytenr(split, split->start);
3499        btrfs_set_header_generation(split, trans->transid);
3500        btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
3501        btrfs_set_header_owner(split, root->root_key.objectid);
3502        write_extent_buffer(split, root->fs_info->fsid,
3503                            btrfs_header_fsid(), BTRFS_FSID_SIZE);
3504        write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
3505                            btrfs_header_chunk_tree_uuid(split),
3506                            BTRFS_UUID_SIZE);
3507
3508        ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
3509                                   mid, c_nritems - mid);
3510        if (ret) {
3511                btrfs_abort_transaction(trans, root, ret);
3512                return ret;
3513        }
3514        copy_extent_buffer(split, c,
3515                           btrfs_node_key_ptr_offset(0),
3516                           btrfs_node_key_ptr_offset(mid),
3517                           (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3518        btrfs_set_header_nritems(split, c_nritems - mid);
3519        btrfs_set_header_nritems(c, mid);
3520        ret = 0;
3521
3522        btrfs_mark_buffer_dirty(c);
3523        btrfs_mark_buffer_dirty(split);
3524
3525        insert_ptr(trans, root, path, &disk_key, split->start,
3526                   path->slots[level + 1] + 1, level + 1);
3527
3528        if (path->slots[level] >= mid) {
3529                path->slots[level] -= mid;
3530                btrfs_tree_unlock(c);
3531                free_extent_buffer(c);
3532                path->nodes[level] = split;
3533                path->slots[level + 1] += 1;
3534        } else {
3535                btrfs_tree_unlock(split);
3536                free_extent_buffer(split);
3537        }
3538        return ret;
3539}
3540
3541/*
3542 * how many bytes are required to store the items in a leaf.  start
3543 * and nr indicate which items in the leaf to check.  This totals up the
3544 * space used both by the item structs and the item data
3545 */
3546static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3547{
3548        struct btrfs_item *start_item;
3549        struct btrfs_item *end_item;
3550        struct btrfs_map_token token;
3551        int data_len;
3552        int nritems = btrfs_header_nritems(l);
3553        int end = min(nritems, start + nr) - 1;
3554
3555        if (!nr)
3556                return 0;
3557        btrfs_init_map_token(&token);
3558        start_item = btrfs_item_nr(start);
3559        end_item = btrfs_item_nr(end);
3560        data_len = btrfs_token_item_offset(l, start_item, &token) +
3561                btrfs_token_item_size(l, start_item, &token);
3562        data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3563        data_len += sizeof(struct btrfs_item) * nr;
3564        WARN_ON(data_len < 0);
3565        return data_len;
3566}
3567
3568/*
3569 * The space between the end of the leaf items and
3570 * the start of the leaf data.  IOW, how much room
3571 * the leaf has left for both items and data
3572 */
3573noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3574                                   struct extent_buffer *leaf)
3575{
3576        int nritems = btrfs_header_nritems(leaf);
3577        int ret;
3578        ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3579        if (ret < 0) {
3580                btrfs_crit(root->fs_info,
3581                        "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3582                       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3583                       leaf_space_used(leaf, 0, nritems), nritems);
3584        }
3585        return ret;
3586}
3587
3588/*
3589 * min slot controls the lowest index we're willing to push to the
3590 * right.  We'll push up to and including min_slot, but no lower
3591 */
3592static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3593                                      struct btrfs_root *root,
3594                                      struct btrfs_path *path,
3595                                      int data_size, int empty,
3596                                      struct extent_buffer *right,
3597                                      int free_space, u32 left_nritems,
3598                                      u32 min_slot)
3599{
3600        struct extent_buffer *left = path->nodes[0];
3601        struct extent_buffer *upper = path->nodes[1];
3602        struct btrfs_map_token token;
3603        struct btrfs_disk_key disk_key;
3604        int slot;
3605        u32 i;
3606        int push_space = 0;
3607        int push_items = 0;
3608        struct btrfs_item *item;
3609        u32 nr;
3610        u32 right_nritems;
3611        u32 data_end;
3612        u32 this_item_size;
3613
3614        btrfs_init_map_token(&token);
3615
3616        if (empty)
3617                nr = 0;
3618        else
3619                nr = max_t(u32, 1, min_slot);
3620
3621        if (path->slots[0] >= left_nritems)
3622                push_space += data_size;
3623
3624        slot = path->slots[1];
3625        i = left_nritems - 1;
3626        while (i >= nr) {
3627                item = btrfs_item_nr(i);
3628
3629                if (!empty && push_items > 0) {
3630                        if (path->slots[0] > i)
3631                                break;
3632                        if (path->slots[0] == i) {
3633                                int space = btrfs_leaf_free_space(root, left);
3634                                if (space + push_space * 2 > free_space)
3635                                        break;
3636                        }
3637                }
3638
3639                if (path->slots[0] == i)
3640                        push_space += data_size;
3641
3642                this_item_size = btrfs_item_size(left, item);
3643                if (this_item_size + sizeof(*item) + push_space > free_space)
3644                        break;
3645
3646                push_items++;
3647                push_space += this_item_size + sizeof(*item);
3648                if (i == 0)
3649                        break;
3650                i--;
3651        }
3652
3653        if (push_items == 0)
3654                goto out_unlock;
3655
3656        WARN_ON(!empty && push_items == left_nritems);
3657
3658        /* push left to right */
3659        right_nritems = btrfs_header_nritems(right);
3660
3661        push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3662        push_space -= leaf_data_end(root, left);
3663
3664        /* make room in the right data area */
3665        data_end = leaf_data_end(root, right);
3666        memmove_extent_buffer(right,
3667                              btrfs_leaf_data(right) + data_end - push_space,
3668                              btrfs_leaf_data(right) + data_end,
3669                              BTRFS_LEAF_DATA_SIZE(root) - data_end);
3670
3671        /* copy from the left data area */
3672        copy_extent_buffer(right, left, btrfs_leaf_data(right) +
3673                     BTRFS_LEAF_DATA_SIZE(root) - push_space,
3674                     btrfs_leaf_data(left) + leaf_data_end(root, left),
3675                     push_space);
3676
3677        memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3678                              btrfs_item_nr_offset(0),
3679                              right_nritems * sizeof(struct btrfs_item));
3680
3681        /* copy the items from left to right */
3682        copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3683                   btrfs_item_nr_offset(left_nritems - push_items),
3684                   push_items * sizeof(struct btrfs_item));
3685
3686        /* update the item pointers */
3687        right_nritems += push_items;
3688        btrfs_set_header_nritems(right, right_nritems);
3689        push_space = BTRFS_LEAF_DATA_SIZE(root);
3690        for (i = 0; i < right_nritems; i++) {
3691                item = btrfs_item_nr(i);
3692                push_space -= btrfs_token_item_size(right, item, &token);
3693                btrfs_set_token_item_offset(right, item, push_space, &token);
3694        }
3695
3696        left_nritems -= push_items;
3697        btrfs_set_header_nritems(left, left_nritems);
3698
3699        if (left_nritems)
3700                btrfs_mark_buffer_dirty(left);
3701        else
3702                clean_tree_block(trans, root->fs_info, left);
3703
3704        btrfs_mark_buffer_dirty(right);
3705
3706        btrfs_item_key(right, &disk_key, 0);
3707        btrfs_set_node_key(upper, &disk_key, slot + 1);
3708        btrfs_mark_buffer_dirty(upper);
3709
3710        /* then fixup the leaf pointer in the path */
3711        if (path->slots[0] >= left_nritems) {
3712                path->slots[0] -= left_nritems;
3713                if (btrfs_header_nritems(path->nodes[0]) == 0)
3714                        clean_tree_block(trans, root->fs_info, path->nodes[0]);
3715                btrfs_tree_unlock(path->nodes[0]);
3716                free_extent_buffer(path->nodes[0]);
3717                path->nodes[0] = right;
3718                path->slots[1] += 1;
3719        } else {
3720                btrfs_tree_unlock(right);
3721                free_extent_buffer(right);
3722        }
3723        return 0;
3724
3725out_unlock:
3726        btrfs_tree_unlock(right);
3727        free_extent_buffer(right);
3728        return 1;
3729}
3730
3731/*
3732 * push some data in the path leaf to the right, trying to free up at
3733 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3734 *
3735 * returns 1 if the push failed because the other node didn't have enough
3736 * room, 0 if everything worked out and < 0 if there were major errors.
3737 *
3738 * this will push starting from min_slot to the end of the leaf.  It won't
3739 * push any slot lower than min_slot
3740 */
3741static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3742                           *root, struct btrfs_path *path,
3743                           int min_data_size, int data_size,
3744                           int empty, u32 min_slot)
3745{
3746        struct extent_buffer *left = path->nodes[0];
3747        struct extent_buffer *right;
3748        struct extent_buffer *upper;
3749        int slot;
3750        int free_space;
3751        u32 left_nritems;
3752        int ret;
3753
3754        if (!path->nodes[1])
3755                return 1;
3756
3757        slot = path->slots[1];
3758        upper = path->nodes[1];
3759        if (slot >= btrfs_header_nritems(upper) - 1)
3760                return 1;
3761
3762        btrfs_assert_tree_locked(path->nodes[1]);
3763
3764        right = read_node_slot(root, upper, slot + 1);
3765        if (right == NULL)
3766                return 1;
3767
3768        btrfs_tree_lock(right);
3769        btrfs_set_lock_blocking(right);
3770
3771        free_space = btrfs_leaf_free_space(root, right);
3772        if (free_space < data_size)
3773                goto out_unlock;
3774
3775        /* cow and double check */
3776        ret = btrfs_cow_block(trans, root, right, upper,
3777                              slot + 1, &right);
3778        if (ret)
3779                goto out_unlock;
3780
3781        free_space = btrfs_leaf_free_space(root, right);
3782        if (free_space < data_size)
3783                goto out_unlock;
3784
3785        left_nritems = btrfs_header_nritems(left);
3786        if (left_nritems == 0)
3787                goto out_unlock;
3788
3789        if (path->slots[0] == left_nritems && !empty) {
3790                /* Key greater than all keys in the leaf, right neighbor has
3791                 * enough room for it and we're not emptying our leaf to delete
3792                 * it, therefore use right neighbor to insert the new item and
3793                 * no need to touch/dirty our left leaft. */
3794                btrfs_tree_unlock(left);
3795                free_extent_buffer(left);
3796                path->nodes[0] = right;
3797                path->slots[0] = 0;
3798                path->slots[1]++;
3799                return 0;
3800        }
3801
3802        return __push_leaf_right(trans, root, path, min_data_size, empty,
3803                                right, free_space, left_nritems, min_slot);
3804out_unlock:
3805        btrfs_tree_unlock(right);
3806        free_extent_buffer(right);
3807        return 1;
3808}
3809
3810/*
3811 * push some data in the path leaf to the left, trying to free up at
3812 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3813 *
3814 * max_slot can put a limit on how far into the leaf we'll push items.  The
3815 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3816 * items
3817 */
3818static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3819                                     struct btrfs_root *root,
3820                                     struct btrfs_path *path, int data_size,
3821                                     int empty, struct extent_buffer *left,
3822                                     int free_space, u32 right_nritems,
3823                                     u32 max_slot)
3824{
3825        struct btrfs_disk_key disk_key;
3826        struct extent_buffer *right = path->nodes[0];
3827        int i;
3828        int push_space = 0;
3829        int push_items = 0;
3830        struct btrfs_item *item;
3831        u32 old_left_nritems;
3832        u32 nr;
3833        int ret = 0;
3834        u32 this_item_size;
3835        u32 old_left_item_size;
3836        struct btrfs_map_token token;
3837
3838        btrfs_init_map_token(&token);
3839
3840        if (empty)
3841                nr = min(right_nritems, max_slot);
3842        else
3843                nr = min(right_nritems - 1, max_slot);
3844
3845        for (i = 0; i < nr; i++) {
3846                item = btrfs_item_nr(i);
3847
3848                if (!empty && push_items > 0) {
3849                        if (path->slots[0] < i)
3850                                break;
3851                        if (path->slots[0] == i) {
3852                                int space = btrfs_leaf_free_space(root, right);
3853                                if (space + push_space * 2 > free_space)
3854                                        break;
3855                        }
3856                }
3857
3858                if (path->slots[0] == i)
3859                        push_space += data_size;
3860
3861                this_item_size = btrfs_item_size(right, item);
3862                if (this_item_size + sizeof(*item) + push_space > free_space)
3863                        break;
3864
3865                push_items++;
3866                push_space += this_item_size + sizeof(*item);
3867        }
3868
3869        if (push_items == 0) {
3870                ret = 1;
3871                goto out;
3872        }
3873        WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3874
3875        /* push data from right to left */
3876        copy_extent_buffer(left, right,
3877                           btrfs_item_nr_offset(btrfs_header_nritems(left)),
3878                           btrfs_item_nr_offset(0),
3879                           push_items * sizeof(struct btrfs_item));
3880
3881        push_space = BTRFS_LEAF_DATA_SIZE(root) -
3882                     btrfs_item_offset_nr(right, push_items - 1);
3883
3884        copy_extent_buffer(left, right, btrfs_leaf_data(left) +
3885                     leaf_data_end(root, left) - push_space,
3886                     btrfs_leaf_data(right) +
3887                     btrfs_item_offset_nr(right, push_items - 1),
3888                     push_space);
3889        old_left_nritems = btrfs_header_nritems(left);
3890        BUG_ON(old_left_nritems <= 0);
3891
3892        old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3893        for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3894                u32 ioff;
3895
3896                item = btrfs_item_nr(i);
3897
3898                ioff = btrfs_token_item_offset(left, item, &token);
3899                btrfs_set_token_item_offset(left, item,
3900                      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
3901                      &token);
3902        }
3903        btrfs_set_header_nritems(left, old_left_nritems + push_items);
3904
3905        /* fixup right node */
3906        if (push_items > right_nritems)
3907                WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3908                       right_nritems);
3909
3910        if (push_items < right_nritems) {
3911                push_space = btrfs_item_offset_nr(right, push_items - 1) -
3912                                                  leaf_data_end(root, right);
3913                memmove_extent_buffer(right, btrfs_leaf_data(right) +
3914                                      BTRFS_LEAF_DATA_SIZE(root) - push_space,
3915                                      btrfs_leaf_data(right) +
3916                                      leaf_data_end(root, right), push_space);
3917
3918                memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3919                              btrfs_item_nr_offset(push_items),
3920                             (btrfs_header_nritems(right) - push_items) *
3921                             sizeof(struct btrfs_item));
3922        }
3923        right_nritems -= push_items;
3924        btrfs_set_header_nritems(right, right_nritems);
3925        push_space = BTRFS_LEAF_DATA_SIZE(root);
3926        for (i = 0; i < right_nritems; i++) {
3927                item = btrfs_item_nr(i);
3928
3929                push_space = push_space - btrfs_token_item_size(right,
3930                                                                item, &token);
3931                btrfs_set_token_item_offset(right, item, push_space, &token);
3932        }
3933
3934        btrfs_mark_buffer_dirty(left);
3935        if (right_nritems)
3936                btrfs_mark_buffer_dirty(right);
3937        else
3938                clean_tree_block(trans, root->fs_info, right);
3939
3940        btrfs_item_key(right, &disk_key, 0);
3941        fixup_low_keys(root->fs_info, path, &disk_key, 1);
3942
3943        /* then fixup the leaf pointer in the path */
3944        if (path->slots[0] < push_items) {
3945                path->slots[0] += old_left_nritems;
3946                btrfs_tree_unlock(path->nodes[0]);
3947                free_extent_buffer(path->nodes[0]);
3948                path->nodes[0] = left;
3949                path->slots[1] -= 1;
3950        } else {
3951                btrfs_tree_unlock(left);
3952                free_extent_buffer(left);
3953                path->slots[0] -= push_items;
3954        }
3955        BUG_ON(path->slots[0] < 0);
3956        return ret;
3957out:
3958        btrfs_tree_unlock(left);
3959        free_extent_buffer(left);
3960        return ret;
3961}
3962
3963/*
3964 * push some data in the path leaf to the left, trying to free up at
3965 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3966 *
3967 * max_slot can put a limit on how far into the leaf we'll push items.  The
3968 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3969 * items
3970 */
3971static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3972                          *root, struct btrfs_path *path, int min_data_size,
3973                          int data_size, int empty, u32 max_slot)
3974{
3975        struct extent_buffer *right = path->nodes[0];
3976        struct extent_buffer *left;
3977        int slot;
3978        int free_space;
3979        u32 right_nritems;
3980        int ret = 0;
3981
3982        slot = path->slots[1];
3983        if (slot == 0)
3984                return 1;
3985        if (!path->nodes[1])
3986                return 1;
3987
3988        right_nritems = btrfs_header_nritems(right);
3989        if (right_nritems == 0)
3990                return 1;
3991
3992        btrfs_assert_tree_locked(path->nodes[1]);
3993
3994        left = read_node_slot(root, path->nodes[1], slot - 1);
3995        if (left == NULL)
3996                return 1;
3997
3998        btrfs_tree_lock(left);
3999        btrfs_set_lock_blocking(left);
4000
4001        free_space = btrfs_leaf_free_space(root, left);
4002        if (free_space < data_size) {
4003                ret = 1;
4004                goto out;
4005        }
4006
4007        /* cow and double check */
4008        ret = btrfs_cow_block(trans, root, left,
4009                              path->nodes[1], slot - 1, &left);
4010        if (ret) {
4011                /* we hit -ENOSPC, but it isn't fatal here */
4012                if (ret == -ENOSPC)
4013                        ret = 1;
4014                goto out;
4015        }
4016
4017        free_space = btrfs_leaf_free_space(root, left);
4018        if (free_space < data_size) {
4019                ret = 1;
4020                goto out;
4021        }
4022
4023        return __push_leaf_left(trans, root, path, min_data_size,
4024                               empty, left, free_space, right_nritems,
4025                               max_slot);
4026out:
4027        btrfs_tree_unlock(left);
4028        free_extent_buffer(left);
4029        return ret;
4030}
4031
4032/*
4033 * split the path's leaf in two, making sure there is at least data_size
4034 * available for the resulting leaf level of the path.
4035 */
4036static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4037                                    struct btrfs_root *root,
4038                                    struct btrfs_path *path,
4039                                    struct extent_buffer *l,
4040                                    struct extent_buffer *right,
4041                                    int slot, int mid, int nritems)
4042{
4043        int data_copy_size;
4044        int rt_data_off;
4045        int i;
4046        struct btrfs_disk_key disk_key;
4047        struct btrfs_map_token token;
4048
4049        btrfs_init_map_token(&token);
4050
4051        nritems = nritems - mid;
4052        btrfs_set_header_nritems(right, nritems);
4053        data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
4054
4055        copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4056                           btrfs_item_nr_offset(mid),
4057                           nritems * sizeof(struct btrfs_item));
4058
4059        copy_extent_buffer(right, l,
4060                     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
4061                     data_copy_size, btrfs_leaf_data(l) +
4062                     leaf_data_end(root, l), data_copy_size);
4063
4064        rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
4065                      btrfs_item_end_nr(l, mid);
4066
4067        for (i = 0; i < nritems; i++) {
4068                struct btrfs_item *item = btrfs_item_nr(i);
4069                u32 ioff;
4070
4071                ioff = btrfs_token_item_offset(right, item, &token);
4072                btrfs_set_token_item_offset(right, item,
4073                                            ioff + rt_data_off, &token);
4074        }
4075
4076        btrfs_set_header_nritems(l, mid);
4077        btrfs_item_key(right, &disk_key, 0);
4078        insert_ptr(trans, root, path, &disk_key, right->start,
4079                   path->slots[1] + 1, 1);
4080
4081        btrfs_mark_buffer_dirty(right);
4082        btrfs_mark_buffer_dirty(l);
4083        BUG_ON(path->slots[0] != slot);
4084
4085        if (mid <= slot) {
4086                btrfs_tree_unlock(path->nodes[0]);
4087                free_extent_buffer(path->nodes[0]);
4088                path->nodes[0] = right;
4089                path->slots[0] -= mid;
4090                path->slots[1] += 1;
4091        } else {
4092                btrfs_tree_unlock(right);
4093                free_extent_buffer(right);
4094        }
4095
4096        BUG_ON(path->slots[0] < 0);
4097}
4098
4099/*
4100 * double splits happen when we need to insert a big item in the middle
4101 * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4102 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4103 *          A                 B                 C
4104 *
4105 * We avoid this by trying to push the items on either side of our target
4106 * into the adjacent leaves.  If all goes well we can avoid the double split
4107 * completely.
4108 */
4109static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4110                                          struct btrfs_root *root,
4111                                          struct btrfs_path *path,
4112                                          int data_size)
4113{
4114        int ret;
4115        int progress = 0;
4116        int slot;
4117        u32 nritems;
4118        int space_needed = data_size;
4119
4120        slot = path->slots[0];
4121        if (slot < btrfs_header_nritems(path->nodes[0]))
4122                space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);
4123
4124        /*
4125         * try to push all the items after our slot into the
4126         * right leaf
4127         */
4128        ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4129        if (ret < 0)
4130                return ret;
4131
4132        if (ret == 0)
4133                progress++;
4134
4135        nritems = btrfs_header_nritems(path->nodes[0]);
4136        /*
4137         * our goal is to get our slot at the start or end of a leaf.  If
4138         * we've done so we're done
4139         */
4140        if (path->slots[0] == 0 || path->slots[0] == nritems)
4141                return 0;
4142
4143        if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4144                return 0;
4145
4146        /* try to push all the items before our slot into the next leaf */
4147        slot = path->slots[0];
4148        ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4149        if (ret < 0)
4150                return ret;
4151
4152        if (ret == 0)
4153                progress++;
4154
4155        if (progress)
4156                return 0;
4157        return 1;
4158}
4159
4160/*
4161 * split the path's leaf in two, making sure there is at least data_size
4162 * available for the resulting leaf level of the path.
4163 *
4164 * returns 0 if all went well and < 0 on failure.
4165 */
4166static noinline int split_leaf(struct btrfs_trans_handle *trans,
4167                               struct btrfs_root *root,
4168                               struct btrfs_key *ins_key,
4169                               struct btrfs_path *path, int data_size,
4170                               int extend)
4171{
4172        struct btrfs_disk_key disk_key;
4173        struct extent_buffer *l;
4174        u32 nritems;
4175        int mid;
4176        int slot;
4177        struct extent_buffer *right;
4178        struct btrfs_fs_info *fs_info = root->fs_info;
4179        int ret = 0;
4180        int wret;
4181        int split;
4182        int num_doubles = 0;
4183        int tried_avoid_double = 0;
4184
4185        l = path->nodes[0];
4186        slot = path->slots[0];
4187        if (extend && data_size + btrfs_item_size_nr(l, slot) +
4188            sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
4189                return -EOVERFLOW;
4190
4191        /* first try to make some room by pushing left and right */
4192        if (data_size && path->nodes[1]) {
4193                int space_needed = data_size;
4194
4195                if (slot < btrfs_header_nritems(l))
4196                        space_needed -= btrfs_leaf_free_space(root, l);
4197
4198                wret = push_leaf_right(trans, root, path, space_needed,
4199                                       space_needed, 0, 0);
4200                if (wret < 0)
4201                        return wret;
4202                if (wret) {
4203                        wret = push_leaf_left(trans, root, path, space_needed,
4204                                              space_needed, 0, (u32)-1);
4205                        if (wret < 0)
4206                                return wret;
4207                }
4208                l = path->nodes[0];
4209
4210                /* did the pushes work? */
4211                if (btrfs_leaf_free_space(root, l) >= data_size)
4212                        return 0;
4213        }
4214
4215        if (!path->nodes[1]) {
4216                ret = insert_new_root(trans, root, path, 1);
4217                if (ret)
4218                        return ret;
4219        }
4220again:
4221        split = 1;
4222        l = path->nodes[0];
4223        slot = path->slots[0];
4224        nritems = btrfs_header_nritems(l);
4225        mid = (nritems + 1) / 2;
4226
4227        if (mid <= slot) {
4228                if (nritems == 1 ||
4229                    leaf_space_used(l, mid, nritems - mid) + data_size >
4230                        BTRFS_LEAF_DATA_SIZE(root)) {
4231                        if (slot >= nritems) {
4232                                split = 0;
4233                        } else {
4234                                mid = slot;
4235                                if (mid != nritems &&
4236                                    leaf_space_used(l, mid, nritems - mid) +
4237                                    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4238                                        if (data_size && !tried_avoid_double)
4239                                                goto push_for_double;
4240                                        split = 2;
4241                                }
4242                        }
4243                }
4244        } else {
4245                if (leaf_space_used(l, 0, mid) + data_size >
4246                        BTRFS_LEAF_DATA_SIZE(root)) {
4247                        if (!extend && data_size && slot == 0) {
4248                                split = 0;
4249                        } else if ((extend || !data_size) && slot == 0) {
4250                                mid = 1;
4251                        } else {
4252                                mid = slot;
4253                                if (mid != nritems &&
4254                                    leaf_space_used(l, mid, nritems - mid) +
4255                                    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4256                                        if (data_size && !tried_avoid_double)
4257                                                goto push_for_double;
4258                                        split = 2;
4259                                }
4260                        }
4261                }
4262        }
4263
4264        if (split == 0)
4265                btrfs_cpu_key_to_disk(&disk_key, ins_key);
4266        else
4267                btrfs_item_key(l, &disk_key, mid);
4268
4269        right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
4270                        &disk_key, 0, l->start, 0);
4271        if (IS_ERR(right))
4272                return PTR_ERR(right);
4273
4274        root_add_used(root, root->nodesize);
4275
4276        memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
4277        btrfs_set_header_bytenr(right, right->start);
4278        btrfs_set_header_generation(right, trans->transid);
4279        btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
4280        btrfs_set_header_owner(right, root->root_key.objectid);
4281        btrfs_set_header_level(right, 0);
4282        write_extent_buffer(right, fs_info->fsid,
4283                            btrfs_header_fsid(), BTRFS_FSID_SIZE);
4284
4285        write_extent_buffer(right, fs_info->chunk_tree_uuid,
4286                            btrfs_header_chunk_tree_uuid(right),
4287                            BTRFS_UUID_SIZE);
4288
4289        if (split == 0) {
4290                if (mid <= slot) {
4291                        btrfs_set_header_nritems(right, 0);
4292                        insert_ptr(trans, root, path, &disk_key, right->start,
4293                                   path->slots[1] + 1, 1);
4294                        btrfs_tree_unlock(path->nodes[0]);
4295                        free_extent_buffer(path->nodes[0]);
4296                        path->nodes[0] = right;
4297                        path->slots[0] = 0;
4298                        path->slots[1] += 1;
4299                } else {
4300                        btrfs_set_header_nritems(right, 0);
4301                        insert_ptr(trans, root, path, &disk_key, right->start,
4302                                          path->slots[1], 1);
4303                        btrfs_tree_unlock(path->nodes[0]);
4304                        free_extent_buffer(path->nodes[0]);
4305                        path->nodes[0] = right;
4306                        path->slots[0] = 0;
4307                        if (path->slots[1] == 0)
4308                                fixup_low_keys(fs_info, path, &disk_key, 1);
4309                }
4310                btrfs_mark_buffer_dirty(right);
4311                return ret;
4312        }
4313
4314        copy_for_split(trans, root, path, l, right, slot, mid, nritems);
4315
4316        if (split == 2) {
4317                BUG_ON(num_doubles != 0);
4318                num_doubles++;
4319                goto again;
4320        }
4321
4322        return 0;
4323
4324push_for_double:
4325        push_for_double_split(trans, root, path, data_size);
4326        tried_avoid_double = 1;
4327        if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4328                return 0;
4329        goto again;
4330}
4331
4332static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4333                                         struct btrfs_root *root,
4334                                         struct btrfs_path *path, int ins_len)
4335{
4336        struct btrfs_key key;
4337        struct extent_buffer *leaf;
4338        struct btrfs_file_extent_item *fi;
4339        u64 extent_len = 0;
4340        u32 item_size;
4341        int ret;
4342
4343        leaf = path->nodes[0];
4344        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4345
4346        BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4347               key.type != BTRFS_EXTENT_CSUM_KEY);
4348
4349        if (btrfs_leaf_free_space(root, leaf) >= ins_len)
4350                return 0;
4351
4352        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4353        if (key.type == BTRFS_EXTENT_DATA_KEY) {
4354                fi = btrfs_item_ptr(leaf, path->slots[0],
4355                                    struct btrfs_file_extent_item);
4356                extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4357        }
4358        btrfs_release_path(path);
4359
4360        path->keep_locks = 1;
4361        path->search_for_split = 1;
4362        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4363        path->search_for_split = 0;
4364        if (ret > 0)
4365                ret = -EAGAIN;
4366        if (ret < 0)
4367                goto err;
4368
4369        ret = -EAGAIN;
4370        leaf = path->nodes[0];
4371        /* if our item isn't there, return now */
4372        if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4373                goto err;
4374
4375        /* the leaf has  changed, it now has room.  return now */
4376        if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
4377                goto err;
4378
4379        if (key.type == BTRFS_EXTENT_DATA_KEY) {
4380                fi = btrfs_item_ptr(leaf, path->slots[0],
4381                                    struct btrfs_file_extent_item);
4382                if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4383                        goto err;
4384        }
4385
4386        btrfs_set_path_blocking(path);
4387        ret = split_leaf(trans, root, &key, path, ins_len, 1);
4388        if (ret)
4389                goto err;
4390
4391        path->keep_locks = 0;
4392        btrfs_unlock_up_safe(path, 1);
4393        return 0;
4394err:
4395        path->keep_locks = 0;
4396        return ret;
4397}
4398
4399static noinline int split_item(struct btrfs_trans_handle *trans,
4400                               struct btrfs_root *root,
4401                               struct btrfs_path *path,
4402                               struct btrfs_key *new_key,
4403                               unsigned long split_offset)
4404{
4405        struct extent_buffer *leaf;
4406        struct btrfs_item *item;
4407        struct btrfs_item *new_item;
4408        int slot;
4409        char *buf;
4410        u32 nritems;
4411        u32 item_size;
4412        u32 orig_offset;
4413        struct btrfs_disk_key disk_key;
4414
4415        leaf = path->nodes[0];
4416        BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
4417
4418        btrfs_set_path_blocking(path);
4419
4420        item = btrfs_item_nr(path->slots[0]);
4421        orig_offset = btrfs_item_offset(leaf, item);
4422        item_size = btrfs_item_size(leaf, item);
4423
4424        buf = kmalloc(item_size, GFP_NOFS);
4425        if (!buf)
4426                return -ENOMEM;
4427
4428        read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4429                            path->slots[0]), item_size);
4430
4431        slot = path->slots[0] + 1;
4432        nritems = btrfs_header_nritems(leaf);
4433        if (slot != nritems) {
4434                /* shift the items */
4435                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4436                                btrfs_item_nr_offset(slot),
4437                                (nritems - slot) * sizeof(struct btrfs_item));
4438        }
4439
4440        btrfs_cpu_key_to_disk(&disk_key, new_key);
4441        btrfs_set_item_key(leaf, &disk_key, slot);
4442
4443        new_item = btrfs_item_nr(slot);
4444
4445        btrfs_set_item_offset(leaf, new_item, orig_offset);
4446        btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4447
4448        btrfs_set_item_offset(leaf, item,
4449                              orig_offset + item_size - split_offset);
4450        btrfs_set_item_size(leaf, item, split_offset);
4451
4452        btrfs_set_header_nritems(leaf, nritems + 1);
4453
4454        /* write the data for the start of the original item */
4455        write_extent_buffer(leaf, buf,
4456                            btrfs_item_ptr_offset(leaf, path->slots[0]),
4457                            split_offset);
4458
4459        /* write the data for the new item */
4460        write_extent_buffer(leaf, buf + split_offset,
4461                            btrfs_item_ptr_offset(leaf, slot),
4462                            item_size - split_offset);
4463        btrfs_mark_buffer_dirty(leaf);
4464
4465        BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
4466        kfree(buf);
4467        return 0;
4468}
4469
4470/*
4471 * This function splits a single item into two items,
4472 * giving 'new_key' to the new item and splitting the
4473 * old one at split_offset (from the start of the item).
4474 *
4475 * The path may be released by this operation.  After
4476 * the split, the path is pointing to the old item.  The
4477 * new item is going to be in the same node as the old one.
4478 *
4479 * Note, the item being split must be smaller enough to live alone on
4480 * a tree block with room for one extra struct btrfs_item
4481 *
4482 * This allows us to split the item in place, keeping a lock on the
4483 * leaf the entire time.
4484 */
4485int btrfs_split_item(struct btrfs_trans_handle *trans,
4486                     struct btrfs_root *root,
4487                     struct btrfs_path *path,
4488                     struct btrfs_key *new_key,
4489                     unsigned long split_offset)
4490{
4491        int ret;
4492        ret = setup_leaf_for_split(trans, root, path,
4493                                   sizeof(struct btrfs_item));
4494        if (ret)
4495                return ret;
4496
4497        ret = split_item(trans, root, path, new_key, split_offset);
4498        return ret;
4499}
4500
4501/*
4502 * This function duplicate a item, giving 'new_key' to the new item.
4503 * It guarantees both items live in the same tree leaf and the new item
4504 * is contiguous with the original item.
4505 *
4506 * This allows us to split file extent in place, keeping a lock on the
4507 * leaf the entire time.
4508 */
4509int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4510                         struct btrfs_root *root,
4511                         struct btrfs_path *path,
4512                         struct btrfs_key *new_key)
4513{
4514        struct extent_buffer *leaf;
4515        int ret;
4516        u32 item_size;
4517
4518        leaf = path->nodes[0];
4519        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4520        ret = setup_leaf_for_split(trans, root, path,
4521                                   item_size + sizeof(struct btrfs_item));
4522        if (ret)
4523                return ret;
4524
4525        path->slots[0]++;
4526        setup_items_for_insert(root, path, new_key, &item_size,
4527                               item_size, item_size +
4528                               sizeof(struct btrfs_item), 1);
4529        leaf = path->nodes[0];
4530        memcpy_extent_buffer(leaf,
4531                             btrfs_item_ptr_offset(leaf, path->slots[0]),
4532                             btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4533                             item_size);
4534        return 0;
4535}
4536
4537/*
4538 * make the item pointed to by the path smaller.  new_size indicates
4539 * how small to make it, and from_end tells us if we just chop bytes
4540 * off the end of the item or if we shift the item to chop bytes off
4541 * the front.
4542 */
4543void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
4544                         u32 new_size, int from_end)
4545{
4546        int slot;
4547        struct extent_buffer *leaf;
4548        struct btrfs_item *item;
4549        u32 nritems;
4550        unsigned int data_end;
4551        unsigned int old_data_start;
4552        unsigned int old_size;
4553        unsigned int size_diff;
4554        int i;
4555        struct btrfs_map_token token;
4556
4557        btrfs_init_map_token(&token);
4558
4559        leaf = path->nodes[0];
4560        slot = path->slots[0];
4561
4562        old_size = btrfs_item_size_nr(leaf, slot);
4563        if (old_size == new_size)
4564                return;
4565
4566        nritems = btrfs_header_nritems(leaf);
4567        data_end = leaf_data_end(root, leaf);
4568
4569        old_data_start = btrfs_item_offset_nr(leaf, slot);
4570
4571        size_diff = old_size - new_size;
4572
4573        BUG_ON(slot < 0);
4574        BUG_ON(slot >= nritems);
4575
4576        /*
4577         * item0..itemN ... dataN.offset..dataN.size .. data0.size
4578         */
4579        /* first correct the data pointers */
4580        for (i = slot; i < nritems; i++) {
4581                u32 ioff;
4582                item = btrfs_item_nr(i);
4583
4584                ioff = btrfs_token_item_offset(leaf, item, &token);
4585                btrfs_set_token_item_offset(leaf, item,
4586                                            ioff + size_diff, &token);
4587        }
4588
4589        /* shift the data */
4590        if (from_end) {
4591                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4592                              data_end + size_diff, btrfs_leaf_data(leaf) +
4593                              data_end, old_data_start + new_size - data_end);
4594        } else {
4595                struct btrfs_disk_key disk_key;
4596                u64 offset;
4597
4598                btrfs_item_key(leaf, &disk_key, slot);
4599
4600                if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4601                        unsigned long ptr;
4602                        struct btrfs_file_extent_item *fi;
4603
4604                        fi = btrfs_item_ptr(leaf, slot,
4605                                            struct btrfs_file_extent_item);
4606                        fi = (struct btrfs_file_extent_item *)(
4607                             (unsigned long)fi - size_diff);
4608
4609                        if (btrfs_file_extent_type(leaf, fi) ==
4610                            BTRFS_FILE_EXTENT_INLINE) {
4611                                ptr = btrfs_item_ptr_offset(leaf, slot);
4612                                memmove_extent_buffer(leaf, ptr,
4613                                      (unsigned long)fi,
4614                                      BTRFS_FILE_EXTENT_INLINE_DATA_START);
4615                        }
4616                }
4617
4618                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4619                              data_end + size_diff, btrfs_leaf_data(leaf) +
4620                              data_end, old_data_start - data_end);
4621
4622                offset = btrfs_disk_key_offset(&disk_key);
4623                btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4624                btrfs_set_item_key(leaf, &disk_key, slot);
4625                if (slot == 0)
4626                        fixup_low_keys(root->fs_info, path, &disk_key, 1);
4627        }
4628
4629        item = btrfs_item_nr(slot);
4630        btrfs_set_item_size(leaf, item, new_size);
4631        btrfs_mark_buffer_dirty(leaf);
4632
4633        if (btrfs_leaf_free_space(root, leaf) < 0) {
4634                btrfs_print_leaf(root, leaf);
4635                BUG();
4636        }
4637}
4638
4639/*
4640 * make the item pointed to by the path bigger, data_size is the added size.
4641 */
4642void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
4643                       u32 data_size)
4644{
4645        int slot;
4646        struct extent_buffer *leaf;
4647        struct btrfs_item *item;
4648        u32 nritems;
4649        unsigned int data_end;
4650        unsigned int old_data;
4651        unsigned int old_size;
4652        int i;
4653        struct btrfs_map_token token;
4654
4655        btrfs_init_map_token(&token);
4656
4657        leaf = path->nodes[0];
4658
4659        nritems = btrfs_header_nritems(leaf);
4660        data_end = leaf_data_end(root, leaf);
4661
4662        if (btrfs_leaf_free_space(root, leaf) < data_size) {
4663                btrfs_print_leaf(root, leaf);
4664                BUG();
4665        }
4666        slot = path->slots[0];
4667        old_data = btrfs_item_end_nr(leaf, slot);
4668
4669        BUG_ON(slot < 0);
4670        if (slot >= nritems) {
4671                btrfs_print_leaf(root, leaf);
4672                btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
4673                       slot, nritems);
4674                BUG_ON(1);
4675        }
4676
4677        /*
4678         * item0..itemN ... dataN.offset..dataN.size .. data0.size
4679         */
4680        /* first correct the data pointers */
4681        for (i = slot; i < nritems; i++) {
4682                u32 ioff;
4683                item = btrfs_item_nr(i);
4684
4685                ioff = btrfs_token_item_offset(leaf, item, &token);
4686                btrfs_set_token_item_offset(leaf, item,
4687                                            ioff - data_size, &token);
4688        }
4689
4690        /* shift the data */
4691        memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4692                      data_end - data_size, btrfs_leaf_data(leaf) +
4693                      data_end, old_data - data_end);
4694
4695        data_end = old_data;
4696        old_size = btrfs_item_size_nr(leaf, slot);
4697        item = btrfs_item_nr(slot);
4698        btrfs_set_item_size(leaf, item, old_size + data_size);
4699        btrfs_mark_buffer_dirty(leaf);
4700
4701        if (btrfs_leaf_free_space(root, leaf) < 0) {
4702                btrfs_print_leaf(root, leaf);
4703                BUG();
4704        }
4705}
4706
4707/*
4708 * this is a helper for btrfs_insert_empty_items, the main goal here is
4709 * to save stack depth by doing the bulk of the work in a function
4710 * that doesn't call btrfs_search_slot
4711 */
4712void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4713                            struct btrfs_key *cpu_key, u32 *data_size,
4714                            u32 total_data, u32 total_size, int nr)
4715{
4716        struct btrfs_item *item;
4717        int i;
4718        u32 nritems;
4719        unsigned int data_end;
4720        struct btrfs_disk_key disk_key;
4721        struct extent_buffer *leaf;
4722        int slot;
4723        struct btrfs_map_token token;
4724
4725        if (path->slots[0] == 0) {
4726                btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4727                fixup_low_keys(root->fs_info, path, &disk_key, 1);
4728        }
4729        btrfs_unlock_up_safe(path, 1);
4730
4731        btrfs_init_map_token(&token);
4732
4733        leaf = path->nodes[0];
4734        slot = path->slots[0];
4735
4736        nritems = btrfs_header_nritems(leaf);
4737        data_end = leaf_data_end(root, leaf);
4738
4739        if (btrfs_leaf_free_space(root, leaf) < total_size) {
4740                btrfs_print_leaf(root, leaf);
4741                btrfs_crit(root->fs_info, "not enough freespace need %u have %d",
4742                       total_size, btrfs_leaf_free_space(root, leaf));
4743                BUG();
4744        }
4745
4746        if (slot != nritems) {
4747                unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4748
4749                if (old_data < data_end) {
4750                        btrfs_print_leaf(root, leaf);
4751                        btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d",
4752                               slot, old_data, data_end);
4753                        BUG_ON(1);
4754                }
4755                /*
4756                 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4757                 */
4758                /* first correct the data pointers */
4759                for (i = slot; i < nritems; i++) {
4760                        u32 ioff;
4761
4762                        item = btrfs_item_nr( i);
4763                        ioff = btrfs_token_item_offset(leaf, item, &token);
4764                        btrfs_set_token_item_offset(leaf, item,
4765                                                    ioff - total_data, &token);
4766                }
4767                /* shift the items */
4768                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4769                              btrfs_item_nr_offset(slot),
4770                              (nritems - slot) * sizeof(struct btrfs_item));
4771
4772                /* shift the data */
4773                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4774                              data_end - total_data, btrfs_leaf_data(leaf) +
4775                              data_end, old_data - data_end);
4776                data_end = old_data;
4777        }
4778
4779        /* setup the item for the new data */
4780        for (i = 0; i < nr; i++) {
4781                btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4782                btrfs_set_item_key(leaf, &disk_key, slot + i);
4783                item = btrfs_item_nr(slot + i);
4784                btrfs_set_token_item_offset(leaf, item,
4785                                            data_end - data_size[i], &token);
4786                data_end -= data_size[i];
4787                btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4788        }
4789
4790        btrfs_set_header_nritems(leaf, nritems + nr);
4791        btrfs_mark_buffer_dirty(leaf);
4792
4793        if (btrfs_leaf_free_space(root, leaf) < 0) {
4794                btrfs_print_leaf(root, leaf);
4795                BUG();
4796        }
4797}
4798
4799/*
4800 * Given a key and some data, insert items into the tree.
4801 * This does all the path init required, making room in the tree if needed.
4802 */
4803int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4804                            struct btrfs_root *root,
4805                            struct btrfs_path *path,
4806                            struct btrfs_key *cpu_key, u32 *data_size,
4807                            int nr)
4808{
4809        int ret = 0;
4810        int slot;
4811        int i;
4812        u32 total_size = 0;
4813        u32 total_data = 0;
4814
4815        for (i = 0; i < nr; i++)
4816                total_data += data_size[i];
4817
4818        total_size = total_data + (nr * sizeof(struct btrfs_item));
4819        ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4820        if (ret == 0)
4821                return -EEXIST;
4822        if (ret < 0)
4823                return ret;
4824
4825        slot = path->slots[0];
4826        BUG_ON(slot < 0);
4827
4828        setup_items_for_insert(root, path, cpu_key, data_size,
4829                               total_data, total_size, nr);
4830        return 0;
4831}
4832
4833/*
4834 * Given a key and some data, insert an item into the tree.
4835 * This does all the path init required, making room in the tree if needed.
4836 */
4837int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4838                      *root, struct btrfs_key *cpu_key, void *data, u32
4839                      data_size)
4840{
4841        int ret = 0;
4842        struct btrfs_path *path;
4843        struct extent_buffer *leaf;
4844        unsigned long ptr;
4845
4846        path = btrfs_alloc_path();
4847        if (!path)
4848                return -ENOMEM;
4849        ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4850        if (!ret) {
4851                leaf = path->nodes[0];
4852                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4853                write_extent_buffer(leaf, data, ptr, data_size);
4854                btrfs_mark_buffer_dirty(leaf);
4855        }
4856        btrfs_free_path(path);
4857        return ret;
4858}
4859
4860/*
4861 * delete the pointer from a given node.
4862 *
4863 * the tree should have been previously balanced so the deletion does not
4864 * empty a node.
4865 */
4866static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4867                    int level, int slot)
4868{
4869        struct extent_buffer *parent = path->nodes[level];
4870        u32 nritems;
4871        int ret;
4872
4873        nritems = btrfs_header_nritems(parent);
4874        if (slot != nritems - 1) {
4875                if (level)
4876                        tree_mod_log_eb_move(root->fs_info, parent, slot,
4877                                             slot + 1, nritems - slot - 1);
4878                memmove_extent_buffer(parent,
4879                              btrfs_node_key_ptr_offset(slot),
4880                              btrfs_node_key_ptr_offset(slot + 1),
4881                              sizeof(struct btrfs_key_ptr) *
4882                              (nritems - slot - 1));
4883        } else if (level) {
4884                ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4885                                              MOD_LOG_KEY_REMOVE, GFP_NOFS);
4886                BUG_ON(ret < 0);
4887        }
4888
4889        nritems--;
4890        btrfs_set_header_nritems(parent, nritems);
4891        if (nritems == 0 && parent == root->node) {
4892                BUG_ON(btrfs_header_level(root->node) != 1);
4893                /* just turn the root into a leaf and break */
4894                btrfs_set_header_level(root->node, 0);
4895        } else if (slot == 0) {
4896                struct btrfs_disk_key disk_key;
4897
4898                btrfs_node_key(parent, &disk_key, 0);
4899                fixup_low_keys(root->fs_info, path, &disk_key, level + 1);
4900        }
4901        btrfs_mark_buffer_dirty(parent);
4902}
4903
4904/*
4905 * a helper function to delete the leaf pointed to by path->slots[1] and
4906 * path->nodes[1].
4907 *
4908 * This deletes the pointer in path->nodes[1] and frees the leaf
4909 * block extent.  zero is returned if it all worked out, < 0 otherwise.
4910 *
4911 * The path must have already been setup for deleting the leaf, including
4912 * all the proper balancing.  path->nodes[1] must be locked.
4913 */
4914static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4915                                    struct btrfs_root *root,
4916                                    struct btrfs_path *path,
4917                                    struct extent_buffer *leaf)
4918{
4919        WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4920        del_ptr(root, path, 1, path->slots[1]);
4921
4922        /*
4923         * btrfs_free_extent is expensive, we want to make sure we
4924         * aren't holding any locks when we call it
4925         */
4926        btrfs_unlock_up_safe(path, 0);
4927
4928        root_sub_used(root, leaf->len);
4929
4930        extent_buffer_get(leaf);
4931        btrfs_free_tree_block(trans, root, leaf, 0, 1);
4932        free_extent_buffer_stale(leaf);
4933}
4934/*
4935 * delete the item at the leaf level in path.  If that empties
4936 * the leaf, remove it from the tree
4937 */
4938int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4939                    struct btrfs_path *path, int slot, int nr)
4940{
4941        struct extent_buffer *leaf;
4942        struct btrfs_item *item;
4943        u32 last_off;
4944        u32 dsize = 0;
4945        int ret = 0;
4946        int wret;
4947        int i;
4948        u32 nritems;
4949        struct btrfs_map_token token;
4950
4951        btrfs_init_map_token(&token);
4952
4953        leaf = path->nodes[0];
4954        last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4955
4956        for (i = 0; i < nr; i++)
4957                dsize += btrfs_item_size_nr(leaf, slot + i);
4958
4959        nritems = btrfs_header_nritems(leaf);
4960
4961        if (slot + nr != nritems) {
4962                int data_end = leaf_data_end(root, leaf);
4963
4964                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4965                              data_end + dsize,
4966                              btrfs_leaf_data(leaf) + data_end,
4967                              last_off - data_end);
4968
4969                for (i = slot + nr; i < nritems; i++) {
4970                        u32 ioff;
4971
4972                        item = btrfs_item_nr(i);
4973                        ioff = btrfs_token_item_offset(leaf, item, &token);
4974                        btrfs_set_token_item_offset(leaf, item,
4975                                                    ioff + dsize, &token);
4976                }
4977
4978                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4979                              btrfs_item_nr_offset(slot + nr),
4980                              sizeof(struct btrfs_item) *
4981                              (nritems - slot - nr));
4982        }
4983        btrfs_set_header_nritems(leaf, nritems - nr);
4984        nritems -= nr;
4985
4986        /* delete the leaf if we've emptied it */
4987        if (nritems == 0) {
4988                if (leaf == root->node) {
4989                        btrfs_set_header_level(leaf, 0);
4990                } else {
4991                        btrfs_set_path_blocking(path);
4992                        clean_tree_block(trans, root->fs_info, leaf);
4993                        btrfs_del_leaf(trans, root, path, leaf);
4994                }
4995        } else {
4996                int used = leaf_space_used(leaf, 0, nritems);
4997                if (slot == 0) {
4998                        struct btrfs_disk_key disk_key;
4999
5000                        btrfs_item_key(leaf, &disk_key, 0);
5001                        fixup_low_keys(root->fs_info, path, &disk_key, 1);
5002                }
5003
5004                /* delete the leaf if it is mostly empty */
5005                if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
5006                        /* push_leaf_left fixes the path.
5007                         * make sure the path still points to our leaf
5008                         * for possible call to del_ptr below
5009                         */
5010                        slot = path->slots[1];
5011                        extent_buffer_get(leaf);
5012
5013                        btrfs_set_path_blocking(path);
5014                        wret = push_leaf_left(trans, root, path, 1, 1,
5015                                              1, (u32)-1);
5016                        if (wret < 0 && wret != -ENOSPC)
5017                                ret = wret;
5018
5019                        if (path->nodes[0] == leaf &&
5020                            btrfs_header_nritems(leaf)) {
5021                                wret = push_leaf_right(trans, root, path, 1,
5022                                                       1, 1, 0);
5023                                if (wret < 0 && wret != -ENOSPC)
5024                                        ret = wret;
5025                        }
5026
5027                        if (btrfs_header_nritems(leaf) == 0) {
5028                                path->slots[1] = slot;
5029                                btrfs_del_leaf(trans, root, path, leaf);
5030                                free_extent_buffer(leaf);
5031                                ret = 0;
5032                        } else {
5033                                /* if we're still in the path, make sure
5034                                 * we're dirty.  Otherwise, one of the
5035                                 * push_leaf functions must have already
5036                                 * dirtied this buffer
5037                                 */
5038                                if (path->nodes[0] == leaf)
5039                                        btrfs_mark_buffer_dirty(leaf);
5040                                free_extent_buffer(leaf);
5041                        }
5042                } else {
5043                        btrfs_mark_buffer_dirty(leaf);
5044                }
5045        }
5046        return ret;
5047}
5048
5049/*
5050 * search the tree again to find a leaf with lesser keys
5051 * returns 0 if it found something or 1 if there are no lesser leaves.
5052 * returns < 0 on io errors.
5053 *
5054 * This may release the path, and so you may lose any locks held at the
5055 * time you call it.
5056 */
5057int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5058{
5059        struct btrfs_key key;
5060        struct btrfs_disk_key found_key;
5061        int ret;
5062
5063        btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5064
5065        if (key.offset > 0) {
5066                key.offset--;
5067        } else if (key.type > 0) {
5068                key.type--;
5069                key.offset = (u64)-1;
5070        } else if (key.objectid > 0) {
5071                key.objectid--;
5072                key.type = (u8)-1;
5073                key.offset = (u64)-1;
5074        } else {
5075                return 1;
5076        }
5077
5078        btrfs_release_path(path);
5079        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5080        if (ret < 0)
5081                return ret;
5082        btrfs_item_key(path->nodes[0], &found_key, 0);
5083        ret = comp_keys(&found_key, &key);
5084        /*
5085         * We might have had an item with the previous key in the tree right
5086         * before we released our path. And after we released our path, that
5087         * item might have been pushed to the first slot (0) of the leaf we
5088         * were holding due to a tree balance. Alternatively, an item with the
5089         * previous key can exist as the only element of a leaf (big fat item).
5090         * Therefore account for these 2 cases, so that our callers (like
5091         * btrfs_previous_item) don't miss an existing item with a key matching
5092         * the previous key we computed above.
5093         */
5094        if (ret <= 0)
5095                return 0;
5096        return 1;
5097}
5098
5099/*
5100 * A helper function to walk down the tree starting at min_key, and looking
5101 * for nodes or leaves that are have a minimum transaction id.
5102 * This is used by the btree defrag code, and tree logging
5103 *
5104 * This does not cow, but it does stuff the starting key it finds back
5105 * into min_key, so you can call btrfs_search_slot with cow=1 on the
5106 * key and get a writable path.
5107 *
5108 * This does lock as it descends, and path->keep_locks should be set
5109 * to 1 by the caller.
5110 *
5111 * This honors path->lowest_level to prevent descent past a given level
5112 * of the tree.
5113 *
5114 * min_trans indicates the oldest transaction that you are interested
5115 * in walking through.  Any nodes or leaves older than min_trans are
5116 * skipped over (without reading them).
5117 *
5118 * returns zero if something useful was found, < 0 on error and 1 if there
5119 * was nothing in the tree that matched the search criteria.
5120 */
5121int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5122                         struct btrfs_path *path,
5123                         u64 min_trans)
5124{
5125        struct extent_buffer *cur;
5126        struct btrfs_key found_key;
5127        int slot;
5128        int sret;
5129        u32 nritems;
5130        int level;
5131        int ret = 1;
5132        int keep_locks = path->keep_locks;
5133
5134        path->keep_locks = 1;
5135again:
5136        cur = btrfs_read_lock_root_node(root);
5137        level = btrfs_header_level(cur);
5138        WARN_ON(path->nodes[level]);
5139        path->nodes[level] = cur;
5140        path->locks[level] = BTRFS_READ_LOCK;
5141
5142        if (btrfs_header_generation(cur) < min_trans) {
5143                ret = 1;
5144                goto out;
5145        }
5146        while (1) {
5147                nritems = btrfs_header_nritems(cur);
5148                level = btrfs_header_level(cur);
5149                sret = bin_search(cur, min_key, level, &slot);
5150
5151                /* at the lowest level, we're done, setup the path and exit */
5152                if (level == path->lowest_level) {
5153                        if (slot >= nritems)
5154                                goto find_next_key;
5155                        ret = 0;
5156                        path->slots[level] = slot;
5157                        btrfs_item_key_to_cpu(cur, &found_key, slot);
5158                        goto out;
5159                }
5160                if (sret && slot > 0)
5161                        slot--;
5162                /*
5163                 * check this node pointer against the min_trans parameters.
5164                 * If it is too old, old, skip to the next one.
5165                 */
5166                while (slot < nritems) {
5167                        u64 gen;
5168
5169                        gen = btrfs_node_ptr_generation(cur, slot);
5170                        if (gen < min_trans) {
5171                                slot++;
5172                                continue;
5173                        }
5174                        break;
5175                }
5176find_next_key:
5177                /*
5178                 * we didn't find a candidate key in this node, walk forward
5179                 * and find another one
5180                 */
5181                if (slot >= nritems) {
5182                        path->slots[level] = slot;
5183                        btrfs_set_path_blocking(path);
5184                        sret = btrfs_find_next_key(root, path, min_key, level,
5185                                                  min_trans);
5186                        if (sret == 0) {
5187                                btrfs_release_path(path);
5188                                goto again;
5189                        } else {
5190                                goto out;
5191                        }
5192                }
5193                /* save our key for returning back */
5194                btrfs_node_key_to_cpu(cur, &found_key, slot);
5195                path->slots[level] = slot;
5196                if (level == path->lowest_level) {
5197                        ret = 0;
5198                        goto out;
5199                }
5200                btrfs_set_path_blocking(path);
5201                cur = read_node_slot(root, cur, slot);
5202                BUG_ON(!cur); /* -ENOMEM */
5203
5204                btrfs_tree_read_lock(cur);
5205
5206                path->locks[level - 1] = BTRFS_READ_LOCK;
5207                path->nodes[level - 1] = cur;
5208                unlock_up(path, level, 1, 0, NULL);
5209                btrfs_clear_path_blocking(path, NULL, 0);
5210        }
5211out:
5212        path->keep_locks = keep_locks;
5213        if (ret == 0) {
5214                btrfs_unlock_up_safe(path, path->lowest_level + 1);
5215                btrfs_set_path_blocking(path);
5216                memcpy(min_key, &found_key, sizeof(found_key));
5217        }
5218        return ret;
5219}
5220
5221static void tree_move_down(struct btrfs_root *root,
5222                           struct btrfs_path *path,
5223                           int *level, int root_level)
5224{
5225        BUG_ON(*level == 0);
5226        path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5227                                        path->slots[*level]);
5228        path->slots[*level - 1] = 0;
5229        (*level)--;
5230}
5231
5232static int tree_move_next_or_upnext(struct btrfs_root *root,
5233                                    struct btrfs_path *path,
5234                                    int *level, int root_level)
5235{
5236        int ret = 0;
5237        int nritems;
5238        nritems = btrfs_header_nritems(path->nodes[*level]);
5239
5240        path->slots[*level]++;
5241
5242        while (path->slots[*level] >= nritems) {
5243                if (*level == root_level)
5244                        return -1;
5245
5246                /* move upnext */
5247                path->slots[*level] = 0;
5248                free_extent_buffer(path->nodes[*level]);
5249                path->nodes[*level] = NULL;
5250                (*level)++;
5251                path->slots[*level]++;
5252
5253                nritems = btrfs_header_nritems(path->nodes[*level]);
5254                ret = 1;
5255        }
5256        return ret;
5257}
5258
5259/*
5260 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5261 * or down.
5262 */
5263static int tree_advance(struct btrfs_root *root,
5264                        struct btrfs_path *path,
5265                        int *level, int root_level,
5266                        int allow_down,
5267                        struct btrfs_key *key)
5268{
5269        int ret;
5270
5271        if (*level == 0 || !allow_down) {
5272                ret = tree_move_next_or_upnext(root, path, level, root_level);
5273        } else {
5274                tree_move_down(root, path, level, root_level);
5275                ret = 0;
5276        }
5277        if (ret >= 0) {
5278                if (*level == 0)
5279                        btrfs_item_key_to_cpu(path->nodes[*level], key,
5280                                        path->slots[*level]);
5281                else
5282                        btrfs_node_key_to_cpu(path->nodes[*level], key,
5283                                        path->slots[*level]);
5284        }
5285        return ret;
5286}
5287
5288static int tree_compare_item(struct btrfs_root *left_root,
5289                             struct btrfs_path *left_path,
5290                             struct btrfs_path *right_path,
5291                             char *tmp_buf)
5292{
5293        int cmp;
5294        int len1, len2;
5295        unsigned long off1, off2;
5296
5297        len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5298        len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5299        if (len1 != len2)
5300                return 1;
5301
5302        off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5303        off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5304                                right_path->slots[0]);
5305
5306        read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5307
5308        cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5309        if (cmp)
5310                return 1;
5311        return 0;
5312}
5313
5314#define ADVANCE 1
5315#define ADVANCE_ONLY_NEXT -1
5316
5317/*
5318 * This function compares two trees and calls the provided callback for
5319 * every changed/new/deleted item it finds.
5320 * If shared tree blocks are encountered, whole subtrees are skipped, making
5321 * the compare pretty fast on snapshotted subvolumes.
5322 *
5323 * This currently works on commit roots only. As commit roots are read only,
5324 * we don't do any locking. The commit roots are protected with transactions.
5325 * Transactions are ended and rejoined when a commit is tried in between.
5326 *
5327 * This function checks for modifications done to the trees while comparing.
5328 * If it detects a change, it aborts immediately.
5329 */
5330int btrfs_compare_trees(struct btrfs_root *left_root,
5331                        struct btrfs_root *right_root,
5332                        btrfs_changed_cb_t changed_cb, void *ctx)
5333{
5334        int ret;
5335        int cmp;
5336        struct btrfs_path *left_path = NULL;
5337        struct btrfs_path *right_path = NULL;
5338        struct btrfs_key left_key;
5339        struct btrfs_key right_key;
5340        char *tmp_buf = NULL;
5341        int left_root_level;
5342        int right_root_level;
5343        int left_level;
5344        int right_level;
5345        int left_end_reached;
5346        int right_end_reached;
5347        int advance_left;
5348        int advance_right;
5349        u64 left_blockptr;
5350        u64 right_blockptr;
5351        u64 left_gen;
5352        u64 right_gen;
5353
5354        left_path = btrfs_alloc_path();
5355        if (!left_path) {
5356                ret = -ENOMEM;
5357                goto out;
5358        }
5359        right_path = btrfs_alloc_path();
5360        if (!right_path) {
5361                ret = -ENOMEM;
5362                goto out;
5363        }
5364
5365        tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL | __GFP_NOWARN);
5366        if (!tmp_buf) {
5367                tmp_buf = vmalloc(left_root->nodesize);
5368                if (!tmp_buf) {
5369                        ret = -ENOMEM;
5370                        goto out;
5371                }
5372        }
5373
5374        left_path->search_commit_root = 1;
5375        left_path->skip_locking = 1;
5376        right_path->search_commit_root = 1;
5377        right_path->skip_locking = 1;
5378
5379        /*
5380         * Strategy: Go to the first items of both trees. Then do
5381         *
5382         * If both trees are at level 0
5383         *   Compare keys of current items
5384         *     If left < right treat left item as new, advance left tree
5385         *       and repeat
5386         *     If left > right treat right item as deleted, advance right tree
5387         *       and repeat
5388         *     If left == right do deep compare of items, treat as changed if
5389         *       needed, advance both trees and repeat
5390         * If both trees are at the same level but not at level 0
5391         *   Compare keys of current nodes/leafs
5392         *     If left < right advance left tree and repeat
5393         *     If left > right advance right tree and repeat
5394         *     If left == right compare blockptrs of the next nodes/leafs
5395         *       If they match advance both trees but stay at the same level
5396         *         and repeat
5397         *       If they don't match advance both trees while allowing to go
5398         *         deeper and repeat
5399         * If tree levels are different
5400         *   Advance the tree that needs it and repeat
5401         *
5402         * Advancing a tree means:
5403         *   If we are at level 0, try to go to the next slot. If that's not
5404         *   possible, go one level up and repeat. Stop when we found a level
5405         *   where we could go to the next slot. We may at this point be on a
5406         *   node or a leaf.
5407         *
5408         *   If we are not at level 0 and not on shared tree blocks, go one
5409         *   level deeper.
5410         *
5411         *   If we are not at level 0 and on shared tree blocks, go one slot to
5412         *   the right if possible or go up and right.
5413         */
5414
5415        down_read(&left_root->fs_info->commit_root_sem);
5416        left_level = btrfs_header_level(left_root->commit_root);
5417        left_root_level = left_level;
5418        left_path->nodes[left_level] = left_root->commit_root;
5419        extent_buffer_get(left_path->nodes[left_level]);
5420
5421        right_level = btrfs_header_level(right_root->commit_root);
5422        right_root_level = right_level;
5423        right_path->nodes[right_level] = right_root->commit_root;
5424        extent_buffer_get(right_path->nodes[right_level]);
5425        up_read(&left_root->fs_info->commit_root_sem);
5426
5427        if (left_level == 0)
5428                btrfs_item_key_to_cpu(left_path->nodes[left_level],
5429                                &left_key, left_path->slots[left_level]);
5430        else
5431                btrfs_node_key_to_cpu(left_path->nodes[left_level],
5432                                &left_key, left_path->slots[left_level]);
5433        if (right_level == 0)
5434                btrfs_item_key_to_cpu(right_path->nodes[right_level],
5435                                &right_key, right_path->slots[right_level]);
5436        else
5437                btrfs_node_key_to_cpu(right_path->nodes[right_level],
5438                                &right_key, right_path->slots[right_level]);
5439
5440        left_end_reached = right_end_reached = 0;
5441        advance_left = advance_right = 0;
5442
5443        while (1) {
5444                if (advance_left && !left_end_reached) {
5445                        ret = tree_advance(left_root, left_path, &left_level,
5446                                        left_root_level,
5447                                        advance_left != ADVANCE_ONLY_NEXT,
5448                                        &left_key);
5449                        if (ret < 0)
5450                                left_end_reached = ADVANCE;
5451                        advance_left = 0;
5452                }
5453                if (advance_right && !right_end_reached) {
5454                        ret = tree_advance(right_root, right_path, &right_level,
5455                                        right_root_level,
5456                                        advance_right != ADVANCE_ONLY_NEXT,
5457                                        &right_key);
5458                        if (ret < 0)
5459                                right_end_reached = ADVANCE;
5460                        advance_right = 0;
5461                }
5462
5463                if (left_end_reached && right_end_reached) {
5464                        ret = 0;
5465                        goto out;
5466                } else if (left_end_reached) {
5467                        if (right_level == 0) {
5468                                ret = changed_cb(left_root, right_root,
5469                                                left_path, right_path,
5470                                                &right_key,
5471                                                BTRFS_COMPARE_TREE_DELETED,
5472                                                ctx);
5473                                if (ret < 0)
5474                                        goto out;
5475                        }
5476                        advance_right = ADVANCE;
5477                        continue;
5478                } else if (right_end_reached) {
5479                        if (left_level == 0) {
5480                                ret = changed_cb(left_root, right_root,
5481                                                left_path, right_path,
5482                                                &left_key,
5483                                                BTRFS_COMPARE_TREE_NEW,
5484                                                ctx);
5485                                if (ret < 0)
5486                                        goto out;
5487                        }
5488                        advance_left = ADVANCE;
5489                        continue;
5490                }
5491
5492                if (left_level == 0 && right_level == 0) {
5493                        cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5494                        if (cmp < 0) {
5495                                ret = changed_cb(left_root, right_root,
5496                                                left_path, right_path,
5497                                                &left_key,
5498                                                BTRFS_COMPARE_TREE_NEW,
5499                                                ctx);
5500                                if (ret < 0)
5501                                        goto out;
5502                                advance_left = ADVANCE;
5503                        } else if (cmp > 0) {
5504                                ret = changed_cb(left_root, right_root,
5505                                                left_path, right_path,
5506                                                &right_key,
5507                                                BTRFS_COMPARE_TREE_DELETED,
5508                                                ctx);
5509                                if (ret < 0)
5510                                        goto out;
5511                                advance_right = ADVANCE;
5512                        } else {
5513                                enum btrfs_compare_tree_result result;
5514
5515                                WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5516                                ret = tree_compare_item(left_root, left_path,
5517                                                right_path, tmp_buf);
5518                                if (ret)
5519                                        result = BTRFS_COMPARE_TREE_CHANGED;
5520                                else
5521                                        result = BTRFS_COMPARE_TREE_SAME;
5522                                ret = changed_cb(left_root, right_root,
5523                                                 left_path, right_path,
5524                                                 &left_key, result, ctx);
5525                                if (ret < 0)
5526                                        goto out;
5527                                advance_left = ADVANCE;
5528                                advance_right = ADVANCE;
5529                        }
5530                } else if (left_level == right_level) {
5531                        cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5532                        if (cmp < 0) {
5533                                advance_left = ADVANCE;
5534                        } else if (cmp > 0) {
5535                                advance_right = ADVANCE;
5536                        } else {
5537                                left_blockptr = btrfs_node_blockptr(
5538                                                left_path->nodes[left_level],
5539                                                left_path->slots[left_level]);
5540                                right_blockptr = btrfs_node_blockptr(
5541                                                right_path->nodes[right_level],
5542                                                right_path->slots[right_level]);
5543                                left_gen = btrfs_node_ptr_generation(
5544                                                left_path->nodes[left_level],
5545                                                left_path->slots[left_level]);
5546                                right_gen = btrfs_node_ptr_generation(
5547                                                right_path->nodes[right_level],
5548                                                right_path->slots[right_level]);
5549                                if (left_blockptr == right_blockptr &&
5550                                    left_gen == right_gen) {
5551                                        /*
5552                                         * As we're on a shared block, don't
5553                                         * allow to go deeper.
5554                                         */
5555                                        advance_left = ADVANCE_ONLY_NEXT;
5556                                        advance_right = ADVANCE_ONLY_NEXT;
5557                                } else {
5558                                        advance_left = ADVANCE;
5559                                        advance_right = ADVANCE;
5560                                }
5561                        }
5562                } else if (left_level < right_level) {
5563                        advance_right = ADVANCE;
5564                } else {
5565                        advance_left = ADVANCE;
5566                }
5567        }
5568
5569out:
5570        btrfs_free_path(left_path);
5571        btrfs_free_path(right_path);
5572        kvfree(tmp_buf);
5573        return ret;
5574}
5575
5576/*
5577 * this is similar to btrfs_next_leaf, but does not try to preserve
5578 * and fixup the path.  It looks for and returns the next key in the
5579 * tree based on the current path and the min_trans parameters.
5580 *
5581 * 0 is returned if another key is found, < 0 if there are any errors
5582 * and 1 is returned if there are no higher keys in the tree
5583 *
5584 * path->keep_locks should be set to 1 on the search made before
5585 * calling this function.
5586 */
5587int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5588                        struct btrfs_key *key, int level, u64 min_trans)
5589{
5590        int slot;
5591        struct extent_buffer *c;
5592
5593        WARN_ON(!path->keep_locks);
5594        while (level < BTRFS_MAX_LEVEL) {
5595                if (!path->nodes[level])
5596                        return 1;
5597
5598                slot = path->slots[level] + 1;
5599                c = path->nodes[level];
5600next:
5601                if (slot >= btrfs_header_nritems(c)) {
5602                        int ret;
5603                        int orig_lowest;
5604                        struct btrfs_key cur_key;
5605                        if (level + 1 >= BTRFS_MAX_LEVEL ||
5606                            !path->nodes[level + 1])
5607                                return 1;
5608
5609                        if (path->locks[level + 1]) {
5610                                level++;
5611                                continue;
5612                        }
5613
5614                        slot = btrfs_header_nritems(c) - 1;
5615                        if (level == 0)
5616                                btrfs_item_key_to_cpu(c, &cur_key, slot);
5617                        else
5618                                btrfs_node_key_to_cpu(c, &cur_key, slot);
5619
5620                        orig_lowest = path->lowest_level;
5621                        btrfs_release_path(path);
5622                        path->lowest_level = level;
5623                        ret = btrfs_search_slot(NULL, root, &cur_key, path,
5624                                                0, 0);
5625                        path->lowest_level = orig_lowest;
5626                        if (ret < 0)
5627                                return ret;
5628
5629                        c = path->nodes[level];
5630                        slot = path->slots[level];
5631                        if (ret == 0)
5632                                slot++;
5633                        goto next;
5634                }
5635
5636                if (level == 0)
5637                        btrfs_item_key_to_cpu(c, key, slot);
5638                else {
5639                        u64 gen = btrfs_node_ptr_generation(c, slot);
5640
5641                        if (gen < min_trans) {
5642                                slot++;
5643                                goto next;
5644                        }
5645                        btrfs_node_key_to_cpu(c, key, slot);
5646                }
5647                return 0;
5648        }
5649        return 1;
5650}
5651
5652/*
5653 * search the tree again to find a leaf with greater keys
5654 * returns 0 if it found something or 1 if there are no greater leaves.
5655 * returns < 0 on io errors.
5656 */
5657int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5658{
5659        return btrfs_next_old_leaf(root, path, 0);
5660}
5661
5662int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5663                        u64 time_seq)
5664{
5665        int slot;
5666        int level;
5667        struct extent_buffer *c;
5668        struct extent_buffer *next;
5669        struct btrfs_key key;
5670        u32 nritems;
5671        int ret;
5672        int old_spinning = path->leave_spinning;
5673        int next_rw_lock = 0;
5674
5675        nritems = btrfs_header_nritems(path->nodes[0]);
5676        if (nritems == 0)
5677                return 1;
5678
5679        btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5680again:
5681        level = 1;
5682        next = NULL;
5683        next_rw_lock = 0;
5684        btrfs_release_path(path);
5685
5686        path->keep_locks = 1;
5687        path->leave_spinning = 1;
5688
5689        if (time_seq)
5690                ret = btrfs_search_old_slot(root, &key, path, time_seq);
5691        else
5692                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5693        path->keep_locks = 0;
5694
5695        if (ret < 0)
5696                return ret;
5697
5698        nritems = btrfs_header_nritems(path->nodes[0]);
5699        /*
5700         * by releasing the path above we dropped all our locks.  A balance
5701         * could have added more items next to the key that used to be
5702         * at the very end of the block.  So, check again here and
5703         * advance the path if there are now more items available.
5704         */
5705        if (nritems > 0 && path->slots[0] < nritems - 1) {
5706                if (ret == 0)
5707                        path->slots[0]++;
5708                ret = 0;
5709                goto done;
5710        }
5711        /*
5712         * So the above check misses one case:
5713         * - after releasing the path above, someone has removed the item that
5714         *   used to be at the very end of the block, and balance between leafs
5715         *   gets another one with bigger key.offset to replace it.
5716         *
5717         * This one should be returned as well, or we can get leaf corruption
5718         * later(esp. in __btrfs_drop_extents()).
5719         *
5720         * And a bit more explanation about this check,
5721         * with ret > 0, the key isn't found, the path points to the slot
5722         * where it should be inserted, so the path->slots[0] item must be the
5723         * bigger one.
5724         */
5725        if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
5726                ret = 0;
5727                goto done;
5728        }
5729
5730        while (level < BTRFS_MAX_LEVEL) {
5731                if (!path->nodes[level]) {
5732                        ret = 1;
5733                        goto done;
5734                }
5735
5736                slot = path->slots[level] + 1;
5737                c = path->nodes[level];
5738                if (slot >= btrfs_header_nritems(c)) {
5739                        level++;
5740                        if (level == BTRFS_MAX_LEVEL) {
5741                                ret = 1;
5742                                goto done;
5743                        }
5744                        continue;
5745                }
5746
5747                if (next) {
5748                        btrfs_tree_unlock_rw(next, next_rw_lock);
5749                        free_extent_buffer(next);
5750                }
5751
5752                next = c;
5753                next_rw_lock = path->locks[level];
5754                ret = read_block_for_search(NULL, root, path, &next, level,
5755                                            slot, &key, 0);
5756                if (ret == -EAGAIN)
5757                        goto again;
5758
5759                if (ret < 0) {
5760                        btrfs_release_path(path);
5761                        goto done;
5762                }
5763
5764                if (!path->skip_locking) {
5765                        ret = btrfs_try_tree_read_lock(next);
5766                        if (!ret && time_seq) {
5767                                /*
5768                                 * If we don't get the lock, we may be racing
5769                                 * with push_leaf_left, holding that lock while
5770                                 * itself waiting for the leaf we've currently
5771                                 * locked. To solve this situation, we give up
5772                                 * on our lock and cycle.
5773                                 */
5774                                free_extent_buffer(next);
5775                                btrfs_release_path(path);
5776                                cond_resched();
5777                                goto again;
5778                        }
5779                        if (!ret) {
5780                                btrfs_set_path_blocking(path);
5781                                btrfs_tree_read_lock(next);
5782                                btrfs_clear_path_blocking(path, next,
5783                                                          BTRFS_READ_LOCK);
5784                        }
5785                        next_rw_lock = BTRFS_READ_LOCK;
5786                }
5787                break;
5788        }
5789        path->slots[level] = slot;
5790        while (1) {
5791                level--;
5792                c = path->nodes[level];
5793                if (path->locks[level])
5794                        btrfs_tree_unlock_rw(c, path->locks[level]);
5795
5796                free_extent_buffer(c);
5797                path->nodes[level] = next;
5798                path->slots[level] = 0;
5799                if (!path->skip_locking)
5800                        path->locks[level] = next_rw_lock;
5801                if (!level)
5802                        break;
5803
5804                ret = read_block_for_search(NULL, root, path, &next, level,
5805                                            0, &key, 0);
5806                if (ret == -EAGAIN)
5807                        goto again;
5808
5809                if (ret < 0) {
5810                        btrfs_release_path(path);
5811                        goto done;
5812                }
5813
5814                if (!path->skip_locking) {
5815                        ret = btrfs_try_tree_read_lock(next);
5816                        if (!ret) {
5817                                btrfs_set_path_blocking(path);
5818                                btrfs_tree_read_lock(next);
5819                                btrfs_clear_path_blocking(path, next,
5820                                                          BTRFS_READ_LOCK);
5821                        }
5822                        next_rw_lock = BTRFS_READ_LOCK;
5823                }
5824        }
5825        ret = 0;
5826done:
5827        unlock_up(path, 0, 1, 0, NULL);
5828        path->leave_spinning = old_spinning;
5829        if (!old_spinning)
5830                btrfs_set_path_blocking(path);
5831
5832        return ret;
5833}
5834
5835/*
5836 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5837 * searching until it gets past min_objectid or finds an item of 'type'
5838 *
5839 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5840 */
5841int btrfs_previous_item(struct btrfs_root *root,
5842                        struct btrfs_path *path, u64 min_objectid,
5843                        int type)
5844{
5845        struct btrfs_key found_key;
5846        struct extent_buffer *leaf;
5847        u32 nritems;
5848        int ret;
5849
5850        while (1) {
5851                if (path->slots[0] == 0) {
5852                        btrfs_set_path_blocking(path);
5853                        ret = btrfs_prev_leaf(root, path);
5854                        if (ret != 0)
5855                                return ret;
5856                } else {
5857                        path->slots[0]--;
5858                }
5859                leaf = path->nodes[0];
5860                nritems = btrfs_header_nritems(leaf);
5861                if (nritems == 0)
5862                        return 1;
5863                if (path->slots[0] == nritems)
5864                        path->slots[0]--;
5865
5866                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5867                if (found_key.objectid < min_objectid)
5868                        break;
5869                if (found_key.type == type)
5870                        return 0;
5871                if (found_key.objectid == min_objectid &&
5872                    found_key.type < type)
5873                        break;
5874        }
5875        return 1;
5876}
5877
5878/*
5879 * search in extent tree to find a previous Metadata/Data extent item with
5880 * min objecitd.
5881 *
5882 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5883 */
5884int btrfs_previous_extent_item(struct btrfs_root *root,
5885                        struct btrfs_path *path, u64 min_objectid)
5886{
5887        struct btrfs_key found_key;
5888        struct extent_buffer *leaf;
5889        u32 nritems;
5890        int ret;
5891
5892        while (1) {
5893                if (path->slots[0] == 0) {
5894                        btrfs_set_path_blocking(path);
5895                        ret = btrfs_prev_leaf(root, path);
5896                        if (ret != 0)
5897                                return ret;
5898                } else {
5899                        path->slots[0]--;
5900                }
5901                leaf = path->nodes[0];
5902                nritems = btrfs_header_nritems(leaf);
5903                if (nritems == 0)
5904                        return 1;
5905                if (path->slots[0] == nritems)
5906                        path->slots[0]--;
5907
5908                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5909                if (found_key.objectid < min_objectid)
5910                        break;
5911                if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5912                    found_key.type == BTRFS_METADATA_ITEM_KEY)
5913                        return 0;
5914                if (found_key.objectid == min_objectid &&
5915                    found_key.type < BTRFS_EXTENT_ITEM_KEY)
5916                        break;
5917        }
5918        return 1;
5919}
5920