linux/fs/btrfs/ctree.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 021110-1307, USA.
  17 */
  18
  19#include <linux/sched.h>
  20#include <linux/slab.h>
  21#include <linux/rbtree.h>
  22#include <linux/vmalloc.h>
  23#include "ctree.h"
  24#include "disk-io.h"
  25#include "transaction.h"
  26#include "print-tree.h"
  27#include "locking.h"
  28
  29static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
  30                      *root, struct btrfs_path *path, int level);
  31static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
  32                      *root, struct btrfs_key *ins_key,
  33                      struct btrfs_path *path, int data_size, int extend);
  34static int push_node_left(struct btrfs_trans_handle *trans,
  35                          struct btrfs_root *root, struct extent_buffer *dst,
  36                          struct extent_buffer *src, int empty);
  37static int balance_node_right(struct btrfs_trans_handle *trans,
  38                              struct btrfs_root *root,
  39                              struct extent_buffer *dst_buf,
  40                              struct extent_buffer *src_buf);
  41static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
  42                    int level, int slot);
  43static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
  44                                 struct extent_buffer *eb);
  45
  46struct btrfs_path *btrfs_alloc_path(void)
  47{
  48        return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
  49}
  50
  51/*
  52 * set all locked nodes in the path to blocking locks.  This should
  53 * be done before scheduling
  54 */
  55noinline void btrfs_set_path_blocking(struct btrfs_path *p)
  56{
  57        int i;
  58        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
  59                if (!p->nodes[i] || !p->locks[i])
  60                        continue;
  61                btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
  62                if (p->locks[i] == BTRFS_READ_LOCK)
  63                        p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
  64                else if (p->locks[i] == BTRFS_WRITE_LOCK)
  65                        p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
  66        }
  67}
  68
  69/*
  70 * reset all the locked nodes in the patch to spinning locks.
  71 *
  72 * held is used to keep lockdep happy, when lockdep is enabled
  73 * we set held to a blocking lock before we go around and
  74 * retake all the spinlocks in the path.  You can safely use NULL
  75 * for held
  76 */
  77noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
  78                                        struct extent_buffer *held, int held_rw)
  79{
  80        int i;
  81
  82        if (held) {
  83                btrfs_set_lock_blocking_rw(held, held_rw);
  84                if (held_rw == BTRFS_WRITE_LOCK)
  85                        held_rw = BTRFS_WRITE_LOCK_BLOCKING;
  86                else if (held_rw == BTRFS_READ_LOCK)
  87                        held_rw = BTRFS_READ_LOCK_BLOCKING;
  88        }
  89        btrfs_set_path_blocking(p);
  90
  91        for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
  92                if (p->nodes[i] && p->locks[i]) {
  93                        btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
  94                        if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
  95                                p->locks[i] = BTRFS_WRITE_LOCK;
  96                        else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
  97                                p->locks[i] = BTRFS_READ_LOCK;
  98                }
  99        }
 100
 101        if (held)
 102                btrfs_clear_lock_blocking_rw(held, held_rw);
 103}
 104
 105/* this also releases the path */
 106void btrfs_free_path(struct btrfs_path *p)
 107{
 108        if (!p)
 109                return;
 110        btrfs_release_path(p);
 111        kmem_cache_free(btrfs_path_cachep, p);
 112}
 113
 114/*
 115 * path release drops references on the extent buffers in the path
 116 * and it drops any locks held by this path
 117 *
 118 * It is safe to call this on paths that no locks or extent buffers held.
 119 */
 120noinline void btrfs_release_path(struct btrfs_path *p)
 121{
 122        int i;
 123
 124        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
 125                p->slots[i] = 0;
 126                if (!p->nodes[i])
 127                        continue;
 128                if (p->locks[i]) {
 129                        btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
 130                        p->locks[i] = 0;
 131                }
 132                free_extent_buffer(p->nodes[i]);
 133                p->nodes[i] = NULL;
 134        }
 135}
 136
 137/*
 138 * safely gets a reference on the root node of a tree.  A lock
 139 * is not taken, so a concurrent writer may put a different node
 140 * at the root of the tree.  See btrfs_lock_root_node for the
 141 * looping required.
 142 *
 143 * The extent buffer returned by this has a reference taken, so
 144 * it won't disappear.  It may stop being the root of the tree
 145 * at any time because there are no locks held.
 146 */
 147struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
 148{
 149        struct extent_buffer *eb;
 150
 151        while (1) {
 152                rcu_read_lock();
 153                eb = rcu_dereference(root->node);
 154
 155                /*
 156                 * RCU really hurts here, we could free up the root node because
 157                 * it was COWed but we may not get the new root node yet so do
 158                 * the inc_not_zero dance and if it doesn't work then
 159                 * synchronize_rcu and try again.
 160                 */
 161                if (atomic_inc_not_zero(&eb->refs)) {
 162                        rcu_read_unlock();
 163                        break;
 164                }
 165                rcu_read_unlock();
 166                synchronize_rcu();
 167        }
 168        return eb;
 169}
 170
 171/* loop around taking references on and locking the root node of the
 172 * tree until you end up with a lock on the root.  A locked buffer
 173 * is returned, with a reference held.
 174 */
 175struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 176{
 177        struct extent_buffer *eb;
 178
 179        while (1) {
 180                eb = btrfs_root_node(root);
 181                btrfs_tree_lock(eb);
 182                if (eb == root->node)
 183                        break;
 184                btrfs_tree_unlock(eb);
 185                free_extent_buffer(eb);
 186        }
 187        return eb;
 188}
 189
 190/* loop around taking references on and locking the root node of the
 191 * tree until you end up with a lock on the root.  A locked buffer
 192 * is returned, with a reference held.
 193 */
 194static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
 195{
 196        struct extent_buffer *eb;
 197
 198        while (1) {
 199                eb = btrfs_root_node(root);
 200                btrfs_tree_read_lock(eb);
 201                if (eb == root->node)
 202                        break;
 203                btrfs_tree_read_unlock(eb);
 204                free_extent_buffer(eb);
 205        }
 206        return eb;
 207}
 208
 209/* cowonly root (everything not a reference counted cow subvolume), just get
 210 * put onto a simple dirty list.  transaction.c walks this to make sure they
 211 * get properly updated on disk.
 212 */
 213static void add_root_to_dirty_list(struct btrfs_root *root)
 214{
 215        if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
 216            !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
 217                return;
 218
 219        spin_lock(&root->fs_info->trans_lock);
 220        if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
 221                /* Want the extent tree to be the last on the list */
 222                if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
 223                        list_move_tail(&root->dirty_list,
 224                                       &root->fs_info->dirty_cowonly_roots);
 225                else
 226                        list_move(&root->dirty_list,
 227                                  &root->fs_info->dirty_cowonly_roots);
 228        }
 229        spin_unlock(&root->fs_info->trans_lock);
 230}
 231
 232/*
 233 * used by snapshot creation to make a copy of a root for a tree with
 234 * a given objectid.  The buffer with the new root node is returned in
 235 * cow_ret, and this func returns zero on success or a negative error code.
 236 */
 237int btrfs_copy_root(struct btrfs_trans_handle *trans,
 238                      struct btrfs_root *root,
 239                      struct extent_buffer *buf,
 240                      struct extent_buffer **cow_ret, u64 new_root_objectid)
 241{
 242        struct extent_buffer *cow;
 243        int ret = 0;
 244        int level;
 245        struct btrfs_disk_key disk_key;
 246
 247        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 248                trans->transid != root->fs_info->running_transaction->transid);
 249        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 250                trans->transid != root->last_trans);
 251
 252        level = btrfs_header_level(buf);
 253        if (level == 0)
 254                btrfs_item_key(buf, &disk_key, 0);
 255        else
 256                btrfs_node_key(buf, &disk_key, 0);
 257
 258        cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
 259                        &disk_key, level, buf->start, 0);
 260        if (IS_ERR(cow))
 261                return PTR_ERR(cow);
 262
 263        copy_extent_buffer(cow, buf, 0, 0, cow->len);
 264        btrfs_set_header_bytenr(cow, cow->start);
 265        btrfs_set_header_generation(cow, trans->transid);
 266        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
 267        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
 268                                     BTRFS_HEADER_FLAG_RELOC);
 269        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
 270                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
 271        else
 272                btrfs_set_header_owner(cow, new_root_objectid);
 273
 274        write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
 275                            BTRFS_FSID_SIZE);
 276
 277        WARN_ON(btrfs_header_generation(buf) > trans->transid);
 278        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
 279                ret = btrfs_inc_ref(trans, root, cow, 1);
 280        else
 281                ret = btrfs_inc_ref(trans, root, cow, 0);
 282
 283        if (ret)
 284                return ret;
 285
 286        btrfs_mark_buffer_dirty(cow);
 287        *cow_ret = cow;
 288        return 0;
 289}
 290
 291enum mod_log_op {
 292        MOD_LOG_KEY_REPLACE,
 293        MOD_LOG_KEY_ADD,
 294        MOD_LOG_KEY_REMOVE,
 295        MOD_LOG_KEY_REMOVE_WHILE_FREEING,
 296        MOD_LOG_KEY_REMOVE_WHILE_MOVING,
 297        MOD_LOG_MOVE_KEYS,
 298        MOD_LOG_ROOT_REPLACE,
 299};
 300
 301struct tree_mod_move {
 302        int dst_slot;
 303        int nr_items;
 304};
 305
 306struct tree_mod_root {
 307        u64 logical;
 308        u8 level;
 309};
 310
 311struct tree_mod_elem {
 312        struct rb_node node;
 313        u64 logical;
 314        u64 seq;
 315        enum mod_log_op op;
 316
 317        /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
 318        int slot;
 319
 320        /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
 321        u64 generation;
 322
 323        /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
 324        struct btrfs_disk_key key;
 325        u64 blockptr;
 326
 327        /* this is used for op == MOD_LOG_MOVE_KEYS */
 328        struct tree_mod_move move;
 329
 330        /* this is used for op == MOD_LOG_ROOT_REPLACE */
 331        struct tree_mod_root old_root;
 332};
 333
 334static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
 335{
 336        read_lock(&fs_info->tree_mod_log_lock);
 337}
 338
 339static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
 340{
 341        read_unlock(&fs_info->tree_mod_log_lock);
 342}
 343
 344static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
 345{
 346        write_lock(&fs_info->tree_mod_log_lock);
 347}
 348
 349static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
 350{
 351        write_unlock(&fs_info->tree_mod_log_lock);
 352}
 353
 354/*
 355 * Pull a new tree mod seq number for our operation.
 356 */
 357static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
 358{
 359        return atomic64_inc_return(&fs_info->tree_mod_seq);
 360}
 361
 362/*
 363 * This adds a new blocker to the tree mod log's blocker list if the @elem
 364 * passed does not already have a sequence number set. So when a caller expects
 365 * to record tree modifications, it should ensure to set elem->seq to zero
 366 * before calling btrfs_get_tree_mod_seq.
 367 * Returns a fresh, unused tree log modification sequence number, even if no new
 368 * blocker was added.
 369 */
 370u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
 371                           struct seq_list *elem)
 372{
 373        tree_mod_log_write_lock(fs_info);
 374        spin_lock(&fs_info->tree_mod_seq_lock);
 375        if (!elem->seq) {
 376                elem->seq = btrfs_inc_tree_mod_seq(fs_info);
 377                list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
 378        }
 379        spin_unlock(&fs_info->tree_mod_seq_lock);
 380        tree_mod_log_write_unlock(fs_info);
 381
 382        return elem->seq;
 383}
 384
 385void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
 386                            struct seq_list *elem)
 387{
 388        struct rb_root *tm_root;
 389        struct rb_node *node;
 390        struct rb_node *next;
 391        struct seq_list *cur_elem;
 392        struct tree_mod_elem *tm;
 393        u64 min_seq = (u64)-1;
 394        u64 seq_putting = elem->seq;
 395
 396        if (!seq_putting)
 397                return;
 398
 399        spin_lock(&fs_info->tree_mod_seq_lock);
 400        list_del(&elem->list);
 401        elem->seq = 0;
 402
 403        list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
 404                if (cur_elem->seq < min_seq) {
 405                        if (seq_putting > cur_elem->seq) {
 406                                /*
 407                                 * blocker with lower sequence number exists, we
 408                                 * cannot remove anything from the log
 409                                 */
 410                                spin_unlock(&fs_info->tree_mod_seq_lock);
 411                                return;
 412                        }
 413                        min_seq = cur_elem->seq;
 414                }
 415        }
 416        spin_unlock(&fs_info->tree_mod_seq_lock);
 417
 418        /*
 419         * anything that's lower than the lowest existing (read: blocked)
 420         * sequence number can be removed from the tree.
 421         */
 422        tree_mod_log_write_lock(fs_info);
 423        tm_root = &fs_info->tree_mod_log;
 424        for (node = rb_first(tm_root); node; node = next) {
 425                next = rb_next(node);
 426                tm = container_of(node, struct tree_mod_elem, node);
 427                if (tm->seq > min_seq)
 428                        continue;
 429                rb_erase(node, tm_root);
 430                kfree(tm);
 431        }
 432        tree_mod_log_write_unlock(fs_info);
 433}
 434
 435/*
 436 * key order of the log:
 437 *       node/leaf start address -> sequence
 438 *
 439 * The 'start address' is the logical address of the *new* root node
 440 * for root replace operations, or the logical address of the affected
 441 * block for all other operations.
 442 *
 443 * Note: must be called with write lock (tree_mod_log_write_lock).
 444 */
 445static noinline int
 446__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
 447{
 448        struct rb_root *tm_root;
 449        struct rb_node **new;
 450        struct rb_node *parent = NULL;
 451        struct tree_mod_elem *cur;
 452
 453        BUG_ON(!tm);
 454
 455        tm->seq = btrfs_inc_tree_mod_seq(fs_info);
 456
 457        tm_root = &fs_info->tree_mod_log;
 458        new = &tm_root->rb_node;
 459        while (*new) {
 460                cur = container_of(*new, struct tree_mod_elem, node);
 461                parent = *new;
 462                if (cur->logical < tm->logical)
 463                        new = &((*new)->rb_left);
 464                else if (cur->logical > tm->logical)
 465                        new = &((*new)->rb_right);
 466                else if (cur->seq < tm->seq)
 467                        new = &((*new)->rb_left);
 468                else if (cur->seq > tm->seq)
 469                        new = &((*new)->rb_right);
 470                else
 471                        return -EEXIST;
 472        }
 473
 474        rb_link_node(&tm->node, parent, new);
 475        rb_insert_color(&tm->node, tm_root);
 476        return 0;
 477}
 478
 479/*
 480 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 481 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 482 * this until all tree mod log insertions are recorded in the rb tree and then
 483 * call tree_mod_log_write_unlock() to release.
 484 */
 485static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
 486                                    struct extent_buffer *eb) {
 487        smp_mb();
 488        if (list_empty(&(fs_info)->tree_mod_seq_list))
 489                return 1;
 490        if (eb && btrfs_header_level(eb) == 0)
 491                return 1;
 492
 493        tree_mod_log_write_lock(fs_info);
 494        if (list_empty(&(fs_info)->tree_mod_seq_list)) {
 495                tree_mod_log_write_unlock(fs_info);
 496                return 1;
 497        }
 498
 499        return 0;
 500}
 501
 502/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
 503static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
 504                                    struct extent_buffer *eb)
 505{
 506        smp_mb();
 507        if (list_empty(&(fs_info)->tree_mod_seq_list))
 508                return 0;
 509        if (eb && btrfs_header_level(eb) == 0)
 510                return 0;
 511
 512        return 1;
 513}
 514
 515static struct tree_mod_elem *
 516alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
 517                    enum mod_log_op op, gfp_t flags)
 518{
 519        struct tree_mod_elem *tm;
 520
 521        tm = kzalloc(sizeof(*tm), flags);
 522        if (!tm)
 523                return NULL;
 524
 525        tm->logical = eb->start;
 526        if (op != MOD_LOG_KEY_ADD) {
 527                btrfs_node_key(eb, &tm->key, slot);
 528                tm->blockptr = btrfs_node_blockptr(eb, slot);
 529        }
 530        tm->op = op;
 531        tm->slot = slot;
 532        tm->generation = btrfs_node_ptr_generation(eb, slot);
 533        RB_CLEAR_NODE(&tm->node);
 534
 535        return tm;
 536}
 537
 538static noinline int
 539tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
 540                        struct extent_buffer *eb, int slot,
 541                        enum mod_log_op op, gfp_t flags)
 542{
 543        struct tree_mod_elem *tm;
 544        int ret;
 545
 546        if (!tree_mod_need_log(fs_info, eb))
 547                return 0;
 548
 549        tm = alloc_tree_mod_elem(eb, slot, op, flags);
 550        if (!tm)
 551                return -ENOMEM;
 552
 553        if (tree_mod_dont_log(fs_info, eb)) {
 554                kfree(tm);
 555                return 0;
 556        }
 557
 558        ret = __tree_mod_log_insert(fs_info, tm);
 559        tree_mod_log_write_unlock(fs_info);
 560        if (ret)
 561                kfree(tm);
 562
 563        return ret;
 564}
 565
 566static noinline int
 567tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
 568                         struct extent_buffer *eb, int dst_slot, int src_slot,
 569                         int nr_items, gfp_t flags)
 570{
 571        struct tree_mod_elem *tm = NULL;
 572        struct tree_mod_elem **tm_list = NULL;
 573        int ret = 0;
 574        int i;
 575        int locked = 0;
 576
 577        if (!tree_mod_need_log(fs_info, eb))
 578                return 0;
 579
 580        tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
 581        if (!tm_list)
 582                return -ENOMEM;
 583
 584        tm = kzalloc(sizeof(*tm), flags);
 585        if (!tm) {
 586                ret = -ENOMEM;
 587                goto free_tms;
 588        }
 589
 590        tm->logical = eb->start;
 591        tm->slot = src_slot;
 592        tm->move.dst_slot = dst_slot;
 593        tm->move.nr_items = nr_items;
 594        tm->op = MOD_LOG_MOVE_KEYS;
 595
 596        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
 597                tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
 598                    MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
 599                if (!tm_list[i]) {
 600                        ret = -ENOMEM;
 601                        goto free_tms;
 602                }
 603        }
 604
 605        if (tree_mod_dont_log(fs_info, eb))
 606                goto free_tms;
 607        locked = 1;
 608
 609        /*
 610         * When we override something during the move, we log these removals.
 611         * This can only happen when we move towards the beginning of the
 612         * buffer, i.e. dst_slot < src_slot.
 613         */
 614        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
 615                ret = __tree_mod_log_insert(fs_info, tm_list[i]);
 616                if (ret)
 617                        goto free_tms;
 618        }
 619
 620        ret = __tree_mod_log_insert(fs_info, tm);
 621        if (ret)
 622                goto free_tms;
 623        tree_mod_log_write_unlock(fs_info);
 624        kfree(tm_list);
 625
 626        return 0;
 627free_tms:
 628        for (i = 0; i < nr_items; i++) {
 629                if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
 630                        rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
 631                kfree(tm_list[i]);
 632        }
 633        if (locked)
 634                tree_mod_log_write_unlock(fs_info);
 635        kfree(tm_list);
 636        kfree(tm);
 637
 638        return ret;
 639}
 640
 641static inline int
 642__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
 643                       struct tree_mod_elem **tm_list,
 644                       int nritems)
 645{
 646        int i, j;
 647        int ret;
 648
 649        for (i = nritems - 1; i >= 0; i--) {
 650                ret = __tree_mod_log_insert(fs_info, tm_list[i]);
 651                if (ret) {
 652                        for (j = nritems - 1; j > i; j--)
 653                                rb_erase(&tm_list[j]->node,
 654                                         &fs_info->tree_mod_log);
 655                        return ret;
 656                }
 657        }
 658
 659        return 0;
 660}
 661
 662static noinline int
 663tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 664                         struct extent_buffer *old_root,
 665                         struct extent_buffer *new_root, gfp_t flags,
 666                         int log_removal)
 667{
 668        struct tree_mod_elem *tm = NULL;
 669        struct tree_mod_elem **tm_list = NULL;
 670        int nritems = 0;
 671        int ret = 0;
 672        int i;
 673
 674        if (!tree_mod_need_log(fs_info, NULL))
 675                return 0;
 676
 677        if (log_removal && btrfs_header_level(old_root) > 0) {
 678                nritems = btrfs_header_nritems(old_root);
 679                tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
 680                                  flags);
 681                if (!tm_list) {
 682                        ret = -ENOMEM;
 683                        goto free_tms;
 684                }
 685                for (i = 0; i < nritems; i++) {
 686                        tm_list[i] = alloc_tree_mod_elem(old_root, i,
 687                            MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
 688                        if (!tm_list[i]) {
 689                                ret = -ENOMEM;
 690                                goto free_tms;
 691                        }
 692                }
 693        }
 694
 695        tm = kzalloc(sizeof(*tm), flags);
 696        if (!tm) {
 697                ret = -ENOMEM;
 698                goto free_tms;
 699        }
 700
 701        tm->logical = new_root->start;
 702        tm->old_root.logical = old_root->start;
 703        tm->old_root.level = btrfs_header_level(old_root);
 704        tm->generation = btrfs_header_generation(old_root);
 705        tm->op = MOD_LOG_ROOT_REPLACE;
 706
 707        if (tree_mod_dont_log(fs_info, NULL))
 708                goto free_tms;
 709
 710        if (tm_list)
 711                ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
 712        if (!ret)
 713                ret = __tree_mod_log_insert(fs_info, tm);
 714
 715        tree_mod_log_write_unlock(fs_info);
 716        if (ret)
 717                goto free_tms;
 718        kfree(tm_list);
 719
 720        return ret;
 721
 722free_tms:
 723        if (tm_list) {
 724                for (i = 0; i < nritems; i++)
 725                        kfree(tm_list[i]);
 726                kfree(tm_list);
 727        }
 728        kfree(tm);
 729
 730        return ret;
 731}
 732
 733static struct tree_mod_elem *
 734__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
 735                      int smallest)
 736{
 737        struct rb_root *tm_root;
 738        struct rb_node *node;
 739        struct tree_mod_elem *cur = NULL;
 740        struct tree_mod_elem *found = NULL;
 741
 742        tree_mod_log_read_lock(fs_info);
 743        tm_root = &fs_info->tree_mod_log;
 744        node = tm_root->rb_node;
 745        while (node) {
 746                cur = container_of(node, struct tree_mod_elem, node);
 747                if (cur->logical < start) {
 748                        node = node->rb_left;
 749                } else if (cur->logical > start) {
 750                        node = node->rb_right;
 751                } else if (cur->seq < min_seq) {
 752                        node = node->rb_left;
 753                } else if (!smallest) {
 754                        /* we want the node with the highest seq */
 755                        if (found)
 756                                BUG_ON(found->seq > cur->seq);
 757                        found = cur;
 758                        node = node->rb_left;
 759                } else if (cur->seq > min_seq) {
 760                        /* we want the node with the smallest seq */
 761                        if (found)
 762                                BUG_ON(found->seq < cur->seq);
 763                        found = cur;
 764                        node = node->rb_right;
 765                } else {
 766                        found = cur;
 767                        break;
 768                }
 769        }
 770        tree_mod_log_read_unlock(fs_info);
 771
 772        return found;
 773}
 774
 775/*
 776 * this returns the element from the log with the smallest time sequence
 777 * value that's in the log (the oldest log item). any element with a time
 778 * sequence lower than min_seq will be ignored.
 779 */
 780static struct tree_mod_elem *
 781tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
 782                           u64 min_seq)
 783{
 784        return __tree_mod_log_search(fs_info, start, min_seq, 1);
 785}
 786
 787/*
 788 * this returns the element from the log with the largest time sequence
 789 * value that's in the log (the most recent log item). any element with
 790 * a time sequence lower than min_seq will be ignored.
 791 */
 792static struct tree_mod_elem *
 793tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
 794{
 795        return __tree_mod_log_search(fs_info, start, min_seq, 0);
 796}
 797
 798static noinline int
 799tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 800                     struct extent_buffer *src, unsigned long dst_offset,
 801                     unsigned long src_offset, int nr_items)
 802{
 803        int ret = 0;
 804        struct tree_mod_elem **tm_list = NULL;
 805        struct tree_mod_elem **tm_list_add, **tm_list_rem;
 806        int i;
 807        int locked = 0;
 808
 809        if (!tree_mod_need_log(fs_info, NULL))
 810                return 0;
 811
 812        if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
 813                return 0;
 814
 815        tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
 816                          GFP_NOFS);
 817        if (!tm_list)
 818                return -ENOMEM;
 819
 820        tm_list_add = tm_list;
 821        tm_list_rem = tm_list + nr_items;
 822        for (i = 0; i < nr_items; i++) {
 823                tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
 824                    MOD_LOG_KEY_REMOVE, GFP_NOFS);
 825                if (!tm_list_rem[i]) {
 826                        ret = -ENOMEM;
 827                        goto free_tms;
 828                }
 829
 830                tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
 831                    MOD_LOG_KEY_ADD, GFP_NOFS);
 832                if (!tm_list_add[i]) {
 833                        ret = -ENOMEM;
 834                        goto free_tms;
 835                }
 836        }
 837
 838        if (tree_mod_dont_log(fs_info, NULL))
 839                goto free_tms;
 840        locked = 1;
 841
 842        for (i = 0; i < nr_items; i++) {
 843                ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
 844                if (ret)
 845                        goto free_tms;
 846                ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
 847                if (ret)
 848                        goto free_tms;
 849        }
 850
 851        tree_mod_log_write_unlock(fs_info);
 852        kfree(tm_list);
 853
 854        return 0;
 855
 856free_tms:
 857        for (i = 0; i < nr_items * 2; i++) {
 858                if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
 859                        rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
 860                kfree(tm_list[i]);
 861        }
 862        if (locked)
 863                tree_mod_log_write_unlock(fs_info);
 864        kfree(tm_list);
 865
 866        return ret;
 867}
 868
 869static inline void
 870tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 871                     int dst_offset, int src_offset, int nr_items)
 872{
 873        int ret;
 874        ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
 875                                       nr_items, GFP_NOFS);
 876        BUG_ON(ret < 0);
 877}
 878
 879static noinline void
 880tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
 881                          struct extent_buffer *eb, int slot, int atomic)
 882{
 883        int ret;
 884
 885        ret = tree_mod_log_insert_key(fs_info, eb, slot,
 886                                        MOD_LOG_KEY_REPLACE,
 887                                        atomic ? GFP_ATOMIC : GFP_NOFS);
 888        BUG_ON(ret < 0);
 889}
 890
 891static noinline int
 892tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
 893{
 894        struct tree_mod_elem **tm_list = NULL;
 895        int nritems = 0;
 896        int i;
 897        int ret = 0;
 898
 899        if (btrfs_header_level(eb) == 0)
 900                return 0;
 901
 902        if (!tree_mod_need_log(fs_info, NULL))
 903                return 0;
 904
 905        nritems = btrfs_header_nritems(eb);
 906        tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
 907        if (!tm_list)
 908                return -ENOMEM;
 909
 910        for (i = 0; i < nritems; i++) {
 911                tm_list[i] = alloc_tree_mod_elem(eb, i,
 912                    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
 913                if (!tm_list[i]) {
 914                        ret = -ENOMEM;
 915                        goto free_tms;
 916                }
 917        }
 918
 919        if (tree_mod_dont_log(fs_info, eb))
 920                goto free_tms;
 921
 922        ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
 923        tree_mod_log_write_unlock(fs_info);
 924        if (ret)
 925                goto free_tms;
 926        kfree(tm_list);
 927
 928        return 0;
 929
 930free_tms:
 931        for (i = 0; i < nritems; i++)
 932                kfree(tm_list[i]);
 933        kfree(tm_list);
 934
 935        return ret;
 936}
 937
 938static noinline void
 939tree_mod_log_set_root_pointer(struct btrfs_root *root,
 940                              struct extent_buffer *new_root_node,
 941                              int log_removal)
 942{
 943        int ret;
 944        ret = tree_mod_log_insert_root(root->fs_info, root->node,
 945                                       new_root_node, GFP_NOFS, log_removal);
 946        BUG_ON(ret < 0);
 947}
 948
 949/*
 950 * check if the tree block can be shared by multiple trees
 951 */
 952int btrfs_block_can_be_shared(struct btrfs_root *root,
 953                              struct extent_buffer *buf)
 954{
 955        /*
 956         * Tree blocks not in reference counted trees and tree roots
 957         * are never shared. If a block was allocated after the last
 958         * snapshot and the block was not allocated by tree relocation,
 959         * we know the block is not shared.
 960         */
 961        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 962            buf != root->node && buf != root->commit_root &&
 963            (btrfs_header_generation(buf) <=
 964             btrfs_root_last_snapshot(&root->root_item) ||
 965             btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
 966                return 1;
 967#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 968        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 969            btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
 970                return 1;
 971#endif
 972        return 0;
 973}
 974
 975static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 976                                       struct btrfs_root *root,
 977                                       struct extent_buffer *buf,
 978                                       struct extent_buffer *cow,
 979                                       int *last_ref)
 980{
 981        u64 refs;
 982        u64 owner;
 983        u64 flags;
 984        u64 new_flags = 0;
 985        int ret;
 986
 987        /*
 988         * Backrefs update rules:
 989         *
 990         * Always use full backrefs for extent pointers in tree block
 991         * allocated by tree relocation.
 992         *
 993         * If a shared tree block is no longer referenced by its owner
 994         * tree (btrfs_header_owner(buf) == root->root_key.objectid),
 995         * use full backrefs for extent pointers in tree block.
 996         *
 997         * If a tree block is been relocating
 998         * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
 999         * use full backrefs for extent pointers in tree block.
1000         * The reason for this is some operations (such as drop tree)
1001         * are only allowed for blocks use full backrefs.
1002         */
1003
1004        if (btrfs_block_can_be_shared(root, buf)) {
1005                ret = btrfs_lookup_extent_info(trans, root, buf->start,
1006                                               btrfs_header_level(buf), 1,
1007                                               &refs, &flags);
1008                if (ret)
1009                        return ret;
1010                if (refs == 0) {
1011                        ret = -EROFS;
1012                        btrfs_handle_fs_error(root->fs_info, ret, NULL);
1013                        return ret;
1014                }
1015        } else {
1016                refs = 1;
1017                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1018                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1019                        flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
1020                else
1021                        flags = 0;
1022        }
1023
1024        owner = btrfs_header_owner(buf);
1025        BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
1026               !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
1027
1028        if (refs > 1) {
1029                if ((owner == root->root_key.objectid ||
1030                     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
1031                    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
1032                        ret = btrfs_inc_ref(trans, root, buf, 1);
1033                        BUG_ON(ret); /* -ENOMEM */
1034
1035                        if (root->root_key.objectid ==
1036                            BTRFS_TREE_RELOC_OBJECTID) {
1037                                ret = btrfs_dec_ref(trans, root, buf, 0);
1038                                BUG_ON(ret); /* -ENOMEM */
1039                                ret = btrfs_inc_ref(trans, root, cow, 1);
1040                                BUG_ON(ret); /* -ENOMEM */
1041                        }
1042                        new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
1043                } else {
1044
1045                        if (root->root_key.objectid ==
1046                            BTRFS_TREE_RELOC_OBJECTID)
1047                                ret = btrfs_inc_ref(trans, root, cow, 1);
1048                        else
1049                                ret = btrfs_inc_ref(trans, root, cow, 0);
1050                        BUG_ON(ret); /* -ENOMEM */
1051                }
1052                if (new_flags != 0) {
1053                        int level = btrfs_header_level(buf);
1054
1055                        ret = btrfs_set_disk_extent_flags(trans, root,
1056                                                          buf->start,
1057                                                          buf->len,
1058                                                          new_flags, level, 0);
1059                        if (ret)
1060                                return ret;
1061                }
1062        } else {
1063                if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
1064                        if (root->root_key.objectid ==
1065                            BTRFS_TREE_RELOC_OBJECTID)
1066                                ret = btrfs_inc_ref(trans, root, cow, 1);
1067                        else
1068                                ret = btrfs_inc_ref(trans, root, cow, 0);
1069                        BUG_ON(ret); /* -ENOMEM */
1070                        ret = btrfs_dec_ref(trans, root, buf, 1);
1071                        BUG_ON(ret); /* -ENOMEM */
1072                }
1073                clean_tree_block(trans, root->fs_info, buf);
1074                *last_ref = 1;
1075        }
1076        return 0;
1077}
1078
1079/*
1080 * does the dirty work in cow of a single block.  The parent block (if
1081 * supplied) is updated to point to the new cow copy.  The new buffer is marked
1082 * dirty and returned locked.  If you modify the block it needs to be marked
1083 * dirty again.
1084 *
1085 * search_start -- an allocation hint for the new block
1086 *
1087 * empty_size -- a hint that you plan on doing more cow.  This is the size in
1088 * bytes the allocator should try to find free next to the block it returns.
1089 * This is just a hint and may be ignored by the allocator.
1090 */
1091static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1092                             struct btrfs_root *root,
1093                             struct extent_buffer *buf,
1094                             struct extent_buffer *parent, int parent_slot,
1095                             struct extent_buffer **cow_ret,
1096                             u64 search_start, u64 empty_size)
1097{
1098        struct btrfs_disk_key disk_key;
1099        struct extent_buffer *cow;
1100        int level, ret;
1101        int last_ref = 0;
1102        int unlock_orig = 0;
1103        u64 parent_start = 0;
1104
1105        if (*cow_ret == buf)
1106                unlock_orig = 1;
1107
1108        btrfs_assert_tree_locked(buf);
1109
1110        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
1111                trans->transid != root->fs_info->running_transaction->transid);
1112        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
1113                trans->transid != root->last_trans);
1114
1115        level = btrfs_header_level(buf);
1116
1117        if (level == 0)
1118                btrfs_item_key(buf, &disk_key, 0);
1119        else
1120                btrfs_node_key(buf, &disk_key, 0);
1121
1122        if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
1123                parent_start = parent->start;
1124
1125        cow = btrfs_alloc_tree_block(trans, root, parent_start,
1126                        root->root_key.objectid, &disk_key, level,
1127                        search_start, empty_size);
1128        if (IS_ERR(cow))
1129                return PTR_ERR(cow);
1130
1131        /* cow is set to blocking by btrfs_init_new_buffer */
1132
1133        copy_extent_buffer(cow, buf, 0, 0, cow->len);
1134        btrfs_set_header_bytenr(cow, cow->start);
1135        btrfs_set_header_generation(cow, trans->transid);
1136        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
1137        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
1138                                     BTRFS_HEADER_FLAG_RELOC);
1139        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1140                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
1141        else
1142                btrfs_set_header_owner(cow, root->root_key.objectid);
1143
1144        write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
1145                            BTRFS_FSID_SIZE);
1146
1147        ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1148        if (ret) {
1149                btrfs_abort_transaction(trans, ret);
1150                return ret;
1151        }
1152
1153        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
1154                ret = btrfs_reloc_cow_block(trans, root, buf, cow);
1155                if (ret) {
1156                        btrfs_abort_transaction(trans, ret);
1157                        return ret;
1158                }
1159        }
1160
1161        if (buf == root->node) {
1162                WARN_ON(parent && parent != buf);
1163                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
1164                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
1165                        parent_start = buf->start;
1166
1167                extent_buffer_get(cow);
1168                tree_mod_log_set_root_pointer(root, cow, 1);
1169                rcu_assign_pointer(root->node, cow);
1170
1171                btrfs_free_tree_block(trans, root, buf, parent_start,
1172                                      last_ref);
1173                free_extent_buffer(buf);
1174                add_root_to_dirty_list(root);
1175        } else {
1176                WARN_ON(trans->transid != btrfs_header_generation(parent));
1177                tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
1178                                        MOD_LOG_KEY_REPLACE, GFP_NOFS);
1179                btrfs_set_node_blockptr(parent, parent_slot,
1180                                        cow->start);
1181                btrfs_set_node_ptr_generation(parent, parent_slot,
1182                                              trans->transid);
1183                btrfs_mark_buffer_dirty(parent);
1184                if (last_ref) {
1185                        ret = tree_mod_log_free_eb(root->fs_info, buf);
1186                        if (ret) {
1187                                btrfs_abort_transaction(trans, ret);
1188                                return ret;
1189                        }
1190                }
1191                btrfs_free_tree_block(trans, root, buf, parent_start,
1192                                      last_ref);
1193        }
1194        if (unlock_orig)
1195                btrfs_tree_unlock(buf);
1196        free_extent_buffer_stale(buf);
1197        btrfs_mark_buffer_dirty(cow);
1198        *cow_ret = cow;
1199        return 0;
1200}
1201
1202/*
1203 * returns the logical address of the oldest predecessor of the given root.
1204 * entries older than time_seq are ignored.
1205 */
1206static struct tree_mod_elem *
1207__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
1208                           struct extent_buffer *eb_root, u64 time_seq)
1209{
1210        struct tree_mod_elem *tm;
1211        struct tree_mod_elem *found = NULL;
1212        u64 root_logical = eb_root->start;
1213        int looped = 0;
1214
1215        if (!time_seq)
1216                return NULL;
1217
1218        /*
1219         * the very last operation that's logged for a root is the
1220         * replacement operation (if it is replaced at all). this has
1221         * the logical address of the *new* root, making it the very
1222         * first operation that's logged for this root.
1223         */
1224        while (1) {
1225                tm = tree_mod_log_search_oldest(fs_info, root_logical,
1226                                                time_seq);
1227                if (!looped && !tm)
1228                        return NULL;
1229                /*
1230                 * if there are no tree operation for the oldest root, we simply
1231                 * return it. this should only happen if that (old) root is at
1232                 * level 0.
1233                 */
1234                if (!tm)
1235                        break;
1236
1237                /*
1238                 * if there's an operation that's not a root replacement, we
1239                 * found the oldest version of our root. normally, we'll find a
1240                 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1241                 */
1242                if (tm->op != MOD_LOG_ROOT_REPLACE)
1243                        break;
1244
1245                found = tm;
1246                root_logical = tm->old_root.logical;
1247                looped = 1;
1248        }
1249
1250        /* if there's no old root to return, return what we found instead */
1251        if (!found)
1252                found = tm;
1253
1254        return found;
1255}
1256
1257/*
1258 * tm is a pointer to the first operation to rewind within eb. then, all
1259 * previous operations will be rewound (until we reach something older than
1260 * time_seq).
1261 */
1262static void
1263__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
1264                      u64 time_seq, struct tree_mod_elem *first_tm)
1265{
1266        u32 n;
1267        struct rb_node *next;
1268        struct tree_mod_elem *tm = first_tm;
1269        unsigned long o_dst;
1270        unsigned long o_src;
1271        unsigned long p_size = sizeof(struct btrfs_key_ptr);
1272
1273        n = btrfs_header_nritems(eb);
1274        tree_mod_log_read_lock(fs_info);
1275        while (tm && tm->seq >= time_seq) {
1276                /*
1277                 * all the operations are recorded with the operator used for
1278                 * the modification. as we're going backwards, we do the
1279                 * opposite of each operation here.
1280                 */
1281                switch (tm->op) {
1282                case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1283                        BUG_ON(tm->slot < n);
1284                        /* Fallthrough */
1285                case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1286                case MOD_LOG_KEY_REMOVE:
1287                        btrfs_set_node_key(eb, &tm->key, tm->slot);
1288                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1289                        btrfs_set_node_ptr_generation(eb, tm->slot,
1290                                                      tm->generation);
1291                        n++;
1292                        break;
1293                case MOD_LOG_KEY_REPLACE:
1294                        BUG_ON(tm->slot >= n);
1295                        btrfs_set_node_key(eb, &tm->key, tm->slot);
1296                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1297                        btrfs_set_node_ptr_generation(eb, tm->slot,
1298                                                      tm->generation);
1299                        break;
1300                case MOD_LOG_KEY_ADD:
1301                        /* if a move operation is needed it's in the log */
1302                        n--;
1303                        break;
1304                case MOD_LOG_MOVE_KEYS:
1305                        o_dst = btrfs_node_key_ptr_offset(tm->slot);
1306                        o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
1307                        memmove_extent_buffer(eb, o_dst, o_src,
1308                                              tm->move.nr_items * p_size);
1309                        break;
1310                case MOD_LOG_ROOT_REPLACE:
1311                        /*
1312                         * this operation is special. for roots, this must be
1313                         * handled explicitly before rewinding.
1314                         * for non-roots, this operation may exist if the node
1315                         * was a root: root A -> child B; then A gets empty and
1316                         * B is promoted to the new root. in the mod log, we'll
1317                         * have a root-replace operation for B, a tree block
1318                         * that is no root. we simply ignore that operation.
1319                         */
1320                        break;
1321                }
1322                next = rb_next(&tm->node);
1323                if (!next)
1324                        break;
1325                tm = container_of(next, struct tree_mod_elem, node);
1326                if (tm->logical != first_tm->logical)
1327                        break;
1328        }
1329        tree_mod_log_read_unlock(fs_info);
1330        btrfs_set_header_nritems(eb, n);
1331}
1332
1333/*
1334 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
1335 * is returned. If rewind operations happen, a fresh buffer is returned. The
1336 * returned buffer is always read-locked. If the returned buffer is not the
1337 * input buffer, the lock on the input buffer is released and the input buffer
1338 * is freed (its refcount is decremented).
1339 */
1340static struct extent_buffer *
1341tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1342                    struct extent_buffer *eb, u64 time_seq)
1343{
1344        struct extent_buffer *eb_rewin;
1345        struct tree_mod_elem *tm;
1346
1347        if (!time_seq)
1348                return eb;
1349
1350        if (btrfs_header_level(eb) == 0)
1351                return eb;
1352
1353        tm = tree_mod_log_search(fs_info, eb->start, time_seq);
1354        if (!tm)
1355                return eb;
1356
1357        btrfs_set_path_blocking(path);
1358        btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
1359
1360        if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1361                BUG_ON(tm->slot != 0);
1362                eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start,
1363                                                eb->len);
1364                if (!eb_rewin) {
1365                        btrfs_tree_read_unlock_blocking(eb);
1366                        free_extent_buffer(eb);
1367                        return NULL;
1368                }
1369                btrfs_set_header_bytenr(eb_rewin, eb->start);
1370                btrfs_set_header_backref_rev(eb_rewin,
1371                                             btrfs_header_backref_rev(eb));
1372                btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1373                btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
1374        } else {
1375                eb_rewin = btrfs_clone_extent_buffer(eb);
1376                if (!eb_rewin) {
1377                        btrfs_tree_read_unlock_blocking(eb);
1378                        free_extent_buffer(eb);
1379                        return NULL;
1380                }
1381        }
1382
1383        btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
1384        btrfs_tree_read_unlock_blocking(eb);
1385        free_extent_buffer(eb);
1386
1387        extent_buffer_get(eb_rewin);
1388        btrfs_tree_read_lock(eb_rewin);
1389        __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1390        WARN_ON(btrfs_header_nritems(eb_rewin) >
1391                BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
1392
1393        return eb_rewin;
1394}
1395
1396/*
1397 * get_old_root() rewinds the state of @root's root node to the given @time_seq
1398 * value. If there are no changes, the current root->root_node is returned. If
1399 * anything changed in between, there's a fresh buffer allocated on which the
1400 * rewind operations are done. In any case, the returned buffer is read locked.
1401 * Returns NULL on error (with no locks held).
1402 */
1403static inline struct extent_buffer *
1404get_old_root(struct btrfs_root *root, u64 time_seq)
1405{
1406        struct tree_mod_elem *tm;
1407        struct extent_buffer *eb = NULL;
1408        struct extent_buffer *eb_root;
1409        struct extent_buffer *old;
1410        struct tree_mod_root *old_root = NULL;
1411        u64 old_generation = 0;
1412        u64 logical;
1413
1414        eb_root = btrfs_read_lock_root_node(root);
1415        tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1416        if (!tm)
1417                return eb_root;
1418
1419        if (tm->op == MOD_LOG_ROOT_REPLACE) {
1420                old_root = &tm->old_root;
1421                old_generation = tm->generation;
1422                logical = old_root->logical;
1423        } else {
1424                logical = eb_root->start;
1425        }
1426
1427        tm = tree_mod_log_search(root->fs_info, logical, time_seq);
1428        if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1429                btrfs_tree_read_unlock(eb_root);
1430                free_extent_buffer(eb_root);
1431                old = read_tree_block(root, logical, 0);
1432                if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
1433                        if (!IS_ERR(old))
1434                                free_extent_buffer(old);
1435                        btrfs_warn(root->fs_info,
1436                                "failed to read tree block %llu from get_old_root", logical);
1437                } else {
1438                        eb = btrfs_clone_extent_buffer(old);
1439                        free_extent_buffer(old);
1440                }
1441        } else if (old_root) {
1442                btrfs_tree_read_unlock(eb_root);
1443                free_extent_buffer(eb_root);
1444                eb = alloc_dummy_extent_buffer(root->fs_info, logical,
1445                                        root->nodesize);
1446        } else {
1447                btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
1448                eb = btrfs_clone_extent_buffer(eb_root);
1449                btrfs_tree_read_unlock_blocking(eb_root);
1450                free_extent_buffer(eb_root);
1451        }
1452
1453        if (!eb)
1454                return NULL;
1455        extent_buffer_get(eb);
1456        btrfs_tree_read_lock(eb);
1457        if (old_root) {
1458                btrfs_set_header_bytenr(eb, eb->start);
1459                btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1460                btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
1461                btrfs_set_header_level(eb, old_root->level);
1462                btrfs_set_header_generation(eb, old_generation);
1463        }
1464        if (tm)
1465                __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
1466        else
1467                WARN_ON(btrfs_header_level(eb) != 0);
1468        WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
1469
1470        return eb;
1471}
1472
1473int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1474{
1475        struct tree_mod_elem *tm;
1476        int level;
1477        struct extent_buffer *eb_root = btrfs_root_node(root);
1478
1479        tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1480        if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1481                level = tm->old_root.level;
1482        } else {
1483                level = btrfs_header_level(eb_root);
1484        }
1485        free_extent_buffer(eb_root);
1486
1487        return level;
1488}
1489
1490static inline int should_cow_block(struct btrfs_trans_handle *trans,
1491                                   struct btrfs_root *root,
1492                                   struct extent_buffer *buf)
1493{
1494        if (btrfs_is_testing(root->fs_info))
1495                return 0;
1496
1497        /* ensure we can see the force_cow */
1498        smp_rmb();
1499
1500        /*
1501         * We do not need to cow a block if
1502         * 1) this block is not created or changed in this transaction;
1503         * 2) this block does not belong to TREE_RELOC tree;
1504         * 3) the root is not forced COW.
1505         *
1506         * What is forced COW:
1507         *    when we create snapshot during committing the transaction,
1508         *    after we've finished coping src root, we must COW the shared
1509         *    block to ensure the metadata consistency.
1510         */
1511        if (btrfs_header_generation(buf) == trans->transid &&
1512            !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1513            !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1514              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1515            !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1516                return 0;
1517        return 1;
1518}
1519
1520/*
1521 * cows a single block, see __btrfs_cow_block for the real work.
1522 * This version of it has extra checks so that a block isn't COWed more than
1523 * once per transaction, as long as it hasn't been written yet
1524 */
1525noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1526                    struct btrfs_root *root, struct extent_buffer *buf,
1527                    struct extent_buffer *parent, int parent_slot,
1528                    struct extent_buffer **cow_ret)
1529{
1530        u64 search_start;
1531        int ret;
1532
1533        if (trans->transaction != root->fs_info->running_transaction)
1534                WARN(1, KERN_CRIT "trans %llu running %llu\n",
1535                       trans->transid,
1536                       root->fs_info->running_transaction->transid);
1537
1538        if (trans->transid != root->fs_info->generation)
1539                WARN(1, KERN_CRIT "trans %llu running %llu\n",
1540                       trans->transid, root->fs_info->generation);
1541
1542        if (!should_cow_block(trans, root, buf)) {
1543                trans->dirty = true;
1544                *cow_ret = buf;
1545                return 0;
1546        }
1547
1548        search_start = buf->start & ~((u64)SZ_1G - 1);
1549
1550        if (parent)
1551                btrfs_set_lock_blocking(parent);
1552        btrfs_set_lock_blocking(buf);
1553
1554        ret = __btrfs_cow_block(trans, root, buf, parent,
1555                                 parent_slot, cow_ret, search_start, 0);
1556
1557        trace_btrfs_cow_block(root, buf, *cow_ret);
1558
1559        return ret;
1560}
1561
1562/*
1563 * helper function for defrag to decide if two blocks pointed to by a
1564 * node are actually close by
1565 */
1566static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1567{
1568        if (blocknr < other && other - (blocknr + blocksize) < 32768)
1569                return 1;
1570        if (blocknr > other && blocknr - (other + blocksize) < 32768)
1571                return 1;
1572        return 0;
1573}
1574
1575/*
1576 * compare two keys in a memcmp fashion
1577 */
1578static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
1579{
1580        struct btrfs_key k1;
1581
1582        btrfs_disk_key_to_cpu(&k1, disk);
1583
1584        return btrfs_comp_cpu_keys(&k1, k2);
1585}
1586
1587/*
1588 * same as comp_keys only with two btrfs_key's
1589 */
1590int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
1591{
1592        if (k1->objectid > k2->objectid)
1593                return 1;
1594        if (k1->objectid < k2->objectid)
1595                return -1;
1596        if (k1->type > k2->type)
1597                return 1;
1598        if (k1->type < k2->type)
1599                return -1;
1600        if (k1->offset > k2->offset)
1601                return 1;
1602        if (k1->offset < k2->offset)
1603                return -1;
1604        return 0;
1605}
1606
1607/*
1608 * this is used by the defrag code to go through all the
1609 * leaves pointed to by a node and reallocate them so that
1610 * disk order is close to key order
1611 */
1612int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1613                       struct btrfs_root *root, struct extent_buffer *parent,
1614                       int start_slot, u64 *last_ret,
1615                       struct btrfs_key *progress)
1616{
1617        struct extent_buffer *cur;
1618        u64 blocknr;
1619        u64 gen;
1620        u64 search_start = *last_ret;
1621        u64 last_block = 0;
1622        u64 other;
1623        u32 parent_nritems;
1624        int end_slot;
1625        int i;
1626        int err = 0;
1627        int parent_level;
1628        int uptodate;
1629        u32 blocksize;
1630        int progress_passed = 0;
1631        struct btrfs_disk_key disk_key;
1632
1633        parent_level = btrfs_header_level(parent);
1634
1635        WARN_ON(trans->transaction != root->fs_info->running_transaction);
1636        WARN_ON(trans->transid != root->fs_info->generation);
1637
1638        parent_nritems = btrfs_header_nritems(parent);
1639        blocksize = root->nodesize;
1640        end_slot = parent_nritems - 1;
1641
1642        if (parent_nritems <= 1)
1643                return 0;
1644
1645        btrfs_set_lock_blocking(parent);
1646
1647        for (i = start_slot; i <= end_slot; i++) {
1648                int close = 1;
1649
1650                btrfs_node_key(parent, &disk_key, i);
1651                if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1652                        continue;
1653
1654                progress_passed = 1;
1655                blocknr = btrfs_node_blockptr(parent, i);
1656                gen = btrfs_node_ptr_generation(parent, i);
1657                if (last_block == 0)
1658                        last_block = blocknr;
1659
1660                if (i > 0) {
1661                        other = btrfs_node_blockptr(parent, i - 1);
1662                        close = close_blocks(blocknr, other, blocksize);
1663                }
1664                if (!close && i < end_slot) {
1665                        other = btrfs_node_blockptr(parent, i + 1);
1666                        close = close_blocks(blocknr, other, blocksize);
1667                }
1668                if (close) {
1669                        last_block = blocknr;
1670                        continue;
1671                }
1672
1673                cur = btrfs_find_tree_block(root->fs_info, blocknr);
1674                if (cur)
1675                        uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1676                else
1677                        uptodate = 0;
1678                if (!cur || !uptodate) {
1679                        if (!cur) {
1680                                cur = read_tree_block(root, blocknr, gen);
1681                                if (IS_ERR(cur)) {
1682                                        return PTR_ERR(cur);
1683                                } else if (!extent_buffer_uptodate(cur)) {
1684                                        free_extent_buffer(cur);
1685                                        return -EIO;
1686                                }
1687                        } else if (!uptodate) {
1688                                err = btrfs_read_buffer(cur, gen);
1689                                if (err) {
1690                                        free_extent_buffer(cur);
1691                                        return err;
1692                                }
1693                        }
1694                }
1695                if (search_start == 0)
1696                        search_start = last_block;
1697
1698                btrfs_tree_lock(cur);
1699                btrfs_set_lock_blocking(cur);
1700                err = __btrfs_cow_block(trans, root, cur, parent, i,
1701                                        &cur, search_start,
1702                                        min(16 * blocksize,
1703                                            (end_slot - i) * blocksize));
1704                if (err) {
1705                        btrfs_tree_unlock(cur);
1706                        free_extent_buffer(cur);
1707                        break;
1708                }
1709                search_start = cur->start;
1710                last_block = cur->start;
1711                *last_ret = search_start;
1712                btrfs_tree_unlock(cur);
1713                free_extent_buffer(cur);
1714        }
1715        return err;
1716}
1717
1718
1719/*
1720 * search for key in the extent_buffer.  The items start at offset p,
1721 * and they are item_size apart.  There are 'max' items in p.
1722 *
1723 * the slot in the array is returned via slot, and it points to
1724 * the place where you would insert key if it is not found in
1725 * the array.
1726 *
1727 * slot may point to max if the key is bigger than all of the keys
1728 */
1729static noinline int generic_bin_search(struct extent_buffer *eb,
1730                                       unsigned long p,
1731                                       int item_size, struct btrfs_key *key,
1732                                       int max, int *slot)
1733{
1734        int low = 0;
1735        int high = max;
1736        int mid;
1737        int ret;
1738        struct btrfs_disk_key *tmp = NULL;
1739        struct btrfs_disk_key unaligned;
1740        unsigned long offset;
1741        char *kaddr = NULL;
1742        unsigned long map_start = 0;
1743        unsigned long map_len = 0;
1744        int err;
1745
1746        if (low > high) {
1747                btrfs_err(eb->fs_info,
1748                 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1749                          __func__, low, high, eb->start,
1750                          btrfs_header_owner(eb), btrfs_header_level(eb));
1751                return -EINVAL;
1752        }
1753
1754        while (low < high) {
1755                mid = (low + high) / 2;
1756                offset = p + mid * item_size;
1757
1758                if (!kaddr || offset < map_start ||
1759                    (offset + sizeof(struct btrfs_disk_key)) >
1760                    map_start + map_len) {
1761
1762                        err = map_private_extent_buffer(eb, offset,
1763                                                sizeof(struct btrfs_disk_key),
1764                                                &kaddr, &map_start, &map_len);
1765
1766                        if (!err) {
1767                                tmp = (struct btrfs_disk_key *)(kaddr + offset -
1768                                                        map_start);
1769                        } else if (err == 1) {
1770                                read_extent_buffer(eb, &unaligned,
1771                                                   offset, sizeof(unaligned));
1772                                tmp = &unaligned;
1773                        } else {
1774                                return err;
1775                        }
1776
1777                } else {
1778                        tmp = (struct btrfs_disk_key *)(kaddr + offset -
1779                                                        map_start);
1780                }
1781                ret = comp_keys(tmp, key);
1782
1783                if (ret < 0)
1784                        low = mid + 1;
1785                else if (ret > 0)
1786                        high = mid;
1787                else {
1788                        *slot = mid;
1789                        return 0;
1790                }
1791        }
1792        *slot = low;
1793        return 1;
1794}
1795
1796/*
1797 * simple bin_search frontend that does the right thing for
1798 * leaves vs nodes
1799 */
1800static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1801                      int level, int *slot)
1802{
1803        if (level == 0)
1804                return generic_bin_search(eb,
1805                                          offsetof(struct btrfs_leaf, items),
1806                                          sizeof(struct btrfs_item),
1807                                          key, btrfs_header_nritems(eb),
1808                                          slot);
1809        else
1810                return generic_bin_search(eb,
1811                                          offsetof(struct btrfs_node, ptrs),
1812                                          sizeof(struct btrfs_key_ptr),
1813                                          key, btrfs_header_nritems(eb),
1814                                          slot);
1815}
1816
1817int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1818                     int level, int *slot)
1819{
1820        return bin_search(eb, key, level, slot);
1821}
1822
1823static void root_add_used(struct btrfs_root *root, u32 size)
1824{
1825        spin_lock(&root->accounting_lock);
1826        btrfs_set_root_used(&root->root_item,
1827                            btrfs_root_used(&root->root_item) + size);
1828        spin_unlock(&root->accounting_lock);
1829}
1830
1831static void root_sub_used(struct btrfs_root *root, u32 size)
1832{
1833        spin_lock(&root->accounting_lock);
1834        btrfs_set_root_used(&root->root_item,
1835                            btrfs_root_used(&root->root_item) - size);
1836        spin_unlock(&root->accounting_lock);
1837}
1838
1839/* given a node and slot number, this reads the blocks it points to.  The
1840 * extent buffer is returned with a reference taken (but unlocked).
1841 */
1842static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1843                                   struct extent_buffer *parent, int slot)
1844{
1845        int level = btrfs_header_level(parent);
1846        struct extent_buffer *eb;
1847
1848        if (slot < 0 || slot >= btrfs_header_nritems(parent))
1849                return ERR_PTR(-ENOENT);
1850
1851        BUG_ON(level == 0);
1852
1853        eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
1854                             btrfs_node_ptr_generation(parent, slot));
1855        if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1856                free_extent_buffer(eb);
1857                eb = ERR_PTR(-EIO);
1858        }
1859
1860        return eb;
1861}
1862
1863/*
1864 * node level balancing, used to make sure nodes are in proper order for
1865 * item deletion.  We balance from the top down, so we have to make sure
1866 * that a deletion won't leave an node completely empty later on.
1867 */
1868static noinline int balance_level(struct btrfs_trans_handle *trans,
1869                         struct btrfs_root *root,
1870                         struct btrfs_path *path, int level)
1871{
1872        struct extent_buffer *right = NULL;
1873        struct extent_buffer *mid;
1874        struct extent_buffer *left = NULL;
1875        struct extent_buffer *parent = NULL;
1876        int ret = 0;
1877        int wret;
1878        int pslot;
1879        int orig_slot = path->slots[level];
1880        u64 orig_ptr;
1881
1882        if (level == 0)
1883                return 0;
1884
1885        mid = path->nodes[level];
1886
1887        WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1888                path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1889        WARN_ON(btrfs_header_generation(mid) != trans->transid);
1890
1891        orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1892
1893        if (level < BTRFS_MAX_LEVEL - 1) {
1894                parent = path->nodes[level + 1];
1895                pslot = path->slots[level + 1];
1896        }
1897
1898        /*
1899         * deal with the case where there is only one pointer in the root
1900         * by promoting the node below to a root
1901         */
1902        if (!parent) {
1903                struct extent_buffer *child;
1904
1905                if (btrfs_header_nritems(mid) != 1)
1906                        return 0;
1907
1908                /* promote the child to a root */
1909                child = read_node_slot(root, mid, 0);
1910                if (IS_ERR(child)) {
1911                        ret = PTR_ERR(child);
1912                        btrfs_handle_fs_error(root->fs_info, ret, NULL);
1913                        goto enospc;
1914                }
1915
1916                btrfs_tree_lock(child);
1917                btrfs_set_lock_blocking(child);
1918                ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1919                if (ret) {
1920                        btrfs_tree_unlock(child);
1921                        free_extent_buffer(child);
1922                        goto enospc;
1923                }
1924
1925                tree_mod_log_set_root_pointer(root, child, 1);
1926                rcu_assign_pointer(root->node, child);
1927
1928                add_root_to_dirty_list(root);
1929                btrfs_tree_unlock(child);
1930
1931                path->locks[level] = 0;
1932                path->nodes[level] = NULL;
1933                clean_tree_block(trans, root->fs_info, mid);
1934                btrfs_tree_unlock(mid);
1935                /* once for the path */
1936                free_extent_buffer(mid);
1937
1938                root_sub_used(root, mid->len);
1939                btrfs_free_tree_block(trans, root, mid, 0, 1);
1940                /* once for the root ptr */
1941                free_extent_buffer_stale(mid);
1942                return 0;
1943        }
1944        if (btrfs_header_nritems(mid) >
1945            BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1946                return 0;
1947
1948        left = read_node_slot(root, parent, pslot - 1);
1949        if (IS_ERR(left))
1950                left = NULL;
1951
1952        if (left) {
1953                btrfs_tree_lock(left);
1954                btrfs_set_lock_blocking(left);
1955                wret = btrfs_cow_block(trans, root, left,
1956                                       parent, pslot - 1, &left);
1957                if (wret) {
1958                        ret = wret;
1959                        goto enospc;
1960                }
1961        }
1962
1963        right = read_node_slot(root, parent, pslot + 1);
1964        if (IS_ERR(right))
1965                right = NULL;
1966
1967        if (right) {
1968                btrfs_tree_lock(right);
1969                btrfs_set_lock_blocking(right);
1970                wret = btrfs_cow_block(trans, root, right,
1971                                       parent, pslot + 1, &right);
1972                if (wret) {
1973                        ret = wret;
1974                        goto enospc;
1975                }
1976        }
1977
1978        /* first, try to make some room in the middle buffer */
1979        if (left) {
1980                orig_slot += btrfs_header_nritems(left);
1981                wret = push_node_left(trans, root, left, mid, 1);
1982                if (wret < 0)
1983                        ret = wret;
1984        }
1985
1986        /*
1987         * then try to empty the right most buffer into the middle
1988         */
1989        if (right) {
1990                wret = push_node_left(trans, root, mid, right, 1);
1991                if (wret < 0 && wret != -ENOSPC)
1992                        ret = wret;
1993                if (btrfs_header_nritems(right) == 0) {
1994                        clean_tree_block(trans, root->fs_info, right);
1995                        btrfs_tree_unlock(right);
1996                        del_ptr(root, path, level + 1, pslot + 1);
1997                        root_sub_used(root, right->len);
1998                        btrfs_free_tree_block(trans, root, right, 0, 1);
1999                        free_extent_buffer_stale(right);
2000                        right = NULL;
2001                } else {
2002                        struct btrfs_disk_key right_key;
2003                        btrfs_node_key(right, &right_key, 0);
2004                        tree_mod_log_set_node_key(root->fs_info, parent,
2005                                                  pslot + 1, 0);
2006                        btrfs_set_node_key(parent, &right_key, pslot + 1);
2007                        btrfs_mark_buffer_dirty(parent);
2008                }
2009        }
2010        if (btrfs_header_nritems(mid) == 1) {
2011                /*
2012                 * we're not allowed to leave a node with one item in the
2013                 * tree during a delete.  A deletion from lower in the tree
2014                 * could try to delete the only pointer in this node.
2015                 * So, pull some keys from the left.
2016                 * There has to be a left pointer at this point because
2017                 * otherwise we would have pulled some pointers from the
2018                 * right
2019                 */
2020                if (!left) {
2021                        ret = -EROFS;
2022                        btrfs_handle_fs_error(root->fs_info, ret, NULL);
2023                        goto enospc;
2024                }
2025                wret = balance_node_right(trans, root, mid, left);
2026                if (wret < 0) {
2027                        ret = wret;
2028                        goto enospc;
2029                }
2030                if (wret == 1) {
2031                        wret = push_node_left(trans, root, left, mid, 1);
2032                        if (wret < 0)
2033                                ret = wret;
2034                }
2035                BUG_ON(wret == 1);
2036        }
2037        if (btrfs_header_nritems(mid) == 0) {
2038                clean_tree_block(trans, root->fs_info, mid);
2039                btrfs_tree_unlock(mid);
2040                del_ptr(root, path, level + 1, pslot);
2041                root_sub_used(root, mid->len);
2042                btrfs_free_tree_block(trans, root, mid, 0, 1);
2043                free_extent_buffer_stale(mid);
2044                mid = NULL;
2045        } else {
2046                /* update the parent key to reflect our changes */
2047                struct btrfs_disk_key mid_key;
2048                btrfs_node_key(mid, &mid_key, 0);
2049                tree_mod_log_set_node_key(root->fs_info, parent,
2050                                          pslot, 0);
2051                btrfs_set_node_key(parent, &mid_key, pslot);
2052                btrfs_mark_buffer_dirty(parent);
2053        }
2054
2055        /* update the path */
2056        if (left) {
2057                if (btrfs_header_nritems(left) > orig_slot) {
2058                        extent_buffer_get(left);
2059                        /* left was locked after cow */
2060                        path->nodes[level] = left;
2061                        path->slots[level + 1] -= 1;
2062                        path->slots[level] = orig_slot;
2063                        if (mid) {
2064                                btrfs_tree_unlock(mid);
2065                                free_extent_buffer(mid);
2066                        }
2067                } else {
2068                        orig_slot -= btrfs_header_nritems(left);
2069                        path->slots[level] = orig_slot;
2070                }
2071        }
2072        /* double check we haven't messed things up */
2073        if (orig_ptr !=
2074            btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2075                BUG();
2076enospc:
2077        if (right) {
2078                btrfs_tree_unlock(right);
2079                free_extent_buffer(right);
2080        }
2081        if (left) {
2082                if (path->nodes[level] != left)
2083                        btrfs_tree_unlock(left);
2084                free_extent_buffer(left);
2085        }
2086        return ret;
2087}
2088
2089/* Node balancing for insertion.  Here we only split or push nodes around
2090 * when they are completely full.  This is also done top down, so we
2091 * have to be pessimistic.
2092 */
2093static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2094                                          struct btrfs_root *root,
2095                                          struct btrfs_path *path, int level)
2096{
2097        struct extent_buffer *right = NULL;
2098        struct extent_buffer *mid;
2099        struct extent_buffer *left = NULL;
2100        struct extent_buffer *parent = NULL;
2101        int ret = 0;
2102        int wret;
2103        int pslot;
2104        int orig_slot = path->slots[level];
2105
2106        if (level == 0)
2107                return 1;
2108
2109        mid = path->nodes[level];
2110        WARN_ON(btrfs_header_generation(mid) != trans->transid);
2111
2112        if (level < BTRFS_MAX_LEVEL - 1) {
2113                parent = path->nodes[level + 1];
2114                pslot = path->slots[level + 1];
2115        }
2116
2117        if (!parent)
2118                return 1;
2119
2120        left = read_node_slot(root, parent, pslot - 1);
2121        if (IS_ERR(left))
2122                left = NULL;
2123
2124        /* first, try to make some room in the middle buffer */
2125        if (left) {
2126                u32 left_nr;
2127
2128                btrfs_tree_lock(left);
2129                btrfs_set_lock_blocking(left);
2130
2131                left_nr = btrfs_header_nritems(left);
2132                if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2133                        wret = 1;
2134                } else {
2135                        ret = btrfs_cow_block(trans, root, left, parent,
2136                                              pslot - 1, &left);
2137                        if (ret)
2138                                wret = 1;
2139                        else {
2140                                wret = push_node_left(trans, root,
2141                                                      left, mid, 0);
2142                        }
2143                }
2144                if (wret < 0)
2145                        ret = wret;
2146                if (wret == 0) {
2147                        struct btrfs_disk_key disk_key;
2148                        orig_slot += left_nr;
2149                        btrfs_node_key(mid, &disk_key, 0);
2150                        tree_mod_log_set_node_key(root->fs_info, parent,
2151                                                  pslot, 0);
2152                        btrfs_set_node_key(parent, &disk_key, pslot);
2153                        btrfs_mark_buffer_dirty(parent);
2154                        if (btrfs_header_nritems(left) > orig_slot) {
2155                                path->nodes[level] = left;
2156                                path->slots[level + 1] -= 1;
2157                                path->slots[level] = orig_slot;
2158                                btrfs_tree_unlock(mid);
2159                                free_extent_buffer(mid);
2160                        } else {
2161                                orig_slot -=
2162                                        btrfs_header_nritems(left);
2163                                path->slots[level] = orig_slot;
2164                                btrfs_tree_unlock(left);
2165                                free_extent_buffer(left);
2166                        }
2167                        return 0;
2168                }
2169                btrfs_tree_unlock(left);
2170                free_extent_buffer(left);
2171        }
2172        right = read_node_slot(root, parent, pslot + 1);
2173        if (IS_ERR(right))
2174                right = NULL;
2175
2176        /*
2177         * then try to empty the right most buffer into the middle
2178         */
2179        if (right) {
2180                u32 right_nr;
2181
2182                btrfs_tree_lock(right);
2183                btrfs_set_lock_blocking(right);
2184
2185                right_nr = btrfs_header_nritems(right);
2186                if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2187                        wret = 1;
2188                } else {
2189                        ret = btrfs_cow_block(trans, root, right,
2190                                              parent, pslot + 1,
2191                                              &right);
2192                        if (ret)
2193                                wret = 1;
2194                        else {
2195                                wret = balance_node_right(trans, root,
2196                                                          right, mid);
2197                        }
2198                }
2199                if (wret < 0)
2200                        ret = wret;
2201                if (wret == 0) {
2202                        struct btrfs_disk_key disk_key;
2203
2204                        btrfs_node_key(right, &disk_key, 0);
2205                        tree_mod_log_set_node_key(root->fs_info, parent,
2206                                                  pslot + 1, 0);
2207                        btrfs_set_node_key(parent, &disk_key, pslot + 1);
2208                        btrfs_mark_buffer_dirty(parent);
2209
2210                        if (btrfs_header_nritems(mid) <= orig_slot) {
2211                                path->nodes[level] = right;
2212                                path->slots[level + 1] += 1;
2213                                path->slots[level] = orig_slot -
2214                                        btrfs_header_nritems(mid);
2215                                btrfs_tree_unlock(mid);
2216                                free_extent_buffer(mid);
2217                        } else {
2218                                btrfs_tree_unlock(right);
2219                                free_extent_buffer(right);
2220                        }
2221                        return 0;
2222                }
2223                btrfs_tree_unlock(right);
2224                free_extent_buffer(right);
2225        }
2226        return 1;
2227}
2228
2229/*
2230 * readahead one full node of leaves, finding things that are close
2231 * to the block in 'slot', and triggering ra on them.
2232 */
2233static void reada_for_search(struct btrfs_root *root,
2234                             struct btrfs_path *path,
2235                             int level, int slot, u64 objectid)
2236{
2237        struct extent_buffer *node;
2238        struct btrfs_disk_key disk_key;
2239        u32 nritems;
2240        u64 search;
2241        u64 target;
2242        u64 nread = 0;
2243        struct extent_buffer *eb;
2244        u32 nr;
2245        u32 blocksize;
2246        u32 nscan = 0;
2247
2248        if (level != 1)
2249                return;
2250
2251        if (!path->nodes[level])
2252                return;
2253
2254        node = path->nodes[level];
2255
2256        search = btrfs_node_blockptr(node, slot);
2257        blocksize = root->nodesize;
2258        eb = btrfs_find_tree_block(root->fs_info, search);
2259        if (eb) {
2260                free_extent_buffer(eb);
2261                return;
2262        }
2263
2264        target = search;
2265
2266        nritems = btrfs_header_nritems(node);
2267        nr = slot;
2268
2269        while (1) {
2270                if (path->reada == READA_BACK) {
2271                        if (nr == 0)
2272                                break;
2273                        nr--;
2274                } else if (path->reada == READA_FORWARD) {
2275                        nr++;
2276                        if (nr >= nritems)
2277                                break;
2278                }
2279                if (path->reada == READA_BACK && objectid) {
2280                        btrfs_node_key(node, &disk_key, nr);
2281                        if (btrfs_disk_key_objectid(&disk_key) != objectid)
2282                                break;
2283                }
2284                search = btrfs_node_blockptr(node, nr);
2285                if ((search <= target && target - search <= 65536) ||
2286                    (search > target && search - target <= 65536)) {
2287                        readahead_tree_block(root, search);
2288                        nread += blocksize;
2289                }
2290                nscan++;
2291                if ((nread > 65536 || nscan > 32))
2292                        break;
2293        }
2294}
2295
2296static noinline void reada_for_balance(struct btrfs_root *root,
2297                                       struct btrfs_path *path, int level)
2298{
2299        int slot;
2300        int nritems;
2301        struct extent_buffer *parent;
2302        struct extent_buffer *eb;
2303        u64 gen;
2304        u64 block1 = 0;
2305        u64 block2 = 0;
2306
2307        parent = path->nodes[level + 1];
2308        if (!parent)
2309                return;
2310
2311        nritems = btrfs_header_nritems(parent);
2312        slot = path->slots[level + 1];
2313
2314        if (slot > 0) {
2315                block1 = btrfs_node_blockptr(parent, slot - 1);
2316                gen = btrfs_node_ptr_generation(parent, slot - 1);
2317                eb = btrfs_find_tree_block(root->fs_info, block1);
2318                /*
2319                 * if we get -eagain from btrfs_buffer_uptodate, we
2320                 * don't want to return eagain here.  That will loop
2321                 * forever
2322                 */
2323                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2324                        block1 = 0;
2325                free_extent_buffer(eb);
2326        }
2327        if (slot + 1 < nritems) {
2328                block2 = btrfs_node_blockptr(parent, slot + 1);
2329                gen = btrfs_node_ptr_generation(parent, slot + 1);
2330                eb = btrfs_find_tree_block(root->fs_info, block2);
2331                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2332                        block2 = 0;
2333                free_extent_buffer(eb);
2334        }
2335
2336        if (block1)
2337                readahead_tree_block(root, block1);
2338        if (block2)
2339                readahead_tree_block(root, block2);
2340}
2341
2342
2343/*
2344 * when we walk down the tree, it is usually safe to unlock the higher layers
2345 * in the tree.  The exceptions are when our path goes through slot 0, because
2346 * operations on the tree might require changing key pointers higher up in the
2347 * tree.
2348 *
2349 * callers might also have set path->keep_locks, which tells this code to keep
2350 * the lock if the path points to the last slot in the block.  This is part of
2351 * walking through the tree, and selecting the next slot in the higher block.
2352 *
2353 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2354 * if lowest_unlock is 1, level 0 won't be unlocked
2355 */
2356static noinline void unlock_up(struct btrfs_path *path, int level,
2357                               int lowest_unlock, int min_write_lock_level,
2358                               int *write_lock_level)
2359{
2360        int i;
2361        int skip_level = level;
2362        int no_skips = 0;
2363        struct extent_buffer *t;
2364
2365        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2366                if (!path->nodes[i])
2367                        break;
2368                if (!path->locks[i])
2369                        break;
2370                if (!no_skips && path->slots[i] == 0) {
2371                        skip_level = i + 1;
2372                        continue;
2373                }
2374                if (!no_skips && path->keep_locks) {
2375                        u32 nritems;
2376                        t = path->nodes[i];
2377                        nritems = btrfs_header_nritems(t);
2378                        if (nritems < 1 || path->slots[i] >= nritems - 1) {
2379                                skip_level = i + 1;
2380                                continue;
2381                        }
2382                }
2383                if (skip_level < i && i >= lowest_unlock)
2384                        no_skips = 1;
2385
2386                t = path->nodes[i];
2387                if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
2388                        btrfs_tree_unlock_rw(t, path->locks[i]);
2389                        path->locks[i] = 0;
2390                        if (write_lock_level &&
2391                            i > min_write_lock_level &&
2392                            i <= *write_lock_level) {
2393                                *write_lock_level = i - 1;
2394                        }
2395                }
2396        }
2397}
2398
2399/*
2400 * This releases any locks held in the path starting at level and
2401 * going all the way up to the root.
2402 *
2403 * btrfs_search_slot will keep the lock held on higher nodes in a few
2404 * corner cases, such as COW of the block at slot zero in the node.  This
2405 * ignores those rules, and it should only be called when there are no
2406 * more updates to be done higher up in the tree.
2407 */
2408noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2409{
2410        int i;
2411
2412        if (path->keep_locks)
2413                return;
2414
2415        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2416                if (!path->nodes[i])
2417                        continue;
2418                if (!path->locks[i])
2419                        continue;
2420                btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2421                path->locks[i] = 0;
2422        }
2423}
2424
2425/*
2426 * helper function for btrfs_search_slot.  The goal is to find a block
2427 * in cache without setting the path to blocking.  If we find the block
2428 * we return zero and the path is unchanged.
2429 *
2430 * If we can't find the block, we set the path blocking and do some
2431 * reada.  -EAGAIN is returned and the search must be repeated.
2432 */
2433static int
2434read_block_for_search(struct btrfs_trans_handle *trans,
2435                       struct btrfs_root *root, struct btrfs_path *p,
2436                       struct extent_buffer **eb_ret, int level, int slot,
2437                       struct btrfs_key *key, u64 time_seq)
2438{
2439        u64 blocknr;
2440        u64 gen;
2441        struct extent_buffer *b = *eb_ret;
2442        struct extent_buffer *tmp;
2443        int ret;
2444
2445        blocknr = btrfs_node_blockptr(b, slot);
2446        gen = btrfs_node_ptr_generation(b, slot);
2447
2448        tmp = btrfs_find_tree_block(root->fs_info, blocknr);
2449        if (tmp) {
2450                /* first we do an atomic uptodate check */
2451                if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2452                        *eb_ret = tmp;
2453                        return 0;
2454                }
2455
2456                /* the pages were up to date, but we failed
2457                 * the generation number check.  Do a full
2458                 * read for the generation number that is correct.
2459                 * We must do this without dropping locks so
2460                 * we can trust our generation number
2461                 */
2462                btrfs_set_path_blocking(p);
2463
2464                /* now we're allowed to do a blocking uptodate check */
2465                ret = btrfs_read_buffer(tmp, gen);
2466                if (!ret) {
2467                        *eb_ret = tmp;
2468                        return 0;
2469                }
2470                free_extent_buffer(tmp);
2471                btrfs_release_path(p);
2472                return -EIO;
2473        }
2474
2475        /*
2476         * reduce lock contention at high levels
2477         * of the btree by dropping locks before
2478         * we read.  Don't release the lock on the current
2479         * level because we need to walk this node to figure
2480         * out which blocks to read.
2481         */
2482        btrfs_unlock_up_safe(p, level + 1);
2483        btrfs_set_path_blocking(p);
2484
2485        free_extent_buffer(tmp);
2486        if (p->reada != READA_NONE)
2487                reada_for_search(root, p, level, slot, key->objectid);
2488
2489        btrfs_release_path(p);
2490
2491        ret = -EAGAIN;
2492        tmp = read_tree_block(root, blocknr, 0);
2493        if (!IS_ERR(tmp)) {
2494                /*
2495                 * If the read above didn't mark this buffer up to date,
2496                 * it will never end up being up to date.  Set ret to EIO now
2497                 * and give up so that our caller doesn't loop forever
2498                 * on our EAGAINs.
2499                 */
2500                if (!btrfs_buffer_uptodate(tmp, 0, 0))
2501                        ret = -EIO;
2502                free_extent_buffer(tmp);
2503        } else {
2504                ret = PTR_ERR(tmp);
2505        }
2506        return ret;
2507}
2508
2509/*
2510 * helper function for btrfs_search_slot.  This does all of the checks
2511 * for node-level blocks and does any balancing required based on
2512 * the ins_len.
2513 *
2514 * If no extra work was required, zero is returned.  If we had to
2515 * drop the path, -EAGAIN is returned and btrfs_search_slot must
2516 * start over
2517 */
2518static int
2519setup_nodes_for_search(struct btrfs_trans_handle *trans,
2520                       struct btrfs_root *root, struct btrfs_path *p,
2521                       struct extent_buffer *b, int level, int ins_len,
2522                       int *write_lock_level)
2523{
2524        int ret;
2525        if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2526            BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
2527                int sret;
2528
2529                if (*write_lock_level < level + 1) {
2530                        *write_lock_level = level + 1;
2531                        btrfs_release_path(p);
2532                        goto again;
2533                }
2534
2535                btrfs_set_path_blocking(p);
2536                reada_for_balance(root, p, level);
2537                sret = split_node(trans, root, p, level);
2538                btrfs_clear_path_blocking(p, NULL, 0);
2539
2540                BUG_ON(sret > 0);
2541                if (sret) {
2542                        ret = sret;
2543                        goto done;
2544                }
2545                b = p->nodes[level];
2546        } else if (ins_len < 0 && btrfs_header_nritems(b) <
2547                   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
2548                int sret;
2549
2550                if (*write_lock_level < level + 1) {
2551                        *write_lock_level = level + 1;
2552                        btrfs_release_path(p);
2553                        goto again;
2554                }
2555
2556                btrfs_set_path_blocking(p);
2557                reada_for_balance(root, p, level);
2558                sret = balance_level(trans, root, p, level);
2559                btrfs_clear_path_blocking(p, NULL, 0);
2560
2561                if (sret) {
2562                        ret = sret;
2563                        goto done;
2564                }
2565                b = p->nodes[level];
2566                if (!b) {
2567                        btrfs_release_path(p);
2568                        goto again;
2569                }
2570                BUG_ON(btrfs_header_nritems(b) == 1);
2571        }
2572        return 0;
2573
2574again:
2575        ret = -EAGAIN;
2576done:
2577        return ret;
2578}
2579
2580static void key_search_validate(struct extent_buffer *b,
2581                                struct btrfs_key *key,
2582                                int level)
2583{
2584#ifdef CONFIG_BTRFS_ASSERT
2585        struct btrfs_disk_key disk_key;
2586
2587        btrfs_cpu_key_to_disk(&disk_key, key);
2588
2589        if (level == 0)
2590                ASSERT(!memcmp_extent_buffer(b, &disk_key,
2591                    offsetof(struct btrfs_leaf, items[0].key),
2592                    sizeof(disk_key)));
2593        else
2594                ASSERT(!memcmp_extent_buffer(b, &disk_key,
2595                    offsetof(struct btrfs_node, ptrs[0].key),
2596                    sizeof(disk_key)));
2597#endif
2598}
2599
2600static int key_search(struct extent_buffer *b, struct btrfs_key *key,
2601                      int level, int *prev_cmp, int *slot)
2602{
2603        if (*prev_cmp != 0) {
2604                *prev_cmp = bin_search(b, key, level, slot);
2605                return *prev_cmp;
2606        }
2607
2608        key_search_validate(b, key, level);
2609        *slot = 0;
2610
2611        return 0;
2612}
2613
2614int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2615                u64 iobjectid, u64 ioff, u8 key_type,
2616                struct btrfs_key *found_key)
2617{
2618        int ret;
2619        struct btrfs_key key;
2620        struct extent_buffer *eb;
2621
2622        ASSERT(path);
2623        ASSERT(found_key);
2624
2625        key.type = key_type;
2626        key.objectid = iobjectid;
2627        key.offset = ioff;
2628
2629        ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2630        if (ret < 0)
2631                return ret;
2632
2633        eb = path->nodes[0];
2634        if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2635                ret = btrfs_next_leaf(fs_root, path);
2636                if (ret)
2637                        return ret;
2638                eb = path->nodes[0];
2639        }
2640
2641        btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2642        if (found_key->type != key.type ||
2643                        found_key->objectid != key.objectid)
2644                return 1;
2645
2646        return 0;
2647}
2648
2649/*
2650 * look for key in the tree.  path is filled in with nodes along the way
2651 * if key is found, we return zero and you can find the item in the leaf
2652 * level of the path (level 0)
2653 *
2654 * If the key isn't found, the path points to the slot where it should
2655 * be inserted, and 1 is returned.  If there are other errors during the
2656 * search a negative error number is returned.
2657 *
2658 * if ins_len > 0, nodes and leaves will be split as we walk down the
2659 * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2660 * possible)
2661 */
2662int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
2663                      *root, struct btrfs_key *key, struct btrfs_path *p, int
2664                      ins_len, int cow)
2665{
2666        struct extent_buffer *b;
2667        int slot;
2668        int ret;
2669        int err;
2670        int level;
2671        int lowest_unlock = 1;
2672        int root_lock;
2673        /* everything at write_lock_level or lower must be write locked */
2674        int write_lock_level = 0;
2675        u8 lowest_level = 0;
2676        int min_write_lock_level;
2677        int prev_cmp;
2678
2679        lowest_level = p->lowest_level;
2680        WARN_ON(lowest_level && ins_len > 0);
2681        WARN_ON(p->nodes[0] != NULL);
2682        BUG_ON(!cow && ins_len);
2683
2684        if (ins_len < 0) {
2685                lowest_unlock = 2;
2686
2687                /* when we are removing items, we might have to go up to level
2688                 * two as we update tree pointers  Make sure we keep write
2689                 * for those levels as well
2690                 */
2691                write_lock_level = 2;
2692        } else if (ins_len > 0) {
2693                /*
2694                 * for inserting items, make sure we have a write lock on
2695                 * level 1 so we can update keys
2696                 */
2697                write_lock_level = 1;
2698        }
2699
2700        if (!cow)
2701                write_lock_level = -1;
2702
2703        if (cow && (p->keep_locks || p->lowest_level))
2704                write_lock_level = BTRFS_MAX_LEVEL;
2705
2706        min_write_lock_level = write_lock_level;
2707
2708again:
2709        prev_cmp = -1;
2710        /*
2711         * we try very hard to do read locks on the root
2712         */
2713        root_lock = BTRFS_READ_LOCK;
2714        level = 0;
2715        if (p->search_commit_root) {
2716                /*
2717                 * the commit roots are read only
2718                 * so we always do read locks
2719                 */
2720                if (p->need_commit_sem)
2721                        down_read(&root->fs_info->commit_root_sem);
2722                b = root->commit_root;
2723                extent_buffer_get(b);
2724                level = btrfs_header_level(b);
2725                if (p->need_commit_sem)
2726                        up_read(&root->fs_info->commit_root_sem);
2727                if (!p->skip_locking)
2728                        btrfs_tree_read_lock(b);
2729        } else {
2730                if (p->skip_locking) {
2731                        b = btrfs_root_node(root);
2732                        level = btrfs_header_level(b);
2733                } else {
2734                        /* we don't know the level of the root node
2735                         * until we actually have it read locked
2736                         */
2737                        b = btrfs_read_lock_root_node(root);
2738                        level = btrfs_header_level(b);
2739                        if (level <= write_lock_level) {
2740                                /* whoops, must trade for write lock */
2741                                btrfs_tree_read_unlock(b);
2742                                free_extent_buffer(b);
2743                                b = btrfs_lock_root_node(root);
2744                                root_lock = BTRFS_WRITE_LOCK;
2745
2746                                /* the level might have changed, check again */
2747                                level = btrfs_header_level(b);
2748                        }
2749                }
2750        }
2751        p->nodes[level] = b;
2752        if (!p->skip_locking)
2753                p->locks[level] = root_lock;
2754
2755        while (b) {
2756                level = btrfs_header_level(b);
2757
2758                /*
2759                 * setup the path here so we can release it under lock
2760                 * contention with the cow code
2761                 */
2762                if (cow) {
2763                        /*
2764                         * if we don't really need to cow this block
2765                         * then we don't want to set the path blocking,
2766                         * so we test it here
2767                         */
2768                        if (!should_cow_block(trans, root, b)) {
2769                                trans->dirty = true;
2770                                goto cow_done;
2771                        }
2772
2773                        /*
2774                         * must have write locks on this node and the
2775                         * parent
2776                         */
2777                        if (level > write_lock_level ||
2778                            (level + 1 > write_lock_level &&
2779                            level + 1 < BTRFS_MAX_LEVEL &&
2780                            p->nodes[level + 1])) {
2781                                write_lock_level = level + 1;
2782                                btrfs_release_path(p);
2783                                goto again;
2784                        }
2785
2786                        btrfs_set_path_blocking(p);
2787                        err = btrfs_cow_block(trans, root, b,
2788                                              p->nodes[level + 1],
2789                                              p->slots[level + 1], &b);
2790                        if (err) {
2791                                ret = err;
2792                                goto done;
2793                        }
2794                }
2795cow_done:
2796                p->nodes[level] = b;
2797                btrfs_clear_path_blocking(p, NULL, 0);
2798
2799                /*
2800                 * we have a lock on b and as long as we aren't changing
2801                 * the tree, there is no way to for the items in b to change.
2802                 * It is safe to drop the lock on our parent before we
2803                 * go through the expensive btree search on b.
2804                 *
2805                 * If we're inserting or deleting (ins_len != 0), then we might
2806                 * be changing slot zero, which may require changing the parent.
2807                 * So, we can't drop the lock until after we know which slot
2808                 * we're operating on.
2809                 */
2810                if (!ins_len && !p->keep_locks) {
2811                        int u = level + 1;
2812
2813                        if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2814                                btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2815                                p->locks[u] = 0;
2816                        }
2817                }
2818
2819                ret = key_search(b, key, level, &prev_cmp, &slot);
2820                if (ret < 0)
2821                        goto done;
2822
2823                if (level != 0) {
2824                        int dec = 0;
2825                        if (ret && slot > 0) {
2826                                dec = 1;
2827                                slot -= 1;
2828                        }
2829                        p->slots[level] = slot;
2830                        err = setup_nodes_for_search(trans, root, p, b, level,
2831                                             ins_len, &write_lock_level);
2832                        if (err == -EAGAIN)
2833                                goto again;
2834                        if (err) {
2835                                ret = err;
2836                                goto done;
2837                        }
2838                        b = p->nodes[level];
2839                        slot = p->slots[level];
2840
2841                        /*
2842                         * slot 0 is special, if we change the key
2843                         * we have to update the parent pointer
2844                         * which means we must have a write lock
2845                         * on the parent
2846                         */
2847                        if (slot == 0 && ins_len &&
2848                            write_lock_level < level + 1) {
2849                                write_lock_level = level + 1;
2850                                btrfs_release_path(p);
2851                                goto again;
2852                        }
2853
2854                        unlock_up(p, level, lowest_unlock,
2855                                  min_write_lock_level, &write_lock_level);
2856
2857                        if (level == lowest_level) {
2858                                if (dec)
2859                                        p->slots[level]++;
2860                                goto done;
2861                        }
2862
2863                        err = read_block_for_search(trans, root, p,
2864                                                    &b, level, slot, key, 0);
2865                        if (err == -EAGAIN)
2866                                goto again;
2867                        if (err) {
2868                                ret = err;
2869                                goto done;
2870                        }
2871
2872                        if (!p->skip_locking) {
2873                                level = btrfs_header_level(b);
2874                                if (level <= write_lock_level) {
2875                                        err = btrfs_try_tree_write_lock(b);
2876                                        if (!err) {
2877                                                btrfs_set_path_blocking(p);
2878                                                btrfs_tree_lock(b);
2879                                                btrfs_clear_path_blocking(p, b,
2880                                                                  BTRFS_WRITE_LOCK);
2881                                        }
2882                                        p->locks[level] = BTRFS_WRITE_LOCK;
2883                                } else {
2884                                        err = btrfs_tree_read_lock_atomic(b);
2885                                        if (!err) {
2886                                                btrfs_set_path_blocking(p);
2887                                                btrfs_tree_read_lock(b);
2888                                                btrfs_clear_path_blocking(p, b,
2889                                                                  BTRFS_READ_LOCK);
2890                                        }
2891                                        p->locks[level] = BTRFS_READ_LOCK;
2892                                }
2893                                p->nodes[level] = b;
2894                        }
2895                } else {
2896                        p->slots[level] = slot;
2897                        if (ins_len > 0 &&
2898                            btrfs_leaf_free_space(root, b) < ins_len) {
2899                                if (write_lock_level < 1) {
2900                                        write_lock_level = 1;
2901                                        btrfs_release_path(p);
2902                                        goto again;
2903                                }
2904
2905                                btrfs_set_path_blocking(p);
2906                                err = split_leaf(trans, root, key,
2907                                                 p, ins_len, ret == 0);
2908                                btrfs_clear_path_blocking(p, NULL, 0);
2909
2910                                BUG_ON(err > 0);
2911                                if (err) {
2912                                        ret = err;
2913                                        goto done;
2914                                }
2915                        }
2916                        if (!p->search_for_split)
2917                                unlock_up(p, level, lowest_unlock,
2918                                          min_write_lock_level, &write_lock_level);
2919                        goto done;
2920                }
2921        }
2922        ret = 1;
2923done:
2924        /*
2925         * we don't really know what they plan on doing with the path
2926         * from here on, so for now just mark it as blocking
2927         */
2928        if (!p->leave_spinning)
2929                btrfs_set_path_blocking(p);
2930        if (ret < 0 && !p->skip_release_on_error)
2931                btrfs_release_path(p);
2932        return ret;
2933}
2934
2935/*
2936 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2937 * current state of the tree together with the operations recorded in the tree
2938 * modification log to search for the key in a previous version of this tree, as
2939 * denoted by the time_seq parameter.
2940 *
2941 * Naturally, there is no support for insert, delete or cow operations.
2942 *
2943 * The resulting path and return value will be set up as if we called
2944 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2945 */
2946int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
2947                          struct btrfs_path *p, u64 time_seq)
2948{
2949        struct extent_buffer *b;
2950        int slot;
2951        int ret;
2952        int err;
2953        int level;
2954        int lowest_unlock = 1;
2955        u8 lowest_level = 0;
2956        int prev_cmp = -1;
2957
2958        lowest_level = p->lowest_level;
2959        WARN_ON(p->nodes[0] != NULL);
2960
2961        if (p->search_commit_root) {
2962                BUG_ON(time_seq);
2963                return btrfs_search_slot(NULL, root, key, p, 0, 0);
2964        }
2965
2966again:
2967        b = get_old_root(root, time_seq);
2968        level = btrfs_header_level(b);
2969        p->locks[level] = BTRFS_READ_LOCK;
2970
2971        while (b) {
2972                level = btrfs_header_level(b);
2973                p->nodes[level] = b;
2974                btrfs_clear_path_blocking(p, NULL, 0);
2975
2976                /*
2977                 * we have a lock on b and as long as we aren't changing
2978                 * the tree, there is no way to for the items in b to change.
2979                 * It is safe to drop the lock on our parent before we
2980                 * go through the expensive btree search on b.
2981                 */
2982                btrfs_unlock_up_safe(p, level + 1);
2983
2984                /*
2985                 * Since we can unwind ebs we want to do a real search every
2986                 * time.
2987                 */
2988                prev_cmp = -1;
2989                ret = key_search(b, key, level, &prev_cmp, &slot);
2990
2991                if (level != 0) {
2992                        int dec = 0;
2993                        if (ret && slot > 0) {
2994                                dec = 1;
2995                                slot -= 1;
2996                        }
2997                        p->slots[level] = slot;
2998                        unlock_up(p, level, lowest_unlock, 0, NULL);
2999
3000                        if (level == lowest_level) {
3001                                if (dec)
3002                                        p->slots[level]++;
3003                                goto done;
3004                        }
3005
3006                        err = read_block_for_search(NULL, root, p, &b, level,
3007                                                    slot, key, time_seq);
3008                        if (err == -EAGAIN)
3009                                goto again;
3010                        if (err) {
3011                                ret = err;
3012                                goto done;
3013                        }
3014
3015                        level = btrfs_header_level(b);
3016                        err = btrfs_tree_read_lock_atomic(b);
3017                        if (!err) {
3018                                btrfs_set_path_blocking(p);
3019                                btrfs_tree_read_lock(b);
3020                                btrfs_clear_path_blocking(p, b,
3021                                                          BTRFS_READ_LOCK);
3022                        }
3023                        b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
3024                        if (!b) {
3025                                ret = -ENOMEM;
3026                                goto done;
3027                        }
3028                        p->locks[level] = BTRFS_READ_LOCK;
3029                        p->nodes[level] = b;
3030                } else {
3031                        p->slots[level] = slot;
3032                        unlock_up(p, level, lowest_unlock, 0, NULL);
3033                        goto done;
3034                }
3035        }
3036        ret = 1;
3037done:
3038        if (!p->leave_spinning)
3039                btrfs_set_path_blocking(p);
3040        if (ret < 0)
3041                btrfs_release_path(p);
3042
3043        return ret;
3044}
3045
3046/*
3047 * helper to use instead of search slot if no exact match is needed but
3048 * instead the next or previous item should be returned.
3049 * When find_higher is true, the next higher item is returned, the next lower
3050 * otherwise.
3051 * When return_any and find_higher are both true, and no higher item is found,
3052 * return the next lower instead.
3053 * When return_any is true and find_higher is false, and no lower item is found,
3054 * return the next higher instead.
3055 * It returns 0 if any item is found, 1 if none is found (tree empty), and
3056 * < 0 on error
3057 */
3058int btrfs_search_slot_for_read(struct btrfs_root *root,
3059                               struct btrfs_key *key, struct btrfs_path *p,
3060                               int find_higher, int return_any)
3061{
3062        int ret;
3063        struct extent_buffer *leaf;
3064
3065again:
3066        ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3067        if (ret <= 0)
3068                return ret;
3069        /*
3070         * a return value of 1 means the path is at the position where the
3071         * item should be inserted. Normally this is the next bigger item,
3072         * but in case the previous item is the last in a leaf, path points
3073         * to the first free slot in the previous leaf, i.e. at an invalid
3074         * item.
3075         */
3076        leaf = p->nodes[0];
3077
3078        if (find_higher) {
3079                if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3080                        ret = btrfs_next_leaf(root, p);
3081                        if (ret <= 0)
3082                                return ret;
3083                        if (!return_any)
3084                                return 1;
3085                        /*
3086                         * no higher item found, return the next
3087                         * lower instead
3088                         */
3089                        return_any = 0;
3090                        find_higher = 0;
3091                        btrfs_release_path(p);
3092                        goto again;
3093                }
3094        } else {
3095                if (p->slots[0] == 0) {
3096                        ret = btrfs_prev_leaf(root, p);
3097                        if (ret < 0)
3098                                return ret;
3099                        if (!ret) {
3100                                leaf = p->nodes[0];
3101                                if (p->slots[0] == btrfs_header_nritems(leaf))
3102                                        p->slots[0]--;
3103                                return 0;
3104                        }
3105                        if (!return_any)
3106                                return 1;
3107                        /*
3108                         * no lower item found, return the next
3109                         * higher instead
3110                         */
3111                        return_any = 0;
3112                        find_higher = 1;
3113                        btrfs_release_path(p);
3114                        goto again;
3115                } else {
3116                        --p->slots[0];
3117                }
3118        }
3119        return 0;
3120}
3121
3122/*
3123 * adjust the pointers going up the tree, starting at level
3124 * making sure the right key of each node is points to 'key'.
3125 * This is used after shifting pointers to the left, so it stops
3126 * fixing up pointers when a given leaf/node is not in slot 0 of the
3127 * higher levels
3128 *
3129 */
3130static void fixup_low_keys(struct btrfs_fs_info *fs_info,
3131                           struct btrfs_path *path,
3132                           struct btrfs_disk_key *key, int level)
3133{
3134        int i;
3135        struct extent_buffer *t;
3136
3137        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3138                int tslot = path->slots[i];
3139                if (!path->nodes[i])
3140                        break;
3141                t = path->nodes[i];
3142                tree_mod_log_set_node_key(fs_info, t, tslot, 1);
3143                btrfs_set_node_key(t, key, tslot);
3144                btrfs_mark_buffer_dirty(path->nodes[i]);
3145                if (tslot != 0)
3146                        break;
3147        }
3148}
3149
3150/*
3151 * update item key.
3152 *
3153 * This function isn't completely safe. It's the caller's responsibility
3154 * that the new key won't break the order
3155 */
3156void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3157                             struct btrfs_path *path,
3158                             struct btrfs_key *new_key)
3159{
3160        struct btrfs_disk_key disk_key;
3161        struct extent_buffer *eb;
3162        int slot;
3163
3164        eb = path->nodes[0];
3165        slot = path->slots[0];
3166        if (slot > 0) {
3167                btrfs_item_key(eb, &disk_key, slot - 1);
3168                BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3169        }
3170        if (slot < btrfs_header_nritems(eb) - 1) {
3171                btrfs_item_key(eb, &disk_key, slot + 1);
3172                BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3173        }
3174
3175        btrfs_cpu_key_to_disk(&disk_key, new_key);
3176        btrfs_set_item_key(eb, &disk_key, slot);
3177        btrfs_mark_buffer_dirty(eb);
3178        if (slot == 0)
3179                fixup_low_keys(fs_info, path, &disk_key, 1);
3180}
3181
3182/*
3183 * try to push data from one node into the next node left in the
3184 * tree.
3185 *
3186 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3187 * error, and > 0 if there was no room in the left hand block.
3188 */
3189static int push_node_left(struct btrfs_trans_handle *trans,
3190                          struct btrfs_root *root, struct extent_buffer *dst,
3191                          struct extent_buffer *src, int empty)
3192{
3193        int push_items = 0;
3194        int src_nritems;
3195        int dst_nritems;
3196        int ret = 0;
3197
3198        src_nritems = btrfs_header_nritems(src);
3199        dst_nritems = btrfs_header_nritems(dst);
3200        push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3201        WARN_ON(btrfs_header_generation(src) != trans->transid);
3202        WARN_ON(btrfs_header_generation(dst) != trans->transid);
3203
3204        if (!empty && src_nritems <= 8)
3205                return 1;
3206
3207        if (push_items <= 0)
3208                return 1;
3209
3210        if (empty) {
3211                push_items = min(src_nritems, push_items);
3212                if (push_items < src_nritems) {
3213                        /* leave at least 8 pointers in the node if
3214                         * we aren't going to empty it
3215                         */
3216                        if (src_nritems - push_items < 8) {
3217                                if (push_items <= 8)
3218                                        return 1;
3219                                push_items -= 8;
3220                        }
3221                }
3222        } else
3223                push_items = min(src_nritems - 8, push_items);
3224
3225        ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
3226                                   push_items);
3227        if (ret) {
3228                btrfs_abort_transaction(trans, ret);
3229                return ret;
3230        }
3231        copy_extent_buffer(dst, src,
3232                           btrfs_node_key_ptr_offset(dst_nritems),
3233                           btrfs_node_key_ptr_offset(0),
3234                           push_items * sizeof(struct btrfs_key_ptr));
3235
3236        if (push_items < src_nritems) {
3237                /*
3238                 * don't call tree_mod_log_eb_move here, key removal was already
3239                 * fully logged by tree_mod_log_eb_copy above.
3240                 */
3241                memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3242                                      btrfs_node_key_ptr_offset(push_items),
3243                                      (src_nritems - push_items) *
3244                                      sizeof(struct btrfs_key_ptr));
3245        }
3246        btrfs_set_header_nritems(src, src_nritems - push_items);
3247        btrfs_set_header_nritems(dst, dst_nritems + push_items);
3248        btrfs_mark_buffer_dirty(src);
3249        btrfs_mark_buffer_dirty(dst);
3250
3251        return ret;
3252}
3253
3254/*
3255 * try to push data from one node into the next node right in the
3256 * tree.
3257 *
3258 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3259 * error, and > 0 if there was no room in the right hand block.
3260 *
3261 * this will  only push up to 1/2 the contents of the left node over
3262 */
3263static int balance_node_right(struct btrfs_trans_handle *trans,
3264                              struct btrfs_root *root,
3265                              struct extent_buffer *dst,
3266                              struct extent_buffer *src)
3267{
3268        int push_items = 0;
3269        int max_push;
3270        int src_nritems;
3271        int dst_nritems;
3272        int ret = 0;
3273
3274        WARN_ON(btrfs_header_generation(src) != trans->transid);
3275        WARN_ON(btrfs_header_generation(dst) != trans->transid);
3276
3277        src_nritems = btrfs_header_nritems(src);
3278        dst_nritems = btrfs_header_nritems(dst);
3279        push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
3280        if (push_items <= 0)
3281                return 1;
3282
3283        if (src_nritems < 4)
3284                return 1;
3285
3286        max_push = src_nritems / 2 + 1;
3287        /* don't try to empty the node */
3288        if (max_push >= src_nritems)
3289                return 1;
3290
3291        if (max_push < push_items)
3292                push_items = max_push;
3293
3294        tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
3295        memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3296                                      btrfs_node_key_ptr_offset(0),
3297                                      (dst_nritems) *
3298                                      sizeof(struct btrfs_key_ptr));
3299
3300        ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
3301                                   src_nritems - push_items, push_items);
3302        if (ret) {
3303                btrfs_abort_transaction(trans, ret);
3304                return ret;
3305        }
3306        copy_extent_buffer(dst, src,
3307                           btrfs_node_key_ptr_offset(0),
3308                           btrfs_node_key_ptr_offset(src_nritems - push_items),
3309                           push_items * sizeof(struct btrfs_key_ptr));
3310
3311        btrfs_set_header_nritems(src, src_nritems - push_items);
3312        btrfs_set_header_nritems(dst, dst_nritems + push_items);
3313
3314        btrfs_mark_buffer_dirty(src);
3315        btrfs_mark_buffer_dirty(dst);
3316
3317        return ret;
3318}
3319
3320/*
3321 * helper function to insert a new root level in the tree.
3322 * A new node is allocated, and a single item is inserted to
3323 * point to the existing root
3324 *
3325 * returns zero on success or < 0 on failure.
3326 */
3327static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3328                           struct btrfs_root *root,
3329                           struct btrfs_path *path, int level)
3330{
3331        u64 lower_gen;
3332        struct extent_buffer *lower;
3333        struct extent_buffer *c;
3334        struct extent_buffer *old;
3335        struct btrfs_disk_key lower_key;
3336
3337        BUG_ON(path->nodes[level]);
3338        BUG_ON(path->nodes[level-1] != root->node);
3339
3340        lower = path->nodes[level-1];
3341        if (level == 1)
3342                btrfs_item_key(lower, &lower_key, 0);
3343        else
3344                btrfs_node_key(lower, &lower_key, 0);
3345
3346        c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3347                                   &lower_key, level, root->node->start, 0);
3348        if (IS_ERR(c))
3349                return PTR_ERR(c);
3350
3351        root_add_used(root, root->nodesize);
3352
3353        memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
3354        btrfs_set_header_nritems(c, 1);
3355        btrfs_set_header_level(c, level);
3356        btrfs_set_header_bytenr(c, c->start);
3357        btrfs_set_header_generation(c, trans->transid);
3358        btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
3359        btrfs_set_header_owner(c, root->root_key.objectid);
3360
3361        write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
3362                            BTRFS_FSID_SIZE);
3363
3364        write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
3365                            btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE);
3366
3367        btrfs_set_node_key(c, &lower_key, 0);
3368        btrfs_set_node_blockptr(c, 0, lower->start);
3369        lower_gen = btrfs_header_generation(lower);
3370        WARN_ON(lower_gen != trans->transid);
3371
3372        btrfs_set_node_ptr_generation(c, 0, lower_gen);
3373
3374        btrfs_mark_buffer_dirty(c);
3375
3376        old = root->node;
3377        tree_mod_log_set_root_pointer(root, c, 0);
3378        rcu_assign_pointer(root->node, c);
3379
3380        /* the super has an extra ref to root->node */
3381        free_extent_buffer(old);
3382
3383        add_root_to_dirty_list(root);
3384        extent_buffer_get(c);
3385        path->nodes[level] = c;
3386        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3387        path->slots[level] = 0;
3388        return 0;
3389}
3390
3391/*
3392 * worker function to insert a single pointer in a node.
3393 * the node should have enough room for the pointer already
3394 *
3395 * slot and level indicate where you want the key to go, and
3396 * blocknr is the block the key points to.
3397 */
3398static void insert_ptr(struct btrfs_trans_handle *trans,
3399                       struct btrfs_root *root, struct btrfs_path *path,
3400                       struct btrfs_disk_key *key, u64 bytenr,
3401                       int slot, int level)
3402{
3403        struct extent_buffer *lower;
3404        int nritems;
3405        int ret;
3406
3407        BUG_ON(!path->nodes[level]);
3408        btrfs_assert_tree_locked(path->nodes[level]);
3409        lower = path->nodes[level];
3410        nritems = btrfs_header_nritems(lower);
3411        BUG_ON(slot > nritems);
3412        BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
3413        if (slot != nritems) {
3414                if (level)
3415                        tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
3416                                             slot, nritems - slot);
3417                memmove_extent_buffer(lower,
3418                              btrfs_node_key_ptr_offset(slot + 1),
3419                              btrfs_node_key_ptr_offset(slot),
3420                              (nritems - slot) * sizeof(struct btrfs_key_ptr));
3421        }
3422        if (level) {
3423                ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
3424                                              MOD_LOG_KEY_ADD, GFP_NOFS);
3425                BUG_ON(ret < 0);
3426        }
3427        btrfs_set_node_key(lower, key, slot);
3428        btrfs_set_node_blockptr(lower, slot, bytenr);
3429        WARN_ON(trans->transid == 0);
3430        btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3431        btrfs_set_header_nritems(lower, nritems + 1);
3432        btrfs_mark_buffer_dirty(lower);
3433}
3434
3435/*
3436 * split the node at the specified level in path in two.
3437 * The path is corrected to point to the appropriate node after the split
3438 *
3439 * Before splitting this tries to make some room in the node by pushing
3440 * left and right, if either one works, it returns right away.
3441 *
3442 * returns 0 on success and < 0 on failure
3443 */
3444static noinline int split_node(struct btrfs_trans_handle *trans,
3445                               struct btrfs_root *root,
3446                               struct btrfs_path *path, int level)
3447{
3448        struct extent_buffer *c;
3449        struct extent_buffer *split;
3450        struct btrfs_disk_key disk_key;
3451        int mid;
3452        int ret;
3453        u32 c_nritems;
3454
3455        c = path->nodes[level];
3456        WARN_ON(btrfs_header_generation(c) != trans->transid);
3457        if (c == root->node) {
3458                /*
3459                 * trying to split the root, lets make a new one
3460                 *
3461                 * tree mod log: We don't log_removal old root in
3462                 * insert_new_root, because that root buffer will be kept as a
3463                 * normal node. We are going to log removal of half of the
3464                 * elements below with tree_mod_log_eb_copy. We're holding a
3465                 * tree lock on the buffer, which is why we cannot race with
3466                 * other tree_mod_log users.
3467                 */
3468                ret = insert_new_root(trans, root, path, level + 1);
3469                if (ret)
3470                        return ret;
3471        } else {
3472                ret = push_nodes_for_insert(trans, root, path, level);
3473                c = path->nodes[level];
3474                if (!ret && btrfs_header_nritems(c) <
3475                    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
3476                        return 0;
3477                if (ret < 0)
3478                        return ret;
3479        }
3480
3481        c_nritems = btrfs_header_nritems(c);
3482        mid = (c_nritems + 1) / 2;
3483        btrfs_node_key(c, &disk_key, mid);
3484
3485        split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3486                        &disk_key, level, c->start, 0);
3487        if (IS_ERR(split))
3488                return PTR_ERR(split);
3489
3490        root_add_used(root, root->nodesize);
3491
3492        memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
3493        btrfs_set_header_level(split, btrfs_header_level(c));
3494        btrfs_set_header_bytenr(split, split->start);
3495        btrfs_set_header_generation(split, trans->transid);
3496        btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
3497        btrfs_set_header_owner(split, root->root_key.objectid);
3498        write_extent_buffer(split, root->fs_info->fsid,
3499                            btrfs_header_fsid(), BTRFS_FSID_SIZE);
3500        write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
3501                            btrfs_header_chunk_tree_uuid(split),
3502                            BTRFS_UUID_SIZE);
3503
3504        ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
3505                                   mid, c_nritems - mid);
3506        if (ret) {
3507                btrfs_abort_transaction(trans, ret);
3508                return ret;
3509        }
3510        copy_extent_buffer(split, c,
3511                           btrfs_node_key_ptr_offset(0),
3512                           btrfs_node_key_ptr_offset(mid),
3513                           (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3514        btrfs_set_header_nritems(split, c_nritems - mid);
3515        btrfs_set_header_nritems(c, mid);
3516        ret = 0;
3517
3518        btrfs_mark_buffer_dirty(c);
3519        btrfs_mark_buffer_dirty(split);
3520
3521        insert_ptr(trans, root, path, &disk_key, split->start,
3522                   path->slots[level + 1] + 1, level + 1);
3523
3524        if (path->slots[level] >= mid) {
3525                path->slots[level] -= mid;
3526                btrfs_tree_unlock(c);
3527                free_extent_buffer(c);
3528                path->nodes[level] = split;
3529                path->slots[level + 1] += 1;
3530        } else {
3531                btrfs_tree_unlock(split);
3532                free_extent_buffer(split);
3533        }
3534        return ret;
3535}
3536
3537/*
3538 * how many bytes are required to store the items in a leaf.  start
3539 * and nr indicate which items in the leaf to check.  This totals up the
3540 * space used both by the item structs and the item data
3541 */
3542static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3543{
3544        struct btrfs_item *start_item;
3545        struct btrfs_item *end_item;
3546        struct btrfs_map_token token;
3547        int data_len;
3548        int nritems = btrfs_header_nritems(l);
3549        int end = min(nritems, start + nr) - 1;
3550
3551        if (!nr)
3552                return 0;
3553        btrfs_init_map_token(&token);
3554        start_item = btrfs_item_nr(start);
3555        end_item = btrfs_item_nr(end);
3556        data_len = btrfs_token_item_offset(l, start_item, &token) +
3557                btrfs_token_item_size(l, start_item, &token);
3558        data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3559        data_len += sizeof(struct btrfs_item) * nr;
3560        WARN_ON(data_len < 0);
3561        return data_len;
3562}
3563
3564/*
3565 * The space between the end of the leaf items and
3566 * the start of the leaf data.  IOW, how much room
3567 * the leaf has left for both items and data
3568 */
3569noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3570                                   struct extent_buffer *leaf)
3571{
3572        int nritems = btrfs_header_nritems(leaf);
3573        int ret;
3574        ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3575        if (ret < 0) {
3576                btrfs_crit(root->fs_info,
3577                        "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3578                       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3579                       leaf_space_used(leaf, 0, nritems), nritems);
3580        }
3581        return ret;
3582}
3583
3584/*
3585 * min slot controls the lowest index we're willing to push to the
3586 * right.  We'll push up to and including min_slot, but no lower
3587 */
3588static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3589                                      struct btrfs_root *root,
3590                                      struct btrfs_path *path,
3591                                      int data_size, int empty,
3592                                      struct extent_buffer *right,
3593                                      int free_space, u32 left_nritems,
3594                                      u32 min_slot)
3595{
3596        struct extent_buffer *left = path->nodes[0];
3597        struct extent_buffer *upper = path->nodes[1];
3598        struct btrfs_map_token token;
3599        struct btrfs_disk_key disk_key;
3600        int slot;
3601        u32 i;
3602        int push_space = 0;
3603        int push_items = 0;
3604        struct btrfs_item *item;
3605        u32 nr;
3606        u32 right_nritems;
3607        u32 data_end;
3608        u32 this_item_size;
3609
3610        btrfs_init_map_token(&token);
3611
3612        if (empty)
3613                nr = 0;
3614        else
3615                nr = max_t(u32, 1, min_slot);
3616
3617        if (path->slots[0] >= left_nritems)
3618                push_space += data_size;
3619
3620        slot = path->slots[1];
3621        i = left_nritems - 1;
3622        while (i >= nr) {
3623                item = btrfs_item_nr(i);
3624
3625                if (!empty && push_items > 0) {
3626                        if (path->slots[0] > i)
3627                                break;
3628                        if (path->slots[0] == i) {
3629                                int space = btrfs_leaf_free_space(root, left);
3630                                if (space + push_space * 2 > free_space)
3631                                        break;
3632                        }
3633                }
3634
3635                if (path->slots[0] == i)
3636                        push_space += data_size;
3637
3638                this_item_size = btrfs_item_size(left, item);
3639                if (this_item_size + sizeof(*item) + push_space > free_space)
3640                        break;
3641
3642                push_items++;
3643                push_space += this_item_size + sizeof(*item);
3644                if (i == 0)
3645                        break;
3646                i--;
3647        }
3648
3649        if (push_items == 0)
3650                goto out_unlock;
3651
3652        WARN_ON(!empty && push_items == left_nritems);
3653
3654        /* push left to right */
3655        right_nritems = btrfs_header_nritems(right);
3656
3657        push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3658        push_space -= leaf_data_end(root, left);
3659
3660        /* make room in the right data area */
3661        data_end = leaf_data_end(root, right);
3662        memmove_extent_buffer(right,
3663                              btrfs_leaf_data(right) + data_end - push_space,
3664                              btrfs_leaf_data(right) + data_end,
3665                              BTRFS_LEAF_DATA_SIZE(root) - data_end);
3666
3667        /* copy from the left data area */
3668        copy_extent_buffer(right, left, btrfs_leaf_data(right) +
3669                     BTRFS_LEAF_DATA_SIZE(root) - push_space,
3670                     btrfs_leaf_data(left) + leaf_data_end(root, left),
3671                     push_space);
3672
3673        memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3674                              btrfs_item_nr_offset(0),
3675                              right_nritems * sizeof(struct btrfs_item));
3676
3677        /* copy the items from left to right */
3678        copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3679                   btrfs_item_nr_offset(left_nritems - push_items),
3680                   push_items * sizeof(struct btrfs_item));
3681
3682        /* update the item pointers */
3683        right_nritems += push_items;
3684        btrfs_set_header_nritems(right, right_nritems);
3685        push_space = BTRFS_LEAF_DATA_SIZE(root);
3686        for (i = 0; i < right_nritems; i++) {
3687                item = btrfs_item_nr(i);
3688                push_space -= btrfs_token_item_size(right, item, &token);
3689                btrfs_set_token_item_offset(right, item, push_space, &token);
3690        }
3691
3692        left_nritems -= push_items;
3693        btrfs_set_header_nritems(left, left_nritems);
3694
3695        if (left_nritems)
3696                btrfs_mark_buffer_dirty(left);
3697        else
3698                clean_tree_block(trans, root->fs_info, left);
3699
3700        btrfs_mark_buffer_dirty(right);
3701
3702        btrfs_item_key(right, &disk_key, 0);
3703        btrfs_set_node_key(upper, &disk_key, slot + 1);
3704        btrfs_mark_buffer_dirty(upper);
3705
3706        /* then fixup the leaf pointer in the path */
3707        if (path->slots[0] >= left_nritems) {
3708                path->slots[0] -= left_nritems;
3709                if (btrfs_header_nritems(path->nodes[0]) == 0)
3710                        clean_tree_block(trans, root->fs_info, path->nodes[0]);
3711                btrfs_tree_unlock(path->nodes[0]);
3712                free_extent_buffer(path->nodes[0]);
3713                path->nodes[0] = right;
3714                path->slots[1] += 1;
3715        } else {
3716                btrfs_tree_unlock(right);
3717                free_extent_buffer(right);
3718        }
3719        return 0;
3720
3721out_unlock:
3722        btrfs_tree_unlock(right);
3723        free_extent_buffer(right);
3724        return 1;
3725}
3726
3727/*
3728 * push some data in the path leaf to the right, trying to free up at
3729 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3730 *
3731 * returns 1 if the push failed because the other node didn't have enough
3732 * room, 0 if everything worked out and < 0 if there were major errors.
3733 *
3734 * this will push starting from min_slot to the end of the leaf.  It won't
3735 * push any slot lower than min_slot
3736 */
3737static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3738                           *root, struct btrfs_path *path,
3739                           int min_data_size, int data_size,
3740                           int empty, u32 min_slot)
3741{
3742        struct extent_buffer *left = path->nodes[0];
3743        struct extent_buffer *right;
3744        struct extent_buffer *upper;
3745        int slot;
3746        int free_space;
3747        u32 left_nritems;
3748        int ret;
3749
3750        if (!path->nodes[1])
3751                return 1;
3752
3753        slot = path->slots[1];
3754        upper = path->nodes[1];
3755        if (slot >= btrfs_header_nritems(upper) - 1)
3756                return 1;
3757
3758        btrfs_assert_tree_locked(path->nodes[1]);
3759
3760        right = read_node_slot(root, upper, slot + 1);
3761        /*
3762         * slot + 1 is not valid or we fail to read the right node,
3763         * no big deal, just return.
3764         */
3765        if (IS_ERR(right))
3766                return 1;
3767
3768        btrfs_tree_lock(right);
3769        btrfs_set_lock_blocking(right);
3770
3771        free_space = btrfs_leaf_free_space(root, right);
3772        if (free_space < data_size)
3773                goto out_unlock;
3774
3775        /* cow and double check */
3776        ret = btrfs_cow_block(trans, root, right, upper,
3777                              slot + 1, &right);
3778        if (ret)
3779                goto out_unlock;
3780
3781        free_space = btrfs_leaf_free_space(root, right);
3782        if (free_space < data_size)
3783                goto out_unlock;
3784
3785        left_nritems = btrfs_header_nritems(left);
3786        if (left_nritems == 0)
3787                goto out_unlock;
3788
3789        if (path->slots[0] == left_nritems && !empty) {
3790                /* Key greater than all keys in the leaf, right neighbor has
3791                 * enough room for it and we're not emptying our leaf to delete
3792                 * it, therefore use right neighbor to insert the new item and
3793                 * no need to touch/dirty our left leaft. */
3794                btrfs_tree_unlock(left);
3795                free_extent_buffer(left);
3796                path->nodes[0] = right;
3797                path->slots[0] = 0;
3798                path->slots[1]++;
3799                return 0;
3800        }
3801
3802        return __push_leaf_right(trans, root, path, min_data_size, empty,
3803                                right, free_space, left_nritems, min_slot);
3804out_unlock:
3805        btrfs_tree_unlock(right);
3806        free_extent_buffer(right);
3807        return 1;
3808}
3809
3810/*
3811 * push some data in the path leaf to the left, trying to free up at
3812 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3813 *
3814 * max_slot can put a limit on how far into the leaf we'll push items.  The
3815 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3816 * items
3817 */
3818static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3819                                     struct btrfs_root *root,
3820                                     struct btrfs_path *path, int data_size,
3821                                     int empty, struct extent_buffer *left,
3822                                     int free_space, u32 right_nritems,
3823                                     u32 max_slot)
3824{
3825        struct btrfs_disk_key disk_key;
3826        struct extent_buffer *right = path->nodes[0];
3827        int i;
3828        int push_space = 0;
3829        int push_items = 0;
3830        struct btrfs_item *item;
3831        u32 old_left_nritems;
3832        u32 nr;
3833        int ret = 0;
3834        u32 this_item_size;
3835        u32 old_left_item_size;
3836        struct btrfs_map_token token;
3837
3838        btrfs_init_map_token(&token);
3839
3840        if (empty)
3841                nr = min(right_nritems, max_slot);
3842        else
3843                nr = min(right_nritems - 1, max_slot);
3844
3845        for (i = 0; i < nr; i++) {
3846                item = btrfs_item_nr(i);
3847
3848                if (!empty && push_items > 0) {
3849                        if (path->slots[0] < i)
3850                                break;
3851                        if (path->slots[0] == i) {
3852                                int space = btrfs_leaf_free_space(root, right);
3853                                if (space + push_space * 2 > free_space)
3854                                        break;
3855                        }
3856                }
3857
3858                if (path->slots[0] == i)
3859                        push_space += data_size;
3860
3861                this_item_size = btrfs_item_size(right, item);
3862                if (this_item_size + sizeof(*item) + push_space > free_space)
3863                        break;
3864
3865                push_items++;
3866                push_space += this_item_size + sizeof(*item);
3867        }
3868
3869        if (push_items == 0) {
3870                ret = 1;
3871                goto out;
3872        }
3873        WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3874
3875        /* push data from right to left */
3876        copy_extent_buffer(left, right,
3877                           btrfs_item_nr_offset(btrfs_header_nritems(left)),
3878                           btrfs_item_nr_offset(0),
3879                           push_items * sizeof(struct btrfs_item));
3880
3881        push_space = BTRFS_LEAF_DATA_SIZE(root) -
3882                     btrfs_item_offset_nr(right, push_items - 1);
3883
3884        copy_extent_buffer(left, right, btrfs_leaf_data(left) +
3885                     leaf_data_end(root, left) - push_space,
3886                     btrfs_leaf_data(right) +
3887                     btrfs_item_offset_nr(right, push_items - 1),
3888                     push_space);
3889        old_left_nritems = btrfs_header_nritems(left);
3890        BUG_ON(old_left_nritems <= 0);
3891
3892        old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3893        for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3894                u32 ioff;
3895
3896                item = btrfs_item_nr(i);
3897
3898                ioff = btrfs_token_item_offset(left, item, &token);
3899                btrfs_set_token_item_offset(left, item,
3900                      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
3901                      &token);
3902        }
3903        btrfs_set_header_nritems(left, old_left_nritems + push_items);
3904
3905        /* fixup right node */
3906        if (push_items > right_nritems)
3907                WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3908                       right_nritems);
3909
3910        if (push_items < right_nritems) {
3911                push_space = btrfs_item_offset_nr(right, push_items - 1) -
3912                                                  leaf_data_end(root, right);
3913                memmove_extent_buffer(right, btrfs_leaf_data(right) +
3914                                      BTRFS_LEAF_DATA_SIZE(root) - push_space,
3915                                      btrfs_leaf_data(right) +
3916                                      leaf_data_end(root, right), push_space);
3917
3918                memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3919                              btrfs_item_nr_offset(push_items),
3920                             (btrfs_header_nritems(right) - push_items) *
3921                             sizeof(struct btrfs_item));
3922        }
3923        right_nritems -= push_items;
3924        btrfs_set_header_nritems(right, right_nritems);
3925        push_space = BTRFS_LEAF_DATA_SIZE(root);
3926        for (i = 0; i < right_nritems; i++) {
3927                item = btrfs_item_nr(i);
3928
3929                push_space = push_space - btrfs_token_item_size(right,
3930                                                                item, &token);
3931                btrfs_set_token_item_offset(right, item, push_space, &token);
3932        }
3933
3934        btrfs_mark_buffer_dirty(left);
3935        if (right_nritems)
3936                btrfs_mark_buffer_dirty(right);
3937        else
3938                clean_tree_block(trans, root->fs_info, right);
3939
3940        btrfs_item_key(right, &disk_key, 0);
3941        fixup_low_keys(root->fs_info, path, &disk_key, 1);
3942
3943        /* then fixup the leaf pointer in the path */
3944        if (path->slots[0] < push_items) {
3945                path->slots[0] += old_left_nritems;
3946                btrfs_tree_unlock(path->nodes[0]);
3947                free_extent_buffer(path->nodes[0]);
3948                path->nodes[0] = left;
3949                path->slots[1] -= 1;
3950        } else {
3951                btrfs_tree_unlock(left);
3952                free_extent_buffer(left);
3953                path->slots[0] -= push_items;
3954        }
3955        BUG_ON(path->slots[0] < 0);
3956        return ret;
3957out:
3958        btrfs_tree_unlock(left);
3959        free_extent_buffer(left);
3960        return ret;
3961}
3962
3963/*
3964 * push some data in the path leaf to the left, trying to free up at
3965 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3966 *
3967 * max_slot can put a limit on how far into the leaf we'll push items.  The
3968 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3969 * items
3970 */
3971static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3972                          *root, struct btrfs_path *path, int min_data_size,
3973                          int data_size, int empty, u32 max_slot)
3974{
3975        struct extent_buffer *right = path->nodes[0];
3976        struct extent_buffer *left;
3977        int slot;
3978        int free_space;
3979        u32 right_nritems;
3980        int ret = 0;
3981
3982        slot = path->slots[1];
3983        if (slot == 0)
3984                return 1;
3985        if (!path->nodes[1])
3986                return 1;
3987
3988        right_nritems = btrfs_header_nritems(right);
3989        if (right_nritems == 0)
3990                return 1;
3991
3992        btrfs_assert_tree_locked(path->nodes[1]);
3993
3994        left = read_node_slot(root, path->nodes[1], slot - 1);
3995        /*
3996         * slot - 1 is not valid or we fail to read the left node,
3997         * no big deal, just return.
3998         */
3999        if (IS_ERR(left))
4000                return 1;
4001
4002        btrfs_tree_lock(left);
4003        btrfs_set_lock_blocking(left);
4004
4005        free_space = btrfs_leaf_free_space(root, left);
4006        if (free_space < data_size) {
4007                ret = 1;
4008                goto out;
4009        }
4010
4011        /* cow and double check */
4012        ret = btrfs_cow_block(trans, root, left,
4013                              path->nodes[1], slot - 1, &left);
4014        if (ret) {
4015                /* we hit -ENOSPC, but it isn't fatal here */
4016                if (ret == -ENOSPC)
4017                        ret = 1;
4018                goto out;
4019        }
4020
4021        free_space = btrfs_leaf_free_space(root, left);
4022        if (free_space < data_size) {
4023                ret = 1;
4024                goto out;
4025        }
4026
4027        return __push_leaf_left(trans, root, path, min_data_size,
4028                               empty, left, free_space, right_nritems,
4029                               max_slot);
4030out:
4031        btrfs_tree_unlock(left);
4032        free_extent_buffer(left);
4033        return ret;
4034}
4035
4036/*
4037 * split the path's leaf in two, making sure there is at least data_size
4038 * available for the resulting leaf level of the path.
4039 */
4040static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4041                                    struct btrfs_root *root,
4042                                    struct btrfs_path *path,
4043                                    struct extent_buffer *l,
4044                                    struct extent_buffer *right,
4045                                    int slot, int mid, int nritems)
4046{
4047        int data_copy_size;
4048        int rt_data_off;
4049        int i;
4050        struct btrfs_disk_key disk_key;
4051        struct btrfs_map_token token;
4052
4053        btrfs_init_map_token(&token);
4054
4055        nritems = nritems - mid;
4056        btrfs_set_header_nritems(right, nritems);
4057        data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
4058
4059        copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4060                           btrfs_item_nr_offset(mid),
4061                           nritems * sizeof(struct btrfs_item));
4062
4063        copy_extent_buffer(right, l,
4064                     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
4065                     data_copy_size, btrfs_leaf_data(l) +
4066                     leaf_data_end(root, l), data_copy_size);
4067
4068        rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
4069                      btrfs_item_end_nr(l, mid);
4070
4071        for (i = 0; i < nritems; i++) {
4072                struct btrfs_item *item = btrfs_item_nr(i);
4073                u32 ioff;
4074
4075                ioff = btrfs_token_item_offset(right, item, &token);
4076                btrfs_set_token_item_offset(right, item,
4077                                            ioff + rt_data_off, &token);
4078        }
4079
4080        btrfs_set_header_nritems(l, mid);
4081        btrfs_item_key(right, &disk_key, 0);
4082        insert_ptr(trans, root, path, &disk_key, right->start,
4083                   path->slots[1] + 1, 1);
4084
4085        btrfs_mark_buffer_dirty(right);
4086        btrfs_mark_buffer_dirty(l);
4087        BUG_ON(path->slots[0] != slot);
4088
4089        if (mid <= slot) {
4090                btrfs_tree_unlock(path->nodes[0]);
4091                free_extent_buffer(path->nodes[0]);
4092                path->nodes[0] = right;
4093                path->slots[0] -= mid;
4094                path->slots[1] += 1;
4095        } else {
4096                btrfs_tree_unlock(right);
4097                free_extent_buffer(right);
4098        }
4099
4100        BUG_ON(path->slots[0] < 0);
4101}
4102
4103/*
4104 * double splits happen when we need to insert a big item in the middle
4105 * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4106 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4107 *          A                 B                 C
4108 *
4109 * We avoid this by trying to push the items on either side of our target
4110 * into the adjacent leaves.  If all goes well we can avoid the double split
4111 * completely.
4112 */
4113static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4114                                          struct btrfs_root *root,
4115                                          struct btrfs_path *path,
4116                                          int data_size)
4117{
4118        int ret;
4119        int progress = 0;
4120        int slot;
4121        u32 nritems;
4122        int space_needed = data_size;
4123
4124        slot = path->slots[0];
4125        if (slot < btrfs_header_nritems(path->nodes[0]))
4126                space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);
4127
4128        /*
4129         * try to push all the items after our slot into the
4130         * right leaf
4131         */
4132        ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4133        if (ret < 0)
4134                return ret;
4135
4136        if (ret == 0)
4137                progress++;
4138
4139        nritems = btrfs_header_nritems(path->nodes[0]);
4140        /*
4141         * our goal is to get our slot at the start or end of a leaf.  If
4142         * we've done so we're done
4143         */
4144        if (path->slots[0] == 0 || path->slots[0] == nritems)
4145                return 0;
4146
4147        if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4148                return 0;
4149
4150        /* try to push all the items before our slot into the next leaf */
4151        slot = path->slots[0];
4152        ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4153        if (ret < 0)
4154                return ret;
4155
4156        if (ret == 0)
4157                progress++;
4158
4159        if (progress)
4160                return 0;
4161        return 1;
4162}
4163
4164/*
4165 * split the path's leaf in two, making sure there is at least data_size
4166 * available for the resulting leaf level of the path.
4167 *
4168 * returns 0 if all went well and < 0 on failure.
4169 */
4170static noinline int split_leaf(struct btrfs_trans_handle *trans,
4171                               struct btrfs_root *root,
4172                               struct btrfs_key *ins_key,
4173                               struct btrfs_path *path, int data_size,
4174                               int extend)
4175{
4176        struct btrfs_disk_key disk_key;
4177        struct extent_buffer *l;
4178        u32 nritems;
4179        int mid;
4180        int slot;
4181        struct extent_buffer *right;
4182        struct btrfs_fs_info *fs_info = root->fs_info;
4183        int ret = 0;
4184        int wret;
4185        int split;
4186        int num_doubles = 0;
4187        int tried_avoid_double = 0;
4188
4189        l = path->nodes[0];
4190        slot = path->slots[0];
4191        if (extend && data_size + btrfs_item_size_nr(l, slot) +
4192            sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
4193                return -EOVERFLOW;
4194
4195        /* first try to make some room by pushing left and right */
4196        if (data_size && path->nodes[1]) {
4197                int space_needed = data_size;
4198
4199                if (slot < btrfs_header_nritems(l))
4200                        space_needed -= btrfs_leaf_free_space(root, l);
4201
4202                wret = push_leaf_right(trans, root, path, space_needed,
4203                                       space_needed, 0, 0);
4204                if (wret < 0)
4205                        return wret;
4206                if (wret) {
4207                        wret = push_leaf_left(trans, root, path, space_needed,
4208                                              space_needed, 0, (u32)-1);
4209                        if (wret < 0)
4210                                return wret;
4211                }
4212                l = path->nodes[0];
4213
4214                /* did the pushes work? */
4215                if (btrfs_leaf_free_space(root, l) >= data_size)
4216                        return 0;
4217        }
4218
4219        if (!path->nodes[1]) {
4220                ret = insert_new_root(trans, root, path, 1);
4221                if (ret)
4222                        return ret;
4223        }
4224again:
4225        split = 1;
4226        l = path->nodes[0];
4227        slot = path->slots[0];
4228        nritems = btrfs_header_nritems(l);
4229        mid = (nritems + 1) / 2;
4230
4231        if (mid <= slot) {
4232                if (nritems == 1 ||
4233                    leaf_space_used(l, mid, nritems - mid) + data_size >
4234                        BTRFS_LEAF_DATA_SIZE(root)) {
4235                        if (slot >= nritems) {
4236                                split = 0;
4237                        } else {
4238                                mid = slot;
4239                                if (mid != nritems &&
4240                                    leaf_space_used(l, mid, nritems - mid) +
4241                                    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4242                                        if (data_size && !tried_avoid_double)
4243                                                goto push_for_double;
4244                                        split = 2;
4245                                }
4246                        }
4247                }
4248        } else {
4249                if (leaf_space_used(l, 0, mid) + data_size >
4250                        BTRFS_LEAF_DATA_SIZE(root)) {
4251                        if (!extend && data_size && slot == 0) {
4252                                split = 0;
4253                        } else if ((extend || !data_size) && slot == 0) {
4254                                mid = 1;
4255                        } else {
4256                                mid = slot;
4257                                if (mid != nritems &&
4258                                    leaf_space_used(l, mid, nritems - mid) +
4259                                    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
4260                                        if (data_size && !tried_avoid_double)
4261                                                goto push_for_double;
4262                                        split = 2;
4263                                }
4264                        }
4265                }
4266        }
4267
4268        if (split == 0)
4269                btrfs_cpu_key_to_disk(&disk_key, ins_key);
4270        else
4271                btrfs_item_key(l, &disk_key, mid);
4272
4273        right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
4274                        &disk_key, 0, l->start, 0);
4275        if (IS_ERR(right))
4276                return PTR_ERR(right);
4277
4278        root_add_used(root, root->nodesize);
4279
4280        memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
4281        btrfs_set_header_bytenr(right, right->start);
4282        btrfs_set_header_generation(right, trans->transid);
4283        btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
4284        btrfs_set_header_owner(right, root->root_key.objectid);
4285        btrfs_set_header_level(right, 0);
4286        write_extent_buffer(right, fs_info->fsid,
4287                            btrfs_header_fsid(), BTRFS_FSID_SIZE);
4288
4289        write_extent_buffer(right, fs_info->chunk_tree_uuid,
4290                            btrfs_header_chunk_tree_uuid(right),
4291                            BTRFS_UUID_SIZE);
4292
4293        if (split == 0) {
4294                if (mid <= slot) {
4295                        btrfs_set_header_nritems(right, 0);
4296                        insert_ptr(trans, root, path, &disk_key, right->start,
4297                                   path->slots[1] + 1, 1);
4298                        btrfs_tree_unlock(path->nodes[0]);
4299                        free_extent_buffer(path->nodes[0]);
4300                        path->nodes[0] = right;
4301                        path->slots[0] = 0;
4302                        path->slots[1] += 1;
4303                } else {
4304                        btrfs_set_header_nritems(right, 0);
4305                        insert_ptr(trans, root, path, &disk_key, right->start,
4306                                          path->slots[1], 1);
4307                        btrfs_tree_unlock(path->nodes[0]);
4308                        free_extent_buffer(path->nodes[0]);
4309                        path->nodes[0] = right;
4310                        path->slots[0] = 0;
4311                        if (path->slots[1] == 0)
4312                                fixup_low_keys(fs_info, path, &disk_key, 1);
4313                }
4314                /*
4315                 * We create a new leaf 'right' for the required ins_len and
4316                 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
4317                 * the content of ins_len to 'right'.
4318                 */
4319                return ret;
4320        }
4321
4322        copy_for_split(trans, root, path, l, right, slot, mid, nritems);
4323
4324        if (split == 2) {
4325                BUG_ON(num_doubles != 0);
4326                num_doubles++;
4327                goto again;
4328        }
4329
4330        return 0;
4331
4332push_for_double:
4333        push_for_double_split(trans, root, path, data_size);
4334        tried_avoid_double = 1;
4335        if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
4336                return 0;
4337        goto again;
4338}
4339
4340static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4341                                         struct btrfs_root *root,
4342                                         struct btrfs_path *path, int ins_len)
4343{
4344        struct btrfs_key key;
4345        struct extent_buffer *leaf;
4346        struct btrfs_file_extent_item *fi;
4347        u64 extent_len = 0;
4348        u32 item_size;
4349        int ret;
4350
4351        leaf = path->nodes[0];
4352        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4353
4354        BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4355               key.type != BTRFS_EXTENT_CSUM_KEY);
4356
4357        if (btrfs_leaf_free_space(root, leaf) >= ins_len)
4358                return 0;
4359
4360        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4361        if (key.type == BTRFS_EXTENT_DATA_KEY) {
4362                fi = btrfs_item_ptr(leaf, path->slots[0],
4363                                    struct btrfs_file_extent_item);
4364                extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4365        }
4366        btrfs_release_path(path);
4367
4368        path->keep_locks = 1;
4369        path->search_for_split = 1;
4370        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4371        path->search_for_split = 0;
4372        if (ret > 0)
4373                ret = -EAGAIN;
4374        if (ret < 0)
4375                goto err;
4376
4377        ret = -EAGAIN;
4378        leaf = path->nodes[0];
4379        /* if our item isn't there, return now */
4380        if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4381                goto err;
4382
4383        /* the leaf has  changed, it now has room.  return now */
4384        if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
4385                goto err;
4386
4387        if (key.type == BTRFS_EXTENT_DATA_KEY) {
4388                fi = btrfs_item_ptr(leaf, path->slots[0],
4389                                    struct btrfs_file_extent_item);
4390                if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4391                        goto err;
4392        }
4393
4394        btrfs_set_path_blocking(path);
4395        ret = split_leaf(trans, root, &key, path, ins_len, 1);
4396        if (ret)
4397                goto err;
4398
4399        path->keep_locks = 0;
4400        btrfs_unlock_up_safe(path, 1);
4401        return 0;
4402err:
4403        path->keep_locks = 0;
4404        return ret;
4405}
4406
4407static noinline int split_item(struct btrfs_trans_handle *trans,
4408                               struct btrfs_root *root,
4409                               struct btrfs_path *path,
4410                               struct btrfs_key *new_key,
4411                               unsigned long split_offset)
4412{
4413        struct extent_buffer *leaf;
4414        struct btrfs_item *item;
4415        struct btrfs_item *new_item;
4416        int slot;
4417        char *buf;
4418        u32 nritems;
4419        u32 item_size;
4420        u32 orig_offset;
4421        struct btrfs_disk_key disk_key;
4422
4423        leaf = path->nodes[0];
4424        BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
4425
4426        btrfs_set_path_blocking(path);
4427
4428        item = btrfs_item_nr(path->slots[0]);
4429        orig_offset = btrfs_item_offset(leaf, item);
4430        item_size = btrfs_item_size(leaf, item);
4431
4432        buf = kmalloc(item_size, GFP_NOFS);
4433        if (!buf)
4434                return -ENOMEM;
4435
4436        read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4437                            path->slots[0]), item_size);
4438
4439        slot = path->slots[0] + 1;
4440        nritems = btrfs_header_nritems(leaf);
4441        if (slot != nritems) {
4442                /* shift the items */
4443                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4444                                btrfs_item_nr_offset(slot),
4445                                (nritems - slot) * sizeof(struct btrfs_item));
4446        }
4447
4448        btrfs_cpu_key_to_disk(&disk_key, new_key);
4449        btrfs_set_item_key(leaf, &disk_key, slot);
4450
4451        new_item = btrfs_item_nr(slot);
4452
4453        btrfs_set_item_offset(leaf, new_item, orig_offset);
4454        btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4455
4456        btrfs_set_item_offset(leaf, item,
4457                              orig_offset + item_size - split_offset);
4458        btrfs_set_item_size(leaf, item, split_offset);
4459
4460        btrfs_set_header_nritems(leaf, nritems + 1);
4461
4462        /* write the data for the start of the original item */
4463        write_extent_buffer(leaf, buf,
4464                            btrfs_item_ptr_offset(leaf, path->slots[0]),
4465                            split_offset);
4466
4467        /* write the data for the new item */
4468        write_extent_buffer(leaf, buf + split_offset,
4469                            btrfs_item_ptr_offset(leaf, slot),
4470                            item_size - split_offset);
4471        btrfs_mark_buffer_dirty(leaf);
4472
4473        BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
4474        kfree(buf);
4475        return 0;
4476}
4477
4478/*
4479 * This function splits a single item into two items,
4480 * giving 'new_key' to the new item and splitting the
4481 * old one at split_offset (from the start of the item).
4482 *
4483 * The path may be released by this operation.  After
4484 * the split, the path is pointing to the old item.  The
4485 * new item is going to be in the same node as the old one.
4486 *
4487 * Note, the item being split must be smaller enough to live alone on
4488 * a tree block with room for one extra struct btrfs_item
4489 *
4490 * This allows us to split the item in place, keeping a lock on the
4491 * leaf the entire time.
4492 */
4493int btrfs_split_item(struct btrfs_trans_handle *trans,
4494                     struct btrfs_root *root,
4495                     struct btrfs_path *path,
4496                     struct btrfs_key *new_key,
4497                     unsigned long split_offset)
4498{
4499        int ret;
4500        ret = setup_leaf_for_split(trans, root, path,
4501                                   sizeof(struct btrfs_item));
4502        if (ret)
4503                return ret;
4504
4505        ret = split_item(trans, root, path, new_key, split_offset);
4506        return ret;
4507}
4508
4509/*
4510 * This function duplicate a item, giving 'new_key' to the new item.
4511 * It guarantees both items live in the same tree leaf and the new item
4512 * is contiguous with the original item.
4513 *
4514 * This allows us to split file extent in place, keeping a lock on the
4515 * leaf the entire time.
4516 */
4517int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4518                         struct btrfs_root *root,
4519                         struct btrfs_path *path,
4520                         struct btrfs_key *new_key)
4521{
4522        struct extent_buffer *leaf;
4523        int ret;
4524        u32 item_size;
4525
4526        leaf = path->nodes[0];
4527        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4528        ret = setup_leaf_for_split(trans, root, path,
4529                                   item_size + sizeof(struct btrfs_item));
4530        if (ret)
4531                return ret;
4532
4533        path->slots[0]++;
4534        setup_items_for_insert(root, path, new_key, &item_size,
4535                               item_size, item_size +
4536                               sizeof(struct btrfs_item), 1);
4537        leaf = path->nodes[0];
4538        memcpy_extent_buffer(leaf,
4539                             btrfs_item_ptr_offset(leaf, path->slots[0]),
4540                             btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4541                             item_size);
4542        return 0;
4543}
4544
4545/*
4546 * make the item pointed to by the path smaller.  new_size indicates
4547 * how small to make it, and from_end tells us if we just chop bytes
4548 * off the end of the item or if we shift the item to chop bytes off
4549 * the front.
4550 */
4551void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
4552                         u32 new_size, int from_end)
4553{
4554        int slot;
4555        struct extent_buffer *leaf;
4556        struct btrfs_item *item;
4557        u32 nritems;
4558        unsigned int data_end;
4559        unsigned int old_data_start;
4560        unsigned int old_size;
4561        unsigned int size_diff;
4562        int i;
4563        struct btrfs_map_token token;
4564
4565        btrfs_init_map_token(&token);
4566
4567        leaf = path->nodes[0];
4568        slot = path->slots[0];
4569
4570        old_size = btrfs_item_size_nr(leaf, slot);
4571        if (old_size == new_size)
4572                return;
4573
4574        nritems = btrfs_header_nritems(leaf);
4575        data_end = leaf_data_end(root, leaf);
4576
4577        old_data_start = btrfs_item_offset_nr(leaf, slot);
4578
4579        size_diff = old_size - new_size;
4580
4581        BUG_ON(slot < 0);
4582        BUG_ON(slot >= nritems);
4583
4584        /*
4585         * item0..itemN ... dataN.offset..dataN.size .. data0.size
4586         */
4587        /* first correct the data pointers */
4588        for (i = slot; i < nritems; i++) {
4589                u32 ioff;
4590                item = btrfs_item_nr(i);
4591
4592                ioff = btrfs_token_item_offset(leaf, item, &token);
4593                btrfs_set_token_item_offset(leaf, item,
4594                                            ioff + size_diff, &token);
4595        }
4596
4597        /* shift the data */
4598        if (from_end) {
4599                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4600                              data_end + size_diff, btrfs_leaf_data(leaf) +
4601                              data_end, old_data_start + new_size - data_end);
4602        } else {
4603                struct btrfs_disk_key disk_key;
4604                u64 offset;
4605
4606                btrfs_item_key(leaf, &disk_key, slot);
4607
4608                if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4609                        unsigned long ptr;
4610                        struct btrfs_file_extent_item *fi;
4611
4612                        fi = btrfs_item_ptr(leaf, slot,
4613                                            struct btrfs_file_extent_item);
4614                        fi = (struct btrfs_file_extent_item *)(
4615                             (unsigned long)fi - size_diff);
4616
4617                        if (btrfs_file_extent_type(leaf, fi) ==
4618                            BTRFS_FILE_EXTENT_INLINE) {
4619                                ptr = btrfs_item_ptr_offset(leaf, slot);
4620                                memmove_extent_buffer(leaf, ptr,
4621                                      (unsigned long)fi,
4622                                      BTRFS_FILE_EXTENT_INLINE_DATA_START);
4623                        }
4624                }
4625
4626                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4627                              data_end + size_diff, btrfs_leaf_data(leaf) +
4628                              data_end, old_data_start - data_end);
4629
4630                offset = btrfs_disk_key_offset(&disk_key);
4631                btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4632                btrfs_set_item_key(leaf, &disk_key, slot);
4633                if (slot == 0)
4634                        fixup_low_keys(root->fs_info, path, &disk_key, 1);
4635        }
4636
4637        item = btrfs_item_nr(slot);
4638        btrfs_set_item_size(leaf, item, new_size);
4639        btrfs_mark_buffer_dirty(leaf);
4640
4641        if (btrfs_leaf_free_space(root, leaf) < 0) {
4642                btrfs_print_leaf(root, leaf);
4643                BUG();
4644        }
4645}
4646
4647/*
4648 * make the item pointed to by the path bigger, data_size is the added size.
4649 */
4650void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
4651                       u32 data_size)
4652{
4653        int slot;
4654        struct extent_buffer *leaf;
4655        struct btrfs_item *item;
4656        u32 nritems;
4657        unsigned int data_end;
4658        unsigned int old_data;
4659        unsigned int old_size;
4660        int i;
4661        struct btrfs_map_token token;
4662
4663        btrfs_init_map_token(&token);
4664
4665        leaf = path->nodes[0];
4666
4667        nritems = btrfs_header_nritems(leaf);
4668        data_end = leaf_data_end(root, leaf);
4669
4670        if (btrfs_leaf_free_space(root, leaf) < data_size) {
4671                btrfs_print_leaf(root, leaf);
4672                BUG();
4673        }
4674        slot = path->slots[0];
4675        old_data = btrfs_item_end_nr(leaf, slot);
4676
4677        BUG_ON(slot < 0);
4678        if (slot >= nritems) {
4679                btrfs_print_leaf(root, leaf);
4680                btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
4681                       slot, nritems);
4682                BUG_ON(1);
4683        }
4684
4685        /*
4686         * item0..itemN ... dataN.offset..dataN.size .. data0.size
4687         */
4688        /* first correct the data pointers */
4689        for (i = slot; i < nritems; i++) {
4690                u32 ioff;
4691                item = btrfs_item_nr(i);
4692
4693                ioff = btrfs_token_item_offset(leaf, item, &token);
4694                btrfs_set_token_item_offset(leaf, item,
4695                                            ioff - data_size, &token);
4696        }
4697
4698        /* shift the data */
4699        memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4700                      data_end - data_size, btrfs_leaf_data(leaf) +
4701                      data_end, old_data - data_end);
4702
4703        data_end = old_data;
4704        old_size = btrfs_item_size_nr(leaf, slot);
4705        item = btrfs_item_nr(slot);
4706        btrfs_set_item_size(leaf, item, old_size + data_size);
4707        btrfs_mark_buffer_dirty(leaf);
4708
4709        if (btrfs_leaf_free_space(root, leaf) < 0) {
4710                btrfs_print_leaf(root, leaf);
4711                BUG();
4712        }
4713}
4714
4715/*
4716 * this is a helper for btrfs_insert_empty_items, the main goal here is
4717 * to save stack depth by doing the bulk of the work in a function
4718 * that doesn't call btrfs_search_slot
4719 */
4720void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4721                            struct btrfs_key *cpu_key, u32 *data_size,
4722                            u32 total_data, u32 total_size, int nr)
4723{
4724        struct btrfs_item *item;
4725        int i;
4726        u32 nritems;
4727        unsigned int data_end;
4728        struct btrfs_disk_key disk_key;
4729        struct extent_buffer *leaf;
4730        int slot;
4731        struct btrfs_map_token token;
4732
4733        if (path->slots[0] == 0) {
4734                btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4735                fixup_low_keys(root->fs_info, path, &disk_key, 1);
4736        }
4737        btrfs_unlock_up_safe(path, 1);
4738
4739        btrfs_init_map_token(&token);
4740
4741        leaf = path->nodes[0];
4742        slot = path->slots[0];
4743
4744        nritems = btrfs_header_nritems(leaf);
4745        data_end = leaf_data_end(root, leaf);
4746
4747        if (btrfs_leaf_free_space(root, leaf) < total_size) {
4748                btrfs_print_leaf(root, leaf);
4749                btrfs_crit(root->fs_info,
4750                           "not enough freespace need %u have %d",
4751                           total_size, btrfs_leaf_free_space(root, leaf));
4752                BUG();
4753        }
4754
4755        if (slot != nritems) {
4756                unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4757
4758                if (old_data < data_end) {
4759                        btrfs_print_leaf(root, leaf);
4760                        btrfs_crit(root->fs_info,
4761                                   "slot %d old_data %d data_end %d",
4762                                   slot, old_data, data_end);
4763                        BUG_ON(1);
4764                }
4765                /*
4766                 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4767                 */
4768                /* first correct the data pointers */
4769                for (i = slot; i < nritems; i++) {
4770                        u32 ioff;
4771
4772                        item = btrfs_item_nr(i);
4773                        ioff = btrfs_token_item_offset(leaf, item, &token);
4774                        btrfs_set_token_item_offset(leaf, item,
4775                                                    ioff - total_data, &token);
4776                }
4777                /* shift the items */
4778                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4779                              btrfs_item_nr_offset(slot),
4780                              (nritems - slot) * sizeof(struct btrfs_item));
4781
4782                /* shift the data */
4783                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4784                              data_end - total_data, btrfs_leaf_data(leaf) +
4785                              data_end, old_data - data_end);
4786                data_end = old_data;
4787        }
4788
4789        /* setup the item for the new data */
4790        for (i = 0; i < nr; i++) {
4791                btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4792                btrfs_set_item_key(leaf, &disk_key, slot + i);
4793                item = btrfs_item_nr(slot + i);
4794                btrfs_set_token_item_offset(leaf, item,
4795                                            data_end - data_size[i], &token);
4796                data_end -= data_size[i];
4797                btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4798        }
4799
4800        btrfs_set_header_nritems(leaf, nritems + nr);
4801        btrfs_mark_buffer_dirty(leaf);
4802
4803        if (btrfs_leaf_free_space(root, leaf) < 0) {
4804                btrfs_print_leaf(root, leaf);
4805                BUG();
4806        }
4807}
4808
4809/*
4810 * Given a key and some data, insert items into the tree.
4811 * This does all the path init required, making room in the tree if needed.
4812 */
4813int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4814                            struct btrfs_root *root,
4815                            struct btrfs_path *path,
4816                            struct btrfs_key *cpu_key, u32 *data_size,
4817                            int nr)
4818{
4819        int ret = 0;
4820        int slot;
4821        int i;
4822        u32 total_size = 0;
4823        u32 total_data = 0;
4824
4825        for (i = 0; i < nr; i++)
4826                total_data += data_size[i];
4827
4828        total_size = total_data + (nr * sizeof(struct btrfs_item));
4829        ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4830        if (ret == 0)
4831                return -EEXIST;
4832        if (ret < 0)
4833                return ret;
4834
4835        slot = path->slots[0];
4836        BUG_ON(slot < 0);
4837
4838        setup_items_for_insert(root, path, cpu_key, data_size,
4839                               total_data, total_size, nr);
4840        return 0;
4841}
4842
4843/*
4844 * Given a key and some data, insert an item into the tree.
4845 * This does all the path init required, making room in the tree if needed.
4846 */
4847int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4848                      *root, struct btrfs_key *cpu_key, void *data, u32
4849                      data_size)
4850{
4851        int ret = 0;
4852        struct btrfs_path *path;
4853        struct extent_buffer *leaf;
4854        unsigned long ptr;
4855
4856        path = btrfs_alloc_path();
4857        if (!path)
4858                return -ENOMEM;
4859        ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4860        if (!ret) {
4861                leaf = path->nodes[0];
4862                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4863                write_extent_buffer(leaf, data, ptr, data_size);
4864                btrfs_mark_buffer_dirty(leaf);
4865        }
4866        btrfs_free_path(path);
4867        return ret;
4868}
4869
4870/*
4871 * delete the pointer from a given node.
4872 *
4873 * the tree should have been previously balanced so the deletion does not
4874 * empty a node.
4875 */
4876static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4877                    int level, int slot)
4878{
4879        struct extent_buffer *parent = path->nodes[level];
4880        u32 nritems;
4881        int ret;
4882
4883        nritems = btrfs_header_nritems(parent);
4884        if (slot != nritems - 1) {
4885                if (level)
4886                        tree_mod_log_eb_move(root->fs_info, parent, slot,
4887                                             slot + 1, nritems - slot - 1);
4888                memmove_extent_buffer(parent,
4889                              btrfs_node_key_ptr_offset(slot),
4890                              btrfs_node_key_ptr_offset(slot + 1),
4891                              sizeof(struct btrfs_key_ptr) *
4892                              (nritems - slot - 1));
4893        } else if (level) {
4894                ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4895                                              MOD_LOG_KEY_REMOVE, GFP_NOFS);
4896                BUG_ON(ret < 0);
4897        }
4898
4899        nritems--;
4900        btrfs_set_header_nritems(parent, nritems);
4901        if (nritems == 0 && parent == root->node) {
4902                BUG_ON(btrfs_header_level(root->node) != 1);
4903                /* just turn the root into a leaf and break */
4904                btrfs_set_header_level(root->node, 0);
4905        } else if (slot == 0) {
4906                struct btrfs_disk_key disk_key;
4907
4908                btrfs_node_key(parent, &disk_key, 0);
4909                fixup_low_keys(root->fs_info, path, &disk_key, level + 1);
4910        }
4911        btrfs_mark_buffer_dirty(parent);
4912}
4913
4914/*
4915 * a helper function to delete the leaf pointed to by path->slots[1] and
4916 * path->nodes[1].
4917 *
4918 * This deletes the pointer in path->nodes[1] and frees the leaf
4919 * block extent.  zero is returned if it all worked out, < 0 otherwise.
4920 *
4921 * The path must have already been setup for deleting the leaf, including
4922 * all the proper balancing.  path->nodes[1] must be locked.
4923 */
4924static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4925                                    struct btrfs_root *root,
4926                                    struct btrfs_path *path,
4927                                    struct extent_buffer *leaf)
4928{
4929        WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4930        del_ptr(root, path, 1, path->slots[1]);
4931
4932        /*
4933         * btrfs_free_extent is expensive, we want to make sure we
4934         * aren't holding any locks when we call it
4935         */
4936        btrfs_unlock_up_safe(path, 0);
4937
4938        root_sub_used(root, leaf->len);
4939
4940        extent_buffer_get(leaf);
4941        btrfs_free_tree_block(trans, root, leaf, 0, 1);
4942        free_extent_buffer_stale(leaf);
4943}
4944/*
4945 * delete the item at the leaf level in path.  If that empties
4946 * the leaf, remove it from the tree
4947 */
4948int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4949                    struct btrfs_path *path, int slot, int nr)
4950{
4951        struct extent_buffer *leaf;
4952        struct btrfs_item *item;
4953        u32 last_off;
4954        u32 dsize = 0;
4955        int ret = 0;
4956        int wret;
4957        int i;
4958        u32 nritems;
4959        struct btrfs_map_token token;
4960
4961        btrfs_init_map_token(&token);
4962
4963        leaf = path->nodes[0];
4964        last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4965
4966        for (i = 0; i < nr; i++)
4967                dsize += btrfs_item_size_nr(leaf, slot + i);
4968
4969        nritems = btrfs_header_nritems(leaf);
4970
4971        if (slot + nr != nritems) {
4972                int data_end = leaf_data_end(root, leaf);
4973
4974                memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
4975                              data_end + dsize,
4976                              btrfs_leaf_data(leaf) + data_end,
4977                              last_off - data_end);
4978
4979                for (i = slot + nr; i < nritems; i++) {
4980                        u32 ioff;
4981
4982                        item = btrfs_item_nr(i);
4983                        ioff = btrfs_token_item_offset(leaf, item, &token);
4984                        btrfs_set_token_item_offset(leaf, item,
4985                                                    ioff + dsize, &token);
4986                }
4987
4988                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4989                              btrfs_item_nr_offset(slot + nr),
4990                              sizeof(struct btrfs_item) *
4991                              (nritems - slot - nr));
4992        }
4993        btrfs_set_header_nritems(leaf, nritems - nr);
4994        nritems -= nr;
4995
4996        /* delete the leaf if we've emptied it */
4997        if (nritems == 0) {
4998                if (leaf == root->node) {
4999                        btrfs_set_header_level(leaf, 0);
5000                } else {
5001                        btrfs_set_path_blocking(path);
5002                        clean_tree_block(trans, root->fs_info, leaf);
5003                        btrfs_del_leaf(trans, root, path, leaf);
5004                }
5005        } else {
5006                int used = leaf_space_used(leaf, 0, nritems);
5007                if (slot == 0) {
5008                        struct btrfs_disk_key disk_key;
5009
5010                        btrfs_item_key(leaf, &disk_key, 0);
5011                        fixup_low_keys(root->fs_info, path, &disk_key, 1);
5012                }
5013
5014                /* delete the leaf if it is mostly empty */
5015                if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
5016                        /* push_leaf_left fixes the path.
5017                         * make sure the path still points to our leaf
5018                         * for possible call to del_ptr below
5019                         */
5020                        slot = path->slots[1];
5021                        extent_buffer_get(leaf);
5022
5023                        btrfs_set_path_blocking(path);
5024                        wret = push_leaf_left(trans, root, path, 1, 1,
5025                                              1, (u32)-1);
5026                        if (wret < 0 && wret != -ENOSPC)
5027                                ret = wret;
5028
5029                        if (path->nodes[0] == leaf &&
5030                            btrfs_header_nritems(leaf)) {
5031                                wret = push_leaf_right(trans, root, path, 1,
5032                                                       1, 1, 0);
5033                                if (wret < 0 && wret != -ENOSPC)
5034                                        ret = wret;
5035                        }
5036
5037                        if (btrfs_header_nritems(leaf) == 0) {
5038                                path->slots[1] = slot;
5039                                btrfs_del_leaf(trans, root, path, leaf);
5040                                free_extent_buffer(leaf);
5041                                ret = 0;
5042                        } else {
5043                                /* if we're still in the path, make sure
5044                                 * we're dirty.  Otherwise, one of the
5045                                 * push_leaf functions must have already
5046                                 * dirtied this buffer
5047                                 */
5048                                if (path->nodes[0] == leaf)
5049                                        btrfs_mark_buffer_dirty(leaf);
5050                                free_extent_buffer(leaf);
5051                        }
5052                } else {
5053                        btrfs_mark_buffer_dirty(leaf);
5054                }
5055        }
5056        return ret;
5057}
5058
5059/*
5060 * search the tree again to find a leaf with lesser keys
5061 * returns 0 if it found something or 1 if there are no lesser leaves.
5062 * returns < 0 on io errors.
5063 *
5064 * This may release the path, and so you may lose any locks held at the
5065 * time you call it.
5066 */
5067int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5068{
5069        struct btrfs_key key;
5070        struct btrfs_disk_key found_key;
5071        int ret;
5072
5073        btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5074
5075        if (key.offset > 0) {
5076                key.offset--;
5077        } else if (key.type > 0) {
5078                key.type--;
5079                key.offset = (u64)-1;
5080        } else if (key.objectid > 0) {
5081                key.objectid--;
5082                key.type = (u8)-1;
5083                key.offset = (u64)-1;
5084        } else {
5085                return 1;
5086        }
5087
5088        btrfs_release_path(path);
5089        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5090        if (ret < 0)
5091                return ret;
5092        btrfs_item_key(path->nodes[0], &found_key, 0);
5093        ret = comp_keys(&found_key, &key);
5094        /*
5095         * We might have had an item with the previous key in the tree right
5096         * before we released our path. And after we released our path, that
5097         * item might have been pushed to the first slot (0) of the leaf we
5098         * were holding due to a tree balance. Alternatively, an item with the
5099         * previous key can exist as the only element of a leaf (big fat item).
5100         * Therefore account for these 2 cases, so that our callers (like
5101         * btrfs_previous_item) don't miss an existing item with a key matching
5102         * the previous key we computed above.
5103         */
5104        if (ret <= 0)
5105                return 0;
5106        return 1;
5107}
5108
5109/*
5110 * A helper function to walk down the tree starting at min_key, and looking
5111 * for nodes or leaves that are have a minimum transaction id.
5112 * This is used by the btree defrag code, and tree logging
5113 *
5114 * This does not cow, but it does stuff the starting key it finds back
5115 * into min_key, so you can call btrfs_search_slot with cow=1 on the
5116 * key and get a writable path.
5117 *
5118 * This does lock as it descends, and path->keep_locks should be set
5119 * to 1 by the caller.
5120 *
5121 * This honors path->lowest_level to prevent descent past a given level
5122 * of the tree.
5123 *
5124 * min_trans indicates the oldest transaction that you are interested
5125 * in walking through.  Any nodes or leaves older than min_trans are
5126 * skipped over (without reading them).
5127 *
5128 * returns zero if something useful was found, < 0 on error and 1 if there
5129 * was nothing in the tree that matched the search criteria.
5130 */
5131int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5132                         struct btrfs_path *path,
5133                         u64 min_trans)
5134{
5135        struct extent_buffer *cur;
5136        struct btrfs_key found_key;
5137        int slot;
5138        int sret;
5139        u32 nritems;
5140        int level;
5141        int ret = 1;
5142        int keep_locks = path->keep_locks;
5143
5144        path->keep_locks = 1;
5145again:
5146        cur = btrfs_read_lock_root_node(root);
5147        level = btrfs_header_level(cur);
5148        WARN_ON(path->nodes[level]);
5149        path->nodes[level] = cur;
5150        path->locks[level] = BTRFS_READ_LOCK;
5151
5152        if (btrfs_header_generation(cur) < min_trans) {
5153                ret = 1;
5154                goto out;
5155        }
5156        while (1) {
5157                nritems = btrfs_header_nritems(cur);
5158                level = btrfs_header_level(cur);
5159                sret = bin_search(cur, min_key, level, &slot);
5160
5161                /* at the lowest level, we're done, setup the path and exit */
5162                if (level == path->lowest_level) {
5163                        if (slot >= nritems)
5164                                goto find_next_key;
5165                        ret = 0;
5166                        path->slots[level] = slot;
5167                        btrfs_item_key_to_cpu(cur, &found_key, slot);
5168                        goto out;
5169                }
5170                if (sret && slot > 0)
5171                        slot--;
5172                /*
5173                 * check this node pointer against the min_trans parameters.
5174                 * If it is too old, old, skip to the next one.
5175                 */
5176                while (slot < nritems) {
5177                        u64 gen;
5178
5179                        gen = btrfs_node_ptr_generation(cur, slot);
5180                        if (gen < min_trans) {
5181                                slot++;
5182                                continue;
5183                        }
5184                        break;
5185                }
5186find_next_key:
5187                /*
5188                 * we didn't find a candidate key in this node, walk forward
5189                 * and find another one
5190                 */
5191                if (slot >= nritems) {
5192                        path->slots[level] = slot;
5193                        btrfs_set_path_blocking(path);
5194                        sret = btrfs_find_next_key(root, path, min_key, level,
5195                                                  min_trans);
5196                        if (sret == 0) {
5197                                btrfs_release_path(path);
5198                                goto again;
5199                        } else {
5200                                goto out;
5201                        }
5202                }
5203                /* save our key for returning back */
5204                btrfs_node_key_to_cpu(cur, &found_key, slot);
5205                path->slots[level] = slot;
5206                if (level == path->lowest_level) {
5207                        ret = 0;
5208                        goto out;
5209                }
5210                btrfs_set_path_blocking(path);
5211                cur = read_node_slot(root, cur, slot);
5212                if (IS_ERR(cur)) {
5213                        ret = PTR_ERR(cur);
5214                        goto out;
5215                }
5216
5217                btrfs_tree_read_lock(cur);
5218
5219                path->locks[level - 1] = BTRFS_READ_LOCK;
5220                path->nodes[level - 1] = cur;
5221                unlock_up(path, level, 1, 0, NULL);
5222                btrfs_clear_path_blocking(path, NULL, 0);
5223        }
5224out:
5225        path->keep_locks = keep_locks;
5226        if (ret == 0) {
5227                btrfs_unlock_up_safe(path, path->lowest_level + 1);
5228                btrfs_set_path_blocking(path);
5229                memcpy(min_key, &found_key, sizeof(found_key));
5230        }
5231        return ret;
5232}
5233
5234static int tree_move_down(struct btrfs_root *root,
5235                           struct btrfs_path *path,
5236                           int *level, int root_level)
5237{
5238        struct extent_buffer *eb;
5239
5240        BUG_ON(*level == 0);
5241        eb = read_node_slot(root, path->nodes[*level], path->slots[*level]);
5242        if (IS_ERR(eb))
5243                return PTR_ERR(eb);
5244
5245        path->nodes[*level - 1] = eb;
5246        path->slots[*level - 1] = 0;
5247        (*level)--;
5248        return 0;
5249}
5250
5251static int tree_move_next_or_upnext(struct btrfs_root *root,
5252                                    struct btrfs_path *path,
5253                                    int *level, int root_level)
5254{
5255        int ret = 0;
5256        int nritems;
5257        nritems = btrfs_header_nritems(path->nodes[*level]);
5258
5259        path->slots[*level]++;
5260
5261        while (path->slots[*level] >= nritems) {
5262                if (*level == root_level)
5263                        return -1;
5264
5265                /* move upnext */
5266                path->slots[*level] = 0;
5267                free_extent_buffer(path->nodes[*level]);
5268                path->nodes[*level] = NULL;
5269                (*level)++;
5270                path->slots[*level]++;
5271
5272                nritems = btrfs_header_nritems(path->nodes[*level]);
5273                ret = 1;
5274        }
5275        return ret;
5276}
5277
5278/*
5279 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5280 * or down.
5281 */
5282static int tree_advance(struct btrfs_root *root,
5283                        struct btrfs_path *path,
5284                        int *level, int root_level,
5285                        int allow_down,
5286                        struct btrfs_key *key)
5287{
5288        int ret;
5289
5290        if (*level == 0 || !allow_down) {
5291                ret = tree_move_next_or_upnext(root, path, level, root_level);
5292        } else {
5293                ret = tree_move_down(root, path, level, root_level);
5294        }
5295        if (ret >= 0) {
5296                if (*level == 0)
5297                        btrfs_item_key_to_cpu(path->nodes[*level], key,
5298                                        path->slots[*level]);
5299                else
5300                        btrfs_node_key_to_cpu(path->nodes[*level], key,
5301                                        path->slots[*level]);
5302        }
5303        return ret;
5304}
5305
5306static int tree_compare_item(struct btrfs_root *left_root,
5307                             struct btrfs_path *left_path,
5308                             struct btrfs_path *right_path,
5309                             char *tmp_buf)
5310{
5311        int cmp;
5312        int len1, len2;
5313        unsigned long off1, off2;
5314
5315        len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5316        len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5317        if (len1 != len2)
5318                return 1;
5319
5320        off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5321        off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5322                                right_path->slots[0]);
5323
5324        read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5325
5326        cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5327        if (cmp)
5328                return 1;
5329        return 0;
5330}
5331
5332#define ADVANCE 1
5333#define ADVANCE_ONLY_NEXT -1
5334
5335/*
5336 * This function compares two trees and calls the provided callback for
5337 * every changed/new/deleted item it finds.
5338 * If shared tree blocks are encountered, whole subtrees are skipped, making
5339 * the compare pretty fast on snapshotted subvolumes.
5340 *
5341 * This currently works on commit roots only. As commit roots are read only,
5342 * we don't do any locking. The commit roots are protected with transactions.
5343 * Transactions are ended and rejoined when a commit is tried in between.
5344 *
5345 * This function checks for modifications done to the trees while comparing.
5346 * If it detects a change, it aborts immediately.
5347 */
5348int btrfs_compare_trees(struct btrfs_root *left_root,
5349                        struct btrfs_root *right_root,
5350                        btrfs_changed_cb_t changed_cb, void *ctx)
5351{
5352        int ret;
5353        int cmp;
5354        struct btrfs_path *left_path = NULL;
5355        struct btrfs_path *right_path = NULL;
5356        struct btrfs_key left_key;
5357        struct btrfs_key right_key;
5358        char *tmp_buf = NULL;
5359        int left_root_level;
5360        int right_root_level;
5361        int left_level;
5362        int right_level;
5363        int left_end_reached;
5364        int right_end_reached;
5365        int advance_left;
5366        int advance_right;
5367        u64 left_blockptr;
5368        u64 right_blockptr;
5369        u64 left_gen;
5370        u64 right_gen;
5371
5372        left_path = btrfs_alloc_path();
5373        if (!left_path) {
5374                ret = -ENOMEM;
5375                goto out;
5376        }
5377        right_path = btrfs_alloc_path();
5378        if (!right_path) {
5379                ret = -ENOMEM;
5380                goto out;
5381        }
5382
5383        tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL | __GFP_NOWARN);
5384        if (!tmp_buf) {
5385                tmp_buf = vmalloc(left_root->nodesize);
5386                if (!tmp_buf) {
5387                        ret = -ENOMEM;
5388                        goto out;
5389                }
5390        }
5391
5392        left_path->search_commit_root = 1;
5393        left_path->skip_locking = 1;
5394        right_path->search_commit_root = 1;
5395        right_path->skip_locking = 1;
5396
5397        /*
5398         * Strategy: Go to the first items of both trees. Then do
5399         *
5400         * If both trees are at level 0
5401         *   Compare keys of current items
5402         *     If left < right treat left item as new, advance left tree
5403         *       and repeat
5404         *     If left > right treat right item as deleted, advance right tree
5405         *       and repeat
5406         *     If left == right do deep compare of items, treat as changed if
5407         *       needed, advance both trees and repeat
5408         * If both trees are at the same level but not at level 0
5409         *   Compare keys of current nodes/leafs
5410         *     If left < right advance left tree and repeat
5411         *     If left > right advance right tree and repeat
5412         *     If left == right compare blockptrs of the next nodes/leafs
5413         *       If they match advance both trees but stay at the same level
5414         *         and repeat
5415         *       If they don't match advance both trees while allowing to go
5416         *         deeper and repeat
5417         * If tree levels are different
5418         *   Advance the tree that needs it and repeat
5419         *
5420         * Advancing a tree means:
5421         *   If we are at level 0, try to go to the next slot. If that's not
5422         *   possible, go one level up and repeat. Stop when we found a level
5423         *   where we could go to the next slot. We may at this point be on a
5424         *   node or a leaf.
5425         *
5426         *   If we are not at level 0 and not on shared tree blocks, go one
5427         *   level deeper.
5428         *
5429         *   If we are not at level 0 and on shared tree blocks, go one slot to
5430         *   the right if possible or go up and right.
5431         */
5432
5433        down_read(&left_root->fs_info->commit_root_sem);
5434        left_level = btrfs_header_level(left_root->commit_root);
5435        left_root_level = left_level;
5436        left_path->nodes[left_level] = left_root->commit_root;
5437        extent_buffer_get(left_path->nodes[left_level]);
5438
5439        right_level = btrfs_header_level(right_root->commit_root);
5440        right_root_level = right_level;
5441        right_path->nodes[right_level] = right_root->commit_root;
5442        extent_buffer_get(right_path->nodes[right_level]);
5443        up_read(&left_root->fs_info->commit_root_sem);
5444
5445        if (left_level == 0)
5446                btrfs_item_key_to_cpu(left_path->nodes[left_level],
5447                                &left_key, left_path->slots[left_level]);
5448        else
5449                btrfs_node_key_to_cpu(left_path->nodes[left_level],
5450                                &left_key, left_path->slots[left_level]);
5451        if (right_level == 0)
5452                btrfs_item_key_to_cpu(right_path->nodes[right_level],
5453                                &right_key, right_path->slots[right_level]);
5454        else
5455                btrfs_node_key_to_cpu(right_path->nodes[right_level],
5456                                &right_key, right_path->slots[right_level]);
5457
5458        left_end_reached = right_end_reached = 0;
5459        advance_left = advance_right = 0;
5460
5461        while (1) {
5462                if (advance_left && !left_end_reached) {
5463                        ret = tree_advance(left_root, left_path, &left_level,
5464                                        left_root_level,
5465                                        advance_left != ADVANCE_ONLY_NEXT,
5466                                        &left_key);
5467                        if (ret == -1)
5468                                left_end_reached = ADVANCE;
5469                        else if (ret < 0)
5470                                goto out;
5471                        advance_left = 0;
5472                }
5473                if (advance_right && !right_end_reached) {
5474                        ret = tree_advance(right_root, right_path, &right_level,
5475                                        right_root_level,
5476                                        advance_right != ADVANCE_ONLY_NEXT,
5477                                        &right_key);
5478                        if (ret == -1)
5479                                right_end_reached = ADVANCE;
5480                        else if (ret < 0)
5481                                goto out;
5482                        advance_right = 0;
5483                }
5484
5485                if (left_end_reached && right_end_reached) {
5486                        ret = 0;
5487                        goto out;
5488                } else if (left_end_reached) {
5489                        if (right_level == 0) {
5490                                ret = changed_cb(left_root, right_root,
5491                                                left_path, right_path,
5492                                                &right_key,
5493                                                BTRFS_COMPARE_TREE_DELETED,
5494                                                ctx);
5495                                if (ret < 0)
5496                                        goto out;
5497                        }
5498                        advance_right = ADVANCE;
5499                        continue;
5500                } else if (right_end_reached) {
5501                        if (left_level == 0) {
5502                                ret = changed_cb(left_root, right_root,
5503                                                left_path, right_path,
5504                                                &left_key,
5505                                                BTRFS_COMPARE_TREE_NEW,
5506                                                ctx);
5507                                if (ret < 0)
5508                                        goto out;
5509                        }
5510                        advance_left = ADVANCE;
5511                        continue;
5512                }
5513
5514                if (left_level == 0 && right_level == 0) {
5515                        cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5516                        if (cmp < 0) {
5517                                ret = changed_cb(left_root, right_root,
5518                                                left_path, right_path,
5519                                                &left_key,
5520                                                BTRFS_COMPARE_TREE_NEW,
5521                                                ctx);
5522                                if (ret < 0)
5523                                        goto out;
5524                                advance_left = ADVANCE;
5525                        } else if (cmp > 0) {
5526                                ret = changed_cb(left_root, right_root,
5527                                                left_path, right_path,
5528                                                &right_key,
5529                                                BTRFS_COMPARE_TREE_DELETED,
5530                                                ctx);
5531                                if (ret < 0)
5532                                        goto out;
5533                                advance_right = ADVANCE;
5534                        } else {
5535                                enum btrfs_compare_tree_result result;
5536
5537                                WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5538                                ret = tree_compare_item(left_root, left_path,
5539                                                right_path, tmp_buf);
5540                                if (ret)
5541                                        result = BTRFS_COMPARE_TREE_CHANGED;
5542                                else
5543                                        result = BTRFS_COMPARE_TREE_SAME;
5544                                ret = changed_cb(left_root, right_root,
5545                                                 left_path, right_path,
5546                                                 &left_key, result, ctx);
5547                                if (ret < 0)
5548                                        goto out;
5549                                advance_left = ADVANCE;
5550                                advance_right = ADVANCE;
5551                        }
5552                } else if (left_level == right_level) {
5553                        cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5554                        if (cmp < 0) {
5555                                advance_left = ADVANCE;
5556                        } else if (cmp > 0) {
5557                                advance_right = ADVANCE;
5558                        } else {
5559                                left_blockptr = btrfs_node_blockptr(
5560                                                left_path->nodes[left_level],
5561                                                left_path->slots[left_level]);
5562                                right_blockptr = btrfs_node_blockptr(
5563                                                right_path->nodes[right_level],
5564                                                right_path->slots[right_level]);
5565                                left_gen = btrfs_node_ptr_generation(
5566                                                left_path->nodes[left_level],
5567                                                left_path->slots[left_level]);
5568                                right_gen = btrfs_node_ptr_generation(
5569                                                right_path->nodes[right_level],
5570                                                right_path->slots[right_level]);
5571                                if (left_blockptr == right_blockptr &&
5572                                    left_gen == right_gen) {
5573                                        /*
5574                                         * As we're on a shared block, don't
5575                                         * allow to go deeper.
5576                                         */
5577                                        advance_left = ADVANCE_ONLY_NEXT;
5578                                        advance_right = ADVANCE_ONLY_NEXT;
5579                                } else {
5580                                        advance_left = ADVANCE;
5581                                        advance_right = ADVANCE;
5582                                }
5583                        }
5584                } else if (left_level < right_level) {
5585                        advance_right = ADVANCE;
5586                } else {
5587                        advance_left = ADVANCE;
5588                }
5589        }
5590
5591out:
5592        btrfs_free_path(left_path);
5593        btrfs_free_path(right_path);
5594        kvfree(tmp_buf);
5595        return ret;
5596}
5597
5598/*
5599 * this is similar to btrfs_next_leaf, but does not try to preserve
5600 * and fixup the path.  It looks for and returns the next key in the
5601 * tree based on the current path and the min_trans parameters.
5602 *
5603 * 0 is returned if another key is found, < 0 if there are any errors
5604 * and 1 is returned if there are no higher keys in the tree
5605 *
5606 * path->keep_locks should be set to 1 on the search made before
5607 * calling this function.
5608 */
5609int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5610                        struct btrfs_key *key, int level, u64 min_trans)
5611{
5612        int slot;
5613        struct extent_buffer *c;
5614
5615        WARN_ON(!path->keep_locks);
5616        while (level < BTRFS_MAX_LEVEL) {
5617                if (!path->nodes[level])
5618                        return 1;
5619
5620                slot = path->slots[level] + 1;
5621                c = path->nodes[level];
5622next:
5623                if (slot >= btrfs_header_nritems(c)) {
5624                        int ret;
5625                        int orig_lowest;
5626                        struct btrfs_key cur_key;
5627                        if (level + 1 >= BTRFS_MAX_LEVEL ||
5628                            !path->nodes[level + 1])
5629                                return 1;
5630
5631                        if (path->locks[level + 1]) {
5632                                level++;
5633                                continue;
5634                        }
5635
5636                        slot = btrfs_header_nritems(c) - 1;
5637                        if (level == 0)
5638                                btrfs_item_key_to_cpu(c, &cur_key, slot);
5639                        else
5640                                btrfs_node_key_to_cpu(c, &cur_key, slot);
5641
5642                        orig_lowest = path->lowest_level;
5643                        btrfs_release_path(path);
5644                        path->lowest_level = level;
5645                        ret = btrfs_search_slot(NULL, root, &cur_key, path,
5646                                                0, 0);
5647                        path->lowest_level = orig_lowest;
5648                        if (ret < 0)
5649                                return ret;
5650
5651                        c = path->nodes[level];
5652                        slot = path->slots[level];
5653                        if (ret == 0)
5654                                slot++;
5655                        goto next;
5656                }
5657
5658                if (level == 0)
5659                        btrfs_item_key_to_cpu(c, key, slot);
5660                else {
5661                        u64 gen = btrfs_node_ptr_generation(c, slot);
5662
5663                        if (gen < min_trans) {
5664                                slot++;
5665                                goto next;
5666                        }
5667                        btrfs_node_key_to_cpu(c, key, slot);
5668                }
5669                return 0;
5670        }
5671        return 1;
5672}
5673
5674/*
5675 * search the tree again to find a leaf with greater keys
5676 * returns 0 if it found something or 1 if there are no greater leaves.
5677 * returns < 0 on io errors.
5678 */
5679int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5680{
5681        return btrfs_next_old_leaf(root, path, 0);
5682}
5683
5684int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5685                        u64 time_seq)
5686{
5687        int slot;
5688        int level;
5689        struct extent_buffer *c;
5690        struct extent_buffer *next;
5691        struct btrfs_key key;
5692        u32 nritems;
5693        int ret;
5694        int old_spinning = path->leave_spinning;
5695        int next_rw_lock = 0;
5696
5697        nritems = btrfs_header_nritems(path->nodes[0]);
5698        if (nritems == 0)
5699                return 1;
5700
5701        btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5702again:
5703        level = 1;
5704        next = NULL;
5705        next_rw_lock = 0;
5706        btrfs_release_path(path);
5707
5708        path->keep_locks = 1;
5709        path->leave_spinning = 1;
5710
5711        if (time_seq)
5712                ret = btrfs_search_old_slot(root, &key, path, time_seq);
5713        else
5714                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5715        path->keep_locks = 0;
5716
5717        if (ret < 0)
5718                return ret;
5719
5720        nritems = btrfs_header_nritems(path->nodes[0]);
5721        /*
5722         * by releasing the path above we dropped all our locks.  A balance
5723         * could have added more items next to the key that used to be
5724         * at the very end of the block.  So, check again here and
5725         * advance the path if there are now more items available.
5726         */
5727        if (nritems > 0 && path->slots[0] < nritems - 1) {
5728                if (ret == 0)
5729                        path->slots[0]++;
5730                ret = 0;
5731                goto done;
5732        }
5733        /*
5734         * So the above check misses one case:
5735         * - after releasing the path above, someone has removed the item that
5736         *   used to be at the very end of the block, and balance between leafs
5737         *   gets another one with bigger key.offset to replace it.
5738         *
5739         * This one should be returned as well, or we can get leaf corruption
5740         * later(esp. in __btrfs_drop_extents()).
5741         *
5742         * And a bit more explanation about this check,
5743         * with ret > 0, the key isn't found, the path points to the slot
5744         * where it should be inserted, so the path->slots[0] item must be the
5745         * bigger one.
5746         */
5747        if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
5748                ret = 0;
5749                goto done;
5750        }
5751
5752        while (level < BTRFS_MAX_LEVEL) {
5753                if (!path->nodes[level]) {
5754                        ret = 1;
5755                        goto done;
5756                }
5757
5758                slot = path->slots[level] + 1;
5759                c = path->nodes[level];
5760                if (slot >= btrfs_header_nritems(c)) {
5761                        level++;
5762                        if (level == BTRFS_MAX_LEVEL) {
5763                                ret = 1;
5764                                goto done;
5765                        }
5766                        continue;
5767                }
5768
5769                if (next) {
5770                        btrfs_tree_unlock_rw(next, next_rw_lock);
5771                        free_extent_buffer(next);
5772                }
5773
5774                next = c;
5775                next_rw_lock = path->locks[level];
5776                ret = read_block_for_search(NULL, root, path, &next, level,
5777                                            slot, &key, 0);
5778                if (ret == -EAGAIN)
5779                        goto again;
5780
5781                if (ret < 0) {
5782                        btrfs_release_path(path);
5783                        goto done;
5784                }
5785
5786                if (!path->skip_locking) {
5787                        ret = btrfs_try_tree_read_lock(next);
5788                        if (!ret && time_seq) {
5789                                /*
5790                                 * If we don't get the lock, we may be racing
5791                                 * with push_leaf_left, holding that lock while
5792                                 * itself waiting for the leaf we've currently
5793                                 * locked. To solve this situation, we give up
5794                                 * on our lock and cycle.
5795                                 */
5796                                free_extent_buffer(next);
5797                                btrfs_release_path(path);
5798                                cond_resched();
5799                                goto again;
5800                        }
5801                        if (!ret) {
5802                                btrfs_set_path_blocking(path);
5803                                btrfs_tree_read_lock(next);
5804                                btrfs_clear_path_blocking(path, next,
5805                                                          BTRFS_READ_LOCK);
5806                        }
5807                        next_rw_lock = BTRFS_READ_LOCK;
5808                }
5809                break;
5810        }
5811        path->slots[level] = slot;
5812        while (1) {
5813                level--;
5814                c = path->nodes[level];
5815                if (path->locks[level])
5816                        btrfs_tree_unlock_rw(c, path->locks[level]);
5817
5818                free_extent_buffer(c);
5819                path->nodes[level] = next;
5820                path->slots[level] = 0;
5821                if (!path->skip_locking)
5822                        path->locks[level] = next_rw_lock;
5823                if (!level)
5824                        break;
5825
5826                ret = read_block_for_search(NULL, root, path, &next, level,
5827                                            0, &key, 0);
5828                if (ret == -EAGAIN)
5829                        goto again;
5830
5831                if (ret < 0) {
5832                        btrfs_release_path(path);
5833                        goto done;
5834                }
5835
5836                if (!path->skip_locking) {
5837                        ret = btrfs_try_tree_read_lock(next);
5838                        if (!ret) {
5839                                btrfs_set_path_blocking(path);
5840                                btrfs_tree_read_lock(next);
5841                                btrfs_clear_path_blocking(path, next,
5842                                                          BTRFS_READ_LOCK);
5843                        }
5844                        next_rw_lock = BTRFS_READ_LOCK;
5845                }
5846        }
5847        ret = 0;
5848done:
5849        unlock_up(path, 0, 1, 0, NULL);
5850        path->leave_spinning = old_spinning;
5851        if (!old_spinning)
5852                btrfs_set_path_blocking(path);
5853
5854        return ret;
5855}
5856
5857/*
5858 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5859 * searching until it gets past min_objectid or finds an item of 'type'
5860 *
5861 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5862 */
5863int btrfs_previous_item(struct btrfs_root *root,
5864                        struct btrfs_path *path, u64 min_objectid,
5865                        int type)
5866{
5867        struct btrfs_key found_key;
5868        struct extent_buffer *leaf;
5869        u32 nritems;
5870        int ret;
5871
5872        while (1) {
5873                if (path->slots[0] == 0) {
5874                        btrfs_set_path_blocking(path);
5875                        ret = btrfs_prev_leaf(root, path);
5876                        if (ret != 0)
5877                                return ret;
5878                } else {
5879                        path->slots[0]--;
5880                }
5881                leaf = path->nodes[0];
5882                nritems = btrfs_header_nritems(leaf);
5883                if (nritems == 0)
5884                        return 1;
5885                if (path->slots[0] == nritems)
5886                        path->slots[0]--;
5887
5888                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5889                if (found_key.objectid < min_objectid)
5890                        break;
5891                if (found_key.type == type)
5892                        return 0;
5893                if (found_key.objectid == min_objectid &&
5894                    found_key.type < type)
5895                        break;
5896        }
5897        return 1;
5898}
5899
5900/*
5901 * search in extent tree to find a previous Metadata/Data extent item with
5902 * min objecitd.
5903 *
5904 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5905 */
5906int btrfs_previous_extent_item(struct btrfs_root *root,
5907                        struct btrfs_path *path, u64 min_objectid)
5908{
5909        struct btrfs_key found_key;
5910        struct extent_buffer *leaf;
5911        u32 nritems;
5912        int ret;
5913
5914        while (1) {
5915                if (path->slots[0] == 0) {
5916                        btrfs_set_path_blocking(path);
5917                        ret = btrfs_prev_leaf(root, path);
5918                        if (ret != 0)
5919                                return ret;
5920                } else {
5921                        path->slots[0]--;
5922                }
5923                leaf = path->nodes[0];
5924                nritems = btrfs_header_nritems(leaf);
5925                if (nritems == 0)
5926                        return 1;
5927                if (path->slots[0] == nritems)
5928                        path->slots[0]--;
5929
5930                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5931                if (found_key.objectid < min_objectid)
5932                        break;
5933                if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5934                    found_key.type == BTRFS_METADATA_ITEM_KEY)
5935                        return 0;
5936                if (found_key.objectid == min_objectid &&
5937                    found_key.type < BTRFS_EXTENT_ITEM_KEY)
5938                        break;
5939        }
5940        return 1;
5941}
5942