/* linux/fs/btrfs/ctree.c */
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
   4 */
   5
   6#include <linux/sched.h>
   7#include <linux/slab.h>
   8#include <linux/rbtree.h>
   9#include <linux/mm.h>
  10#include "ctree.h"
  11#include "disk-io.h"
  12#include "transaction.h"
  13#include "print-tree.h"
  14#include "locking.h"
  15
/*
 * Forward declarations for the node/leaf balancing helpers defined later
 * in this file.
 */
/* Split an internal node at @level of @path into two. */
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
                      *root, struct btrfs_path *path, int level);
/* Split a leaf so that at least @data_size bytes fit for @ins_key. */
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      const struct btrfs_key *ins_key, struct btrfs_path *path,
                      int data_size, int extend);
/* Move items from @src into its left sibling @dst. */
static int push_node_left(struct btrfs_trans_handle *trans,
                          struct btrfs_fs_info *fs_info,
                          struct extent_buffer *dst,
                          struct extent_buffer *src, int empty);
/* Move items from @src_buf into its right sibling @dst_buf. */
static int balance_node_right(struct btrfs_trans_handle *trans,
                              struct btrfs_fs_info *fs_info,
                              struct extent_buffer *dst_buf,
                              struct extent_buffer *src_buf);
/* Remove the pointer in @slot at @level of @path. */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
                    int level, int slot);
  31
  32struct btrfs_path *btrfs_alloc_path(void)
  33{
  34        return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
  35}
  36
  37/*
  38 * set all locked nodes in the path to blocking locks.  This should
  39 * be done before scheduling
  40 */
  41noinline void btrfs_set_path_blocking(struct btrfs_path *p)
  42{
  43        int i;
  44        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
  45                if (!p->nodes[i] || !p->locks[i])
  46                        continue;
  47                btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
  48                if (p->locks[i] == BTRFS_READ_LOCK)
  49                        p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
  50                else if (p->locks[i] == BTRFS_WRITE_LOCK)
  51                        p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
  52        }
  53}
  54
  55/* this also releases the path */
  56void btrfs_free_path(struct btrfs_path *p)
  57{
  58        if (!p)
  59                return;
  60        btrfs_release_path(p);
  61        kmem_cache_free(btrfs_path_cachep, p);
  62}
  63
  64/*
  65 * path release drops references on the extent buffers in the path
  66 * and it drops any locks held by this path
  67 *
  68 * It is safe to call this on paths that no locks or extent buffers held.
  69 */
  70noinline void btrfs_release_path(struct btrfs_path *p)
  71{
  72        int i;
  73
  74        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
  75                p->slots[i] = 0;
  76                if (!p->nodes[i])
  77                        continue;
  78                if (p->locks[i]) {
  79                        btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
  80                        p->locks[i] = 0;
  81                }
  82                free_extent_buffer(p->nodes[i]);
  83                p->nodes[i] = NULL;
  84        }
  85}
  86
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			/* got a reference before the buffer could be freed */
			rcu_read_unlock();
			break;
		}
		/* refcount already hit zero: wait for the grace period,
		 * then re-read root->node and retry */
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}
 120
 121/* loop around taking references on and locking the root node of the
 122 * tree until you end up with a lock on the root.  A locked buffer
 123 * is returned, with a reference held.
 124 */
 125struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 126{
 127        struct extent_buffer *eb;
 128
 129        while (1) {
 130                eb = btrfs_root_node(root);
 131                btrfs_tree_lock(eb);
 132                if (eb == root->node)
 133                        break;
 134                btrfs_tree_unlock(eb);
 135                free_extent_buffer(eb);
 136        }
 137        return eb;
 138}
 139
 140/* loop around taking references on and locking the root node of the
 141 * tree until you end up with a lock on the root.  A locked buffer
 142 * is returned, with a reference held.
 143 */
 144struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
 145{
 146        struct extent_buffer *eb;
 147
 148        while (1) {
 149                eb = btrfs_root_node(root);
 150                btrfs_tree_read_lock(eb);
 151                if (eb == root->node)
 152                        break;
 153                btrfs_tree_read_unlock(eb);
 154                free_extent_buffer(eb);
 155        }
 156        return eb;
 157}
 158
 159/* cowonly root (everything not a reference counted cow subvolume), just get
 160 * put onto a simple dirty list.  transaction.c walks this to make sure they
 161 * get properly updated on disk.
 162 */
 163static void add_root_to_dirty_list(struct btrfs_root *root)
 164{
 165        struct btrfs_fs_info *fs_info = root->fs_info;
 166
 167        if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
 168            !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
 169                return;
 170
 171        spin_lock(&fs_info->trans_lock);
 172        if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
 173                /* Want the extent tree to be the last on the list */
 174                if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
 175                        list_move_tail(&root->dirty_list,
 176                                       &fs_info->dirty_cowonly_roots);
 177                else
 178                        list_move(&root->dirty_list,
 179                                  &fs_info->dirty_cowonly_roots);
 180        }
 181        spin_unlock(&fs_info->trans_lock);
 182}
 183
 184/*
 185 * used by snapshot creation to make a copy of a root for a tree with
 186 * a given objectid.  The buffer with the new root node is returned in
 187 * cow_ret, and this func returns zero on success or a negative error code.
 188 */
 189int btrfs_copy_root(struct btrfs_trans_handle *trans,
 190                      struct btrfs_root *root,
 191                      struct extent_buffer *buf,
 192                      struct extent_buffer **cow_ret, u64 new_root_objectid)
 193{
 194        struct btrfs_fs_info *fs_info = root->fs_info;
 195        struct extent_buffer *cow;
 196        int ret = 0;
 197        int level;
 198        struct btrfs_disk_key disk_key;
 199
 200        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 201                trans->transid != fs_info->running_transaction->transid);
 202        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 203                trans->transid != root->last_trans);
 204
 205        level = btrfs_header_level(buf);
 206        if (level == 0)
 207                btrfs_item_key(buf, &disk_key, 0);
 208        else
 209                btrfs_node_key(buf, &disk_key, 0);
 210
 211        cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
 212                        &disk_key, level, buf->start, 0);
 213        if (IS_ERR(cow))
 214                return PTR_ERR(cow);
 215
 216        copy_extent_buffer_full(cow, buf);
 217        btrfs_set_header_bytenr(cow, cow->start);
 218        btrfs_set_header_generation(cow, trans->transid);
 219        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
 220        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
 221                                     BTRFS_HEADER_FLAG_RELOC);
 222        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
 223                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
 224        else
 225                btrfs_set_header_owner(cow, new_root_objectid);
 226
 227        write_extent_buffer_fsid(cow, fs_info->fsid);
 228
 229        WARN_ON(btrfs_header_generation(buf) > trans->transid);
 230        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
 231                ret = btrfs_inc_ref(trans, root, cow, 1);
 232        else
 233                ret = btrfs_inc_ref(trans, root, cow, 0);
 234
 235        if (ret)
 236                return ret;
 237
 238        btrfs_mark_buffer_dirty(cow);
 239        *cow_ret = cow;
 240        return 0;
 241}
 242
/*
 * Operation types recorded in the tree modification log.  The order of
 * the enumerators is part of the on-memory log format used by the code
 * below; do not reorder.
 */
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};
 252
/* Snapshot of the old root recorded for a MOD_LOG_ROOT_REPLACE entry. */
struct tree_mod_root {
	u64 logical;	/* logical address of the old root node */
	u8 level;	/* its level in the tree */
};
 257
/*
 * One entry in the tree modification log, keyed in the rb tree by
 * (logical, seq).  Which of the fields below are meaningful depends
 * on @op.
 */
struct tree_mod_elem {
	struct rb_node node;	/* link into fs_info->tree_mod_log */
	u64 logical;		/* block this entry describes (new root for
				   MOD_LOG_ROOT_REPLACE) */
	u64 seq;		/* sequence number, assigned at insert time */
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
 283
 284/*
 285 * Pull a new tree mod seq number for our operation.
 286 */
 287static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
 288{
 289        return atomic64_inc_return(&fs_info->tree_mod_seq);
 290}
 291
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 *
 * Returns elem->seq: a fresh sequence number is assigned first if the
 * element was not yet on the blocker list, otherwise the existing one is
 * returned unchanged.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	/*
	 * Both locks are needed: tree_mod_log_lock serializes against log
	 * insertion and pruning, tree_mod_seq_lock protects the blocker
	 * list itself.  Always taken in this order.
	 */
	write_lock(&fs_info->tree_mod_log_lock);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		/* first get for this blocker: assign a seq and queue it */
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}
 314
/*
 * Drop the blocker @elem from the tree mod seq list and prune every log
 * entry that is no longer needed by any remaining blocker.
 *
 * If a blocker with a sequence number lower than ours still exists,
 * nothing can be pruned.  Otherwise all entries with a seq not larger
 * than the minimum remaining blocker seq are removed and freed.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	/* elem was never registered via btrfs_get_tree_mod_seq() */
	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	/* find the smallest seq still blocked by the remaining elements */
	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	write_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		/* grab the successor first, rb_erase invalidates @node */
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
 364
/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Note: must be called with write lock for fs_info::tree_mod_log_lock.
 * Assigns tm->seq under that lock, so entries with equal logical are
 * inserted in strictly increasing seq order.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	/*
	 * Smaller (logical, seq) descends left, i.e. the tree is kept in
	 * descending order.  __tree_mod_log_search() walks with exactly
	 * the same comparisons; keep the two in sync.
	 */
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
 406
 407/*
 408 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 409 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 410 * this until all tree mod log insertions are recorded in the rb tree and then
 411 * write unlock fs_info::tree_mod_log_lock.
 412 */
 413static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
 414                                    struct extent_buffer *eb) {
 415        smp_mb();
 416        if (list_empty(&(fs_info)->tree_mod_seq_list))
 417                return 1;
 418        if (eb && btrfs_header_level(eb) == 0)
 419                return 1;
 420
 421        write_lock(&fs_info->tree_mod_log_lock);
 422        if (list_empty(&(fs_info)->tree_mod_seq_list)) {
 423                write_unlock(&fs_info->tree_mod_log_lock);
 424                return 1;
 425        }
 426
 427        return 0;
 428}
 429
 430/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
 431static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
 432                                    struct extent_buffer *eb)
 433{
 434        smp_mb();
 435        if (list_empty(&(fs_info)->tree_mod_seq_list))
 436                return 0;
 437        if (eb && btrfs_header_level(eb) == 0)
 438                return 0;
 439
 440        return 1;
 441}
 442
 443static struct tree_mod_elem *
 444alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
 445                    enum mod_log_op op, gfp_t flags)
 446{
 447        struct tree_mod_elem *tm;
 448
 449        tm = kzalloc(sizeof(*tm), flags);
 450        if (!tm)
 451                return NULL;
 452
 453        tm->logical = eb->start;
 454        if (op != MOD_LOG_KEY_ADD) {
 455                btrfs_node_key(eb, &tm->key, slot);
 456                tm->blockptr = btrfs_node_blockptr(eb, slot);
 457        }
 458        tm->op = op;
 459        tm->slot = slot;
 460        tm->generation = btrfs_node_ptr_generation(eb, slot);
 461        RB_CLEAR_NODE(&tm->node);
 462
 463        return tm;
 464}
 465
 466static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
 467                enum mod_log_op op, gfp_t flags)
 468{
 469        struct tree_mod_elem *tm;
 470        int ret;
 471
 472        if (!tree_mod_need_log(eb->fs_info, eb))
 473                return 0;
 474
 475        tm = alloc_tree_mod_elem(eb, slot, op, flags);
 476        if (!tm)
 477                return -ENOMEM;
 478
 479        if (tree_mod_dont_log(eb->fs_info, eb)) {
 480                kfree(tm);
 481                return 0;
 482        }
 483
 484        ret = __tree_mod_log_insert(eb->fs_info, tm);
 485        write_unlock(&eb->fs_info->tree_mod_log_lock);
 486        if (ret)
 487                kfree(tm);
 488
 489        return ret;
 490}
 491
/*
 * Log a move of @nr_items pointers within node @eb from @src_slot to
 * @dst_slot, together with the removals of any slots overwritten by the
 * move.  Returns 0 on success (or when logging is not needed) and a
 * negative errno on failure; on failure nothing is left in the log.
 */
static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
		int dst_slot, int src_slot, int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;	/* whether we hold tree_mod_log_lock at free_tms */

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	/* allocate everything up front, before taking the log lock */
	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	/* one removal entry per slot the move will overwrite */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;
free_tms:
	/* unlink anything already inserted into the rb tree, then free all */
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
 564
/*
 * Insert a batch of per-slot log elements for a node being freed, walking
 * from the highest slot down to slot 0.  On failure, every element that
 * was already inserted is unlinked from the rb tree again so the caller
 * can free the whole array.
 *
 * Must be called with fs_info::tree_mod_log_lock write-held.
 */
static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			/* unwind the insertions done so far (slots > i) */
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
 585
/*
 * Log the replacement of @old_root by @new_root.  When @log_removal is
 * set and the old root is not a leaf, removal entries for all of its
 * slots are logged as well (the old root is going away).
 *
 * Returns 0 on success (or when logging is not needed) and a negative
 * errno on failure; on failure nothing is left in the log.
 */
static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
			 struct extent_buffer *new_root, int log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* allocate all per-slot removal entries before taking the lock */
	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	/* root replace entries are keyed by the NEW root's address */
	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	/*
	 * __tree_mod_log_free_eb() already unlinked anything it inserted on
	 * failure, so only the allocations remain to be freed here.
	 */
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
 654
/*
 * Look up a log entry for block @start with seq >= @min_seq.  With
 * @smallest set, the entry with the smallest such seq is returned,
 * otherwise the one with the largest.  Returns NULL if there is none.
 *
 * The comparisons mirror __tree_mod_log_insert() exactly (smaller
 * values descend left); keep the two in sync.
 */
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			/* too old for the caller, look at newer entries */
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* exact match on (start, min_seq) */
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}
 696
 697/*
 698 * this returns the element from the log with the smallest time sequence
 699 * value that's in the log (the oldest log item). any element with a time
 700 * sequence lower than min_seq will be ignored.
 701 */
 702static struct tree_mod_elem *
 703tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
 704                           u64 min_seq)
 705{
 706        return __tree_mod_log_search(fs_info, start, min_seq, 1);
 707}
 708
 709/*
 710 * this returns the element from the log with the largest time sequence
 711 * value that's in the log (the most recent log item). any element with
 712 * a time sequence lower than min_seq will be ignored.
 713 */
 714static struct tree_mod_elem *
 715tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
 716{
 717        return __tree_mod_log_search(fs_info, start, min_seq, 0);
 718}
 719
/*
 * Log the copy of @nr_items pointers from node @src (starting at
 * @src_offset) into node @dst (starting at @dst_offset): one
 * MOD_LOG_KEY_REMOVE entry per source slot and one MOD_LOG_KEY_ADD entry
 * per destination slot.
 *
 * Returns 0 on success (or when logging is not needed) and a negative
 * errno on failure; on failure nothing is left in the log.
 */
static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;	/* whether we hold tree_mod_log_lock at free_tms */

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* leaf-to-leaf copies are never logged */
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	/* one array, first half ADD entries, second half REMOVE entries */
	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/* unlink anything already inserted into the rb tree, then free all */
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
 790
/*
 * Log the freeing of node @eb by recording a REMOVE_WHILE_FREEING entry
 * for every slot it holds.  Leaves are never logged.
 *
 * Returns 0 on success (or when logging is not needed) and a negative
 * errno on failure; on failure nothing is left in the log.
 */
static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(eb->fs_info, NULL))
		return 0;

	/* allocate all entries before taking the log lock */
	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	/*
	 * __tree_mod_log_free_eb() unlinked any partially inserted entries
	 * on failure, so only the allocations need freeing here.
	 */
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}
 836
 837/*
 838 * check if the tree block can be shared by multiple trees
 839 */
 840int btrfs_block_can_be_shared(struct btrfs_root *root,
 841                              struct extent_buffer *buf)
 842{
 843        /*
 844         * Tree blocks not in reference counted trees and tree roots
 845         * are never shared. If a block was allocated after the last
 846         * snapshot and the block was not allocated by tree relocation,
 847         * we know the block is not shared.
 848         */
 849        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 850            buf != root->node && buf != root->commit_root &&
 851            (btrfs_header_generation(buf) <=
 852             btrfs_root_last_snapshot(&root->root_item) ||
 853             btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
 854                return 1;
 855
 856        return 0;
 857}
 858
/*
 * Update the extent backrefs when COWing @buf into @cow.
 *
 * @trans:    the running transaction
 * @root:     the tree the block belongs to
 * @buf:      the original (pre-COW) tree block
 * @cow:      the freshly allocated copy of @buf
 * @last_ref: set to 1 when @buf loses its last reference here and can be
 *            freed by the caller (left untouched otherwise)
 *
 * Returns 0 on success or a negative errno (the -EROFS case marks the fs
 * as errored via btrfs_handle_fs_error()).
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root,
                                       struct extent_buffer *buf,
                                       struct extent_buffer *cow,
                                       int *last_ref)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 refs;
        u64 owner;
        u64 flags;
        u64 new_flags = 0;
        int ret;

        /*
         * Backrefs update rules:
         *
         * Always use full backrefs for extent pointers in tree block
         * allocated by tree relocation.
         *
         * If a shared tree block is no longer referenced by its owner
         * tree (btrfs_header_owner(buf) == root->root_key.objectid),
         * use full backrefs for extent pointers in tree block.
         *
         * If a tree block is been relocating
         * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
         * use full backrefs for extent pointers in tree block.
         * The reason for this is some operations (such as drop tree)
         * are only allowed for blocks use full backrefs.
         */

        if (btrfs_block_can_be_shared(root, buf)) {
                /* Possibly shared: read the real refcount and flags. */
                ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
                                               btrfs_header_level(buf), 1,
                                               &refs, &flags);
                if (ret)
                        return ret;
                if (refs == 0) {
                        /*
                         * A block being COWed must be referenced at least
                         * once; zero refs means on-disk corruption.
                         */
                        ret = -EROFS;
                        btrfs_handle_fs_error(fs_info, ret, NULL);
                        return ret;
                }
        } else {
                /* Not shareable: exactly one reference, synthesize flags. */
                refs = 1;
                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                        flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
                else
                        flags = 0;
        }

        owner = btrfs_header_owner(buf);
        /* Relocation-owned blocks must always carry full backrefs. */
        BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
               !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

        if (refs > 1) {
                /* The block is shared with another tree. */
                if ((owner == root->root_key.objectid ||
                     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
                    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
                        /* Convert the block to full backrefs. */
                        ret = btrfs_inc_ref(trans, root, buf, 1);
                        if (ret)
                                return ret;

                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID) {
                                ret = btrfs_dec_ref(trans, root, buf, 0);
                                if (ret)
                                        return ret;
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                                if (ret)
                                        return ret;
                        }
                        new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
                } else {

                        /* Add a reference for the new copy. */
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                        else
                                ret = btrfs_inc_ref(trans, root, cow, 0);
                        if (ret)
                                return ret;
                }
                if (new_flags != 0) {
                        /* Persist the FULL_BACKREF flag on the extent item. */
                        int level = btrfs_header_level(buf);

                        ret = btrfs_set_disk_extent_flags(trans, fs_info,
                                                          buf->start,
                                                          buf->len,
                                                          new_flags, level, 0);
                        if (ret)
                                return ret;
                }
        } else {
                /* Last reference: move the refs from @buf to @cow. */
                if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                        else
                                ret = btrfs_inc_ref(trans, root, cow, 0);
                        if (ret)
                                return ret;
                        ret = btrfs_dec_ref(trans, root, buf, 1);
                        if (ret)
                                return ret;
                }
                clean_tree_block(fs_info, buf);
                *last_ref = 1;
        }
        return 0;
}
 969
 970/*
 971 * does the dirty work in cow of a single block.  The parent block (if
 972 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 973 * dirty and returned locked.  If you modify the block it needs to be marked
 974 * dirty again.
 975 *
 976 * search_start -- an allocation hint for the new block
 977 *
 978 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 979 * bytes the allocator should try to find free next to the block it returns.
 980 * This is just a hint and may be ignored by the allocator.
 981 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct extent_buffer *buf,
                             struct extent_buffer *parent, int parent_slot,
                             struct extent_buffer **cow_ret,
                             u64 search_start, u64 empty_size)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_disk_key disk_key;
        struct extent_buffer *cow;
        int level, ret;
        int last_ref = 0;
        int unlock_orig = 0;
        u64 parent_start = 0;

        /*
         * If the caller passed the buffer itself as *cow_ret, it expects us
         * to drop the lock on the original once the copy is installed.
         */
        if (*cow_ret == buf)
                unlock_orig = 1;

        btrfs_assert_tree_locked(buf);

        /* COW of a ref-counted tree only makes sense in the running trans. */
        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
                trans->transid != fs_info->running_transaction->transid);
        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
                trans->transid != root->last_trans);

        level = btrfs_header_level(buf);

        /* First key of the block, used as a hint for the new allocation. */
        if (level == 0)
                btrfs_item_key(buf, &disk_key, 0);
        else
                btrfs_node_key(buf, &disk_key, 0);

        if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
                parent_start = parent->start;

        /*
         * If we are COWing a node/leaf from the extent, chunk or device trees,
         * make sure that we do not finish block group creation of pending block
         * groups. We do this to avoid a deadlock.
         * COWing can result in allocation of a new chunk, and flushing pending
         * block groups (btrfs_create_pending_block_groups()) can be triggered
         * when finishing allocation of a new chunk. Creation of a pending block
         * group modifies the extent, chunk and device trees, therefore we could
         * deadlock with ourselves since we are holding a lock on an extent
         * buffer that btrfs_create_pending_block_groups() may try to COW later.
         */
        if (root == fs_info->extent_root ||
            root == fs_info->chunk_root ||
            root == fs_info->dev_root)
                trans->can_flush_pending_bgs = false;

        cow = btrfs_alloc_tree_block(trans, root, parent_start,
                        root->root_key.objectid, &disk_key, level,
                        search_start, empty_size);
        trans->can_flush_pending_bgs = true;
        if (IS_ERR(cow))
                return PTR_ERR(cow);

        /* cow is set to blocking by btrfs_init_new_buffer */

        /* Copy the contents, then fix up the header for the new location. */
        copy_extent_buffer_full(cow, buf);
        btrfs_set_header_bytenr(cow, cow->start);
        btrfs_set_header_generation(cow, trans->transid);
        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
                                     BTRFS_HEADER_FLAG_RELOC);
        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
        else
                btrfs_set_header_owner(cow, root->root_key.objectid);

        write_extent_buffer_fsid(cow, fs_info->fsid);

        /* Move/duplicate the backrefs; may mark buf as losing its last ref. */
        ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                return ret;
        }

        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
                ret = btrfs_reloc_cow_block(trans, root, buf, cow);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        return ret;
                }
        }

        if (buf == root->node) {
                /* COWing the root: install the copy as the new root node. */
                WARN_ON(parent && parent != buf);
                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                        parent_start = buf->start;

                extent_buffer_get(cow);
                ret = tree_mod_log_insert_root(root->node, cow, 1);
                BUG_ON(ret < 0);
                rcu_assign_pointer(root->node, cow);

                btrfs_free_tree_block(trans, root, buf, parent_start,
                                      last_ref);
                free_extent_buffer(buf);
                add_root_to_dirty_list(root);
        } else {
                /* Point the parent's slot at the new copy. */
                WARN_ON(trans->transid != btrfs_header_generation(parent));
                tree_mod_log_insert_key(parent, parent_slot,
                                        MOD_LOG_KEY_REPLACE, GFP_NOFS);
                btrfs_set_node_blockptr(parent, parent_slot,
                                        cow->start);
                btrfs_set_node_ptr_generation(parent, parent_slot,
                                              trans->transid);
                btrfs_mark_buffer_dirty(parent);
                if (last_ref) {
                        ret = tree_mod_log_free_eb(buf);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
                                return ret;
                        }
                }
                btrfs_free_tree_block(trans, root, buf, parent_start,
                                      last_ref);
        }
        if (unlock_orig)
                btrfs_tree_unlock(buf);
        free_extent_buffer_stale(buf);
        btrfs_mark_buffer_dirty(cow);
        /* Return the new buffer, still locked and marked dirty. */
        *cow_ret = cow;
        return 0;
}
1110
1111/*
1112 * returns the logical address of the oldest predecessor of the given root.
1113 * entries older than time_seq are ignored.
1114 */
static struct tree_mod_elem *__tree_mod_log_oldest_root(
                struct extent_buffer *eb_root, u64 time_seq)
{
        struct tree_mod_elem *tm;
        struct tree_mod_elem *found = NULL;
        u64 root_logical = eb_root->start;
        int looped = 0;

        /* time_seq == 0 means "no rewind requested". */
        if (!time_seq)
                return NULL;

        /*
         * the very last operation that's logged for a root is the
         * replacement operation (if it is replaced at all). this has
         * the logical address of the *new* root, making it the very
         * first operation that's logged for this root.
         */
        while (1) {
                tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
                                                time_seq);
                /* no log entries at all for the current root: nothing to do */
                if (!looped && !tm)
                        return NULL;
                /*
                 * if there are no tree operation for the oldest root, we simply
                 * return it. this should only happen if that (old) root is at
                 * level 0.
                 */
                if (!tm)
                        break;

                /*
                 * if there's an operation that's not a root replacement, we
                 * found the oldest version of our root. normally, we'll find a
                 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
                 */
                if (tm->op != MOD_LOG_ROOT_REPLACE)
                        break;

                /* Follow the replacement chain back to the previous root. */
                found = tm;
                root_logical = tm->old_root.logical;
                looped = 1;
        }

        /* if there's no old root to return, return what we found instead */
        if (!found)
                found = tm;

        return found;
}
1164
1165/*
1166 * tm is a pointer to the first operation to rewind within eb. then, all
1167 * previous operations will be rewound (until we reach something older than
1168 * time_seq).
1169 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
                      u64 time_seq, struct tree_mod_elem *first_tm)
{
        u32 n;
        struct rb_node *next;
        struct tree_mod_elem *tm = first_tm;
        unsigned long o_dst;
        unsigned long o_src;
        unsigned long p_size = sizeof(struct btrfs_key_ptr);

        /* n tracks the item count as it is undone; written back at the end. */
        n = btrfs_header_nritems(eb);
        read_lock(&fs_info->tree_mod_log_lock);
        while (tm && tm->seq >= time_seq) {
                /*
                 * all the operations are recorded with the operator used for
                 * the modification. as we're going backwards, we do the
                 * opposite of each operation here.
                 */
                switch (tm->op) {
                case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
                        BUG_ON(tm->slot < n);
                        /* Fallthrough */
                case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
                case MOD_LOG_KEY_REMOVE:
                        /* undo a removal: restore the key/ptr/generation */
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        n++;
                        break;
                case MOD_LOG_KEY_REPLACE:
                        /* undo a replace: restore the previous key/ptr */
                        BUG_ON(tm->slot >= n);
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        break;
                case MOD_LOG_KEY_ADD:
                        /* if a move operation is needed it's in the log */
                        n--;
                        break;
                case MOD_LOG_MOVE_KEYS:
                        /* undo a move: copy the pointers back to dst_slot */
                        o_dst = btrfs_node_key_ptr_offset(tm->slot);
                        o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
                        memmove_extent_buffer(eb, o_dst, o_src,
                                              tm->move.nr_items * p_size);
                        break;
                case MOD_LOG_ROOT_REPLACE:
                        /*
                         * this operation is special. for roots, this must be
                         * handled explicitly before rewinding.
                         * for non-roots, this operation may exist if the node
                         * was a root: root A -> child B; then A gets empty and
                         * B is promoted to the new root. in the mod log, we'll
                         * have a root-replace operation for B, a tree block
                         * that is no root. we simply ignore that operation.
                         */
                        break;
                }
                next = rb_next(&tm->node);
                if (!next)
                        break;
                tm = rb_entry(next, struct tree_mod_elem, node);
                /* stop once the log entries refer to a different block */
                if (tm->logical != first_tm->logical)
                        break;
        }
        read_unlock(&fs_info->tree_mod_log_lock);
        btrfs_set_header_nritems(eb, n);
}
1240
1241/*
1242 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
1243 * is returned. If rewind operations happen, a fresh buffer is returned. The
1244 * returned buffer is always read-locked. If the returned buffer is not the
1245 * input buffer, the lock on the input buffer is released and the input buffer
1246 * is freed (its refcount is decremented).
1247 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
                    struct extent_buffer *eb, u64 time_seq)
{
        struct extent_buffer *eb_rewin;
        struct tree_mod_elem *tm;

        /* No rewind requested. */
        if (!time_seq)
                return eb;

        /* Leaves are not rewound by the tree mod log. */
        if (btrfs_header_level(eb) == 0)
                return eb;

        /* Nothing logged for this block at or after time_seq: use it as is. */
        tm = tree_mod_log_search(fs_info, eb->start, time_seq);
        if (!tm)
                return eb;

        btrfs_set_path_blocking(path);
        btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

        if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                /*
                 * The block was freed; rebuild it from scratch on a dummy
                 * buffer, starting from an empty header.
                 */
                BUG_ON(tm->slot != 0);
                eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
                if (!eb_rewin) {
                        btrfs_tree_read_unlock_blocking(eb);
                        free_extent_buffer(eb);
                        return NULL;
                }
                btrfs_set_header_bytenr(eb_rewin, eb->start);
                btrfs_set_header_backref_rev(eb_rewin,
                                             btrfs_header_backref_rev(eb));
                btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
                btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
        } else {
                /* Otherwise rewind on a private clone of the current block. */
                eb_rewin = btrfs_clone_extent_buffer(eb);
                if (!eb_rewin) {
                        btrfs_tree_read_unlock_blocking(eb);
                        free_extent_buffer(eb);
                        return NULL;
                }
        }

        /* Hand back a read-locked rewound copy; drop the input buffer. */
        btrfs_tree_read_unlock_blocking(eb);
        free_extent_buffer(eb);

        extent_buffer_get(eb_rewin);
        btrfs_tree_read_lock(eb_rewin);
        __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
        WARN_ON(btrfs_header_nritems(eb_rewin) >
                BTRFS_NODEPTRS_PER_BLOCK(fs_info));

        return eb_rewin;
}
1301
1302/*
1303 * get_old_root() rewinds the state of @root's root node to the given @time_seq
1304 * value. If there are no changes, the current root->root_node is returned. If
1305 * anything changed in between, there's a fresh buffer allocated on which the
1306 * rewind operations are done. In any case, the returned buffer is read locked.
1307 * Returns NULL on error (with no locks held).
1308 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct tree_mod_elem *tm;
        struct extent_buffer *eb = NULL;
        struct extent_buffer *eb_root;
        struct extent_buffer *old;
        struct tree_mod_root *old_root = NULL;
        u64 old_generation = 0;
        u64 logical;
        int level;

        eb_root = btrfs_read_lock_root_node(root);
        tm = __tree_mod_log_oldest_root(eb_root, time_seq);
        /* No logged changes: the current root node is already correct. */
        if (!tm)
                return eb_root;

        if (tm->op == MOD_LOG_ROOT_REPLACE) {
                /* The root was replaced; rewind from the old root block. */
                old_root = &tm->old_root;
                old_generation = tm->generation;
                logical = old_root->logical;
                level = old_root->level;
        } else {
                /* Same root block, but it may need key-level rewinding. */
                logical = eb_root->start;
                level = btrfs_header_level(eb_root);
        }

        tm = tree_mod_log_search(fs_info, logical, time_seq);
        if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                /* Old root still exists on disk: read it and clone it. */
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
                old = read_tree_block(fs_info, logical, 0, level, NULL);
                if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
                        if (!IS_ERR(old))
                                free_extent_buffer(old);
                        btrfs_warn(fs_info,
                                   "failed to read tree block %llu from get_old_root",
                                   logical);
                } else {
                        eb = btrfs_clone_extent_buffer(old);
                        free_extent_buffer(old);
                }
        } else if (old_root) {
                /* Old root was freed: rebuild it on a dummy buffer. */
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
                eb = alloc_dummy_extent_buffer(fs_info, logical);
        } else {
                /* Rewind in place on a clone of the current root node. */
                btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
                eb = btrfs_clone_extent_buffer(eb_root);
                btrfs_tree_read_unlock_blocking(eb_root);
                free_extent_buffer(eb_root);
        }

        if (!eb)
                return NULL;
        extent_buffer_get(eb);
        btrfs_tree_read_lock(eb);
        if (old_root) {
                /* Restore the old root's header fields on the fresh buffer. */
                btrfs_set_header_bytenr(eb, eb->start);
                btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
                btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
                btrfs_set_header_level(eb, old_root->level);
                btrfs_set_header_generation(eb, old_generation);
        }
        if (tm)
                __tree_mod_log_rewind(fs_info, eb, time_seq, tm);
        else
                WARN_ON(btrfs_header_level(eb) != 0);
        WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

        return eb;
}
1382
1383int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1384{
1385        struct tree_mod_elem *tm;
1386        int level;
1387        struct extent_buffer *eb_root = btrfs_root_node(root);
1388
1389        tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1390        if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1391                level = tm->old_root.level;
1392        } else {
1393                level = btrfs_header_level(eb_root);
1394        }
1395        free_extent_buffer(eb_root);
1396
1397        return level;
1398}
1399
/* Decide whether @buf must be COWed before it can be modified in @trans. */
static inline int should_cow_block(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   struct extent_buffer *buf)
{
        if (btrfs_is_testing(root->fs_info))
                return 0;

        /* Ensure we can see the FORCE_COW bit */
        smp_mb__before_atomic();

        /*
         * We do not need to cow a block if
         * 1) this block is not created or changed in this transaction;
         * 2) this block does not belong to TREE_RELOC tree;
         * 3) the root is not forced COW.
         *
         * What is forced COW:
         *    when we create snapshot during committing the transaction,
         *    after we've finished coping src root, we must COW the shared
         *    block to ensure the metadata consistency.
         */
        if (btrfs_header_generation(buf) == trans->transid &&
            !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
            !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
            !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
                return 0;
        return 1;
}
1429
1430/*
1431 * cows a single block, see __btrfs_cow_block for the real work.
1432 * This version of it has extra checks so that a block isn't COWed more than
1433 * once per transaction, as long as it hasn't been written yet
1434 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
                    struct btrfs_root *root, struct extent_buffer *buf,
                    struct extent_buffer *parent, int parent_slot,
                    struct extent_buffer **cow_ret)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 search_start;
        int ret;

        /* Sanity: the handle must belong to the running transaction. */
        if (trans->transaction != fs_info->running_transaction)
                WARN(1, KERN_CRIT "trans %llu running %llu\n",
                       trans->transid,
                       fs_info->running_transaction->transid);

        if (trans->transid != fs_info->generation)
                WARN(1, KERN_CRIT "trans %llu running %llu\n",
                       trans->transid, fs_info->generation);

        /* Already COWed in this transaction: reuse the buffer as is. */
        if (!should_cow_block(trans, root, buf)) {
                trans->dirty = true;
                *cow_ret = buf;
                return 0;
        }

        /* Allocation hint: keep the copy within the same 1GiB chunk area. */
        search_start = buf->start & ~((u64)SZ_1G - 1);

        if (parent)
                btrfs_set_lock_blocking(parent);
        btrfs_set_lock_blocking(buf);

        ret = __btrfs_cow_block(trans, root, buf, parent,
                                 parent_slot, cow_ret, search_start, 0);

        trace_btrfs_cow_block(root, buf, *cow_ret);

        return ret;
}
1472
1473/*
1474 * helper function for defrag to decide if two blocks pointed to by a
1475 * node are actually close by
1476 */
1477static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1478{
1479        if (blocknr < other && other - (blocknr + blocksize) < 32768)
1480                return 1;
1481        if (blocknr > other && blocknr - (other + blocksize) < 32768)
1482                return 1;
1483        return 0;
1484}
1485
1486/*
1487 * compare two keys in a memcmp fashion
1488 */
1489static int comp_keys(const struct btrfs_disk_key *disk,
1490                     const struct btrfs_key *k2)
1491{
1492        struct btrfs_key k1;
1493
1494        btrfs_disk_key_to_cpu(&k1, disk);
1495
1496        return btrfs_comp_cpu_keys(&k1, k2);
1497}
1498
1499/*
1500 * same as comp_keys only with two btrfs_key's
1501 */
1502int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1503{
1504        if (k1->objectid > k2->objectid)
1505                return 1;
1506        if (k1->objectid < k2->objectid)
1507                return -1;
1508        if (k1->type > k2->type)
1509                return 1;
1510        if (k1->type < k2->type)
1511                return -1;
1512        if (k1->offset > k2->offset)
1513                return 1;
1514        if (k1->offset < k2->offset)
1515                return -1;
1516        return 0;
1517}
1518
1519/*
1520 * this is used by the defrag code to go through all the
1521 * leaves pointed to by a node and reallocate them so that
1522 * disk order is close to key order
1523 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct extent_buffer *parent,
                       int start_slot, u64 *last_ret,
                       struct btrfs_key *progress)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *cur;
        u64 blocknr;
        u64 gen;
        u64 search_start = *last_ret;
        u64 last_block = 0;
        u64 other;
        u32 parent_nritems;
        int end_slot;
        int i;
        int err = 0;
        int parent_level;
        int uptodate;
        u32 blocksize;
        int progress_passed = 0;
        struct btrfs_disk_key disk_key;

        parent_level = btrfs_header_level(parent);

        /* Defrag must run inside the current transaction. */
        WARN_ON(trans->transaction != fs_info->running_transaction);
        WARN_ON(trans->transid != fs_info->generation);

        parent_nritems = btrfs_header_nritems(parent);
        blocksize = fs_info->nodesize;
        end_slot = parent_nritems - 1;

        /* Nothing to rearrange with fewer than two children. */
        if (parent_nritems <= 1)
                return 0;

        btrfs_set_lock_blocking(parent);

        for (i = start_slot; i <= end_slot; i++) {
                struct btrfs_key first_key;
                int close = 1;

                /* Skip slots before the caller's progress key. */
                btrfs_node_key(parent, &disk_key, i);
                if (!progress_passed && comp_keys(&disk_key, progress) < 0)
                        continue;

                progress_passed = 1;
                blocknr = btrfs_node_blockptr(parent, i);
                gen = btrfs_node_ptr_generation(parent, i);
                btrfs_node_key_to_cpu(parent, &first_key, i);
                if (last_block == 0)
                        last_block = blocknr;

                /*
                 * Leave a block alone if it is already close to either of
                 * its on-disk neighbors; only relocate scattered blocks.
                 */
                if (i > 0) {
                        other = btrfs_node_blockptr(parent, i - 1);
                        close = close_blocks(blocknr, other, blocksize);
                }
                if (!close && i < end_slot) {
                        other = btrfs_node_blockptr(parent, i + 1);
                        close = close_blocks(blocknr, other, blocksize);
                }
                if (close) {
                        last_block = blocknr;
                        continue;
                }

                /* Get the child buffer, reading it from disk if needed. */
                cur = find_extent_buffer(fs_info, blocknr);
                if (cur)
                        uptodate = btrfs_buffer_uptodate(cur, gen, 0);
                else
                        uptodate = 0;
                if (!cur || !uptodate) {
                        if (!cur) {
                                cur = read_tree_block(fs_info, blocknr, gen,
                                                      parent_level - 1,
                                                      &first_key);
                                if (IS_ERR(cur)) {
                                        return PTR_ERR(cur);
                                } else if (!extent_buffer_uptodate(cur)) {
                                        free_extent_buffer(cur);
                                        return -EIO;
                                }
                        } else if (!uptodate) {
                                err = btrfs_read_buffer(cur, gen,
                                                parent_level - 1,&first_key);
                                if (err) {
                                        free_extent_buffer(cur);
                                        return err;
                                }
                        }
                }
                if (search_start == 0)
                        search_start = last_block;

                /*
                 * COW the child near search_start; the empty_size hint asks
                 * the allocator for room next to it for the following blocks.
                 */
                btrfs_tree_lock(cur);
                btrfs_set_lock_blocking(cur);
                err = __btrfs_cow_block(trans, root, cur, parent, i,
                                        &cur, search_start,
                                        min(16 * blocksize,
                                            (end_slot - i) * blocksize));
                if (err) {
                        btrfs_tree_unlock(cur);
                        free_extent_buffer(cur);
                        break;
                }
                /* Remember where we ended up so the caller can resume. */
                search_start = cur->start;
                last_block = cur->start;
                *last_ret = search_start;
                btrfs_tree_unlock(cur);
                free_extent_buffer(cur);
        }
        return err;
}
1635
1636/*
1637 * search for key in the extent_buffer.  The items start at offset p,
1638 * and they are item_size apart.  There are 'max' items in p.
1639 *
1640 * the slot in the array is returned via slot, and it points to
1641 * the place where you would insert key if it is not found in
1642 * the array.
1643 *
1644 * slot may point to max if the key is bigger than all of the keys
1645 */
1646static noinline int generic_bin_search(struct extent_buffer *eb,
1647                                       unsigned long p, int item_size,
1648                                       const struct btrfs_key *key,
1649                                       int max, int *slot)
1650{
1651        int low = 0;
1652        int high = max;
1653        int mid;
1654        int ret;
1655        struct btrfs_disk_key *tmp = NULL;
1656        struct btrfs_disk_key unaligned;
1657        unsigned long offset;
1658        char *kaddr = NULL;
1659        unsigned long map_start = 0;
1660        unsigned long map_len = 0;
1661        int err;
1662
1663        if (low > high) {
1664                btrfs_err(eb->fs_info,
1665                 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1666                          __func__, low, high, eb->start,
1667                          btrfs_header_owner(eb), btrfs_header_level(eb));
1668                return -EINVAL;
1669        }
1670
1671        while (low < high) {
1672                mid = (low + high) / 2;
1673                offset = p + mid * item_size;
1674
1675                if (!kaddr || offset < map_start ||
1676                    (offset + sizeof(struct btrfs_disk_key)) >
1677                    map_start + map_len) {
1678
1679                        err = map_private_extent_buffer(eb, offset,
1680                                                sizeof(struct btrfs_disk_key),
1681                                                &kaddr, &map_start, &map_len);
1682
1683                        if (!err) {
1684                                tmp = (struct btrfs_disk_key *)(kaddr + offset -
1685                                                        map_start);
1686                        } else if (err == 1) {
1687                                read_extent_buffer(eb, &unaligned,
1688                                                   offset, sizeof(unaligned));
1689                                tmp = &unaligned;
1690                        } else {
1691                                return err;
1692                        }
1693
1694                } else {
1695                        tmp = (struct btrfs_disk_key *)(kaddr + offset -
1696                                                        map_start);
1697                }
1698                ret = comp_keys(tmp, key);
1699
1700                if (ret < 0)
1701                        low = mid + 1;
1702                else if (ret > 0)
1703                        high = mid;
1704                else {
1705                        *slot = mid;
1706                        return 0;
1707                }
1708        }
1709        *slot = low;
1710        return 1;
1711}
1712
1713/*
1714 * simple bin_search frontend that does the right thing for
1715 * leaves vs nodes
1716 */
1717int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1718                     int level, int *slot)
1719{
1720        if (level == 0)
1721                return generic_bin_search(eb,
1722                                          offsetof(struct btrfs_leaf, items),
1723                                          sizeof(struct btrfs_item),
1724                                          key, btrfs_header_nritems(eb),
1725                                          slot);
1726        else
1727                return generic_bin_search(eb,
1728                                          offsetof(struct btrfs_node, ptrs),
1729                                          sizeof(struct btrfs_key_ptr),
1730                                          key, btrfs_header_nritems(eb),
1731                                          slot);
1732}
1733
1734static void root_add_used(struct btrfs_root *root, u32 size)
1735{
1736        spin_lock(&root->accounting_lock);
1737        btrfs_set_root_used(&root->root_item,
1738                            btrfs_root_used(&root->root_item) + size);
1739        spin_unlock(&root->accounting_lock);
1740}
1741
1742static void root_sub_used(struct btrfs_root *root, u32 size)
1743{
1744        spin_lock(&root->accounting_lock);
1745        btrfs_set_root_used(&root->root_item,
1746                            btrfs_root_used(&root->root_item) - size);
1747        spin_unlock(&root->accounting_lock);
1748}
1749
1750/* given a node and slot number, this reads the blocks it points to.  The
1751 * extent buffer is returned with a reference taken (but unlocked).
1752 */
1753static noinline struct extent_buffer *
1754read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
1755               int slot)
1756{
1757        int level = btrfs_header_level(parent);
1758        struct extent_buffer *eb;
1759        struct btrfs_key first_key;
1760
1761        if (slot < 0 || slot >= btrfs_header_nritems(parent))
1762                return ERR_PTR(-ENOENT);
1763
1764        BUG_ON(level == 0);
1765
1766        btrfs_node_key_to_cpu(parent, &first_key, slot);
1767        eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
1768                             btrfs_node_ptr_generation(parent, slot),
1769                             level - 1, &first_key);
1770        if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1771                free_extent_buffer(eb);
1772                eb = ERR_PTR(-EIO);
1773        }
1774
1775        return eb;
1776}
1777
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave an node completely empty later on.
 *
 * Returns 0 on success or a negative errno.  On return, path->nodes[level]
 * may have been switched to the left sibling, and locks/references in the
 * path are adjusted accordingly.
 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	/* mid must be write locked and already COWed in this transaction */
	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	/* remember the block we pointed at, for the sanity check at the end */
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = read_node_slot(fs_info, mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		ret = tree_mod_log_insert_root(root->node, child, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		clean_tree_block(fs_info, mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		/*
		 * NOTE(review): mid->len is read after the path's reference
		 * was dropped above; presumably the old root-pointer ref
		 * (released below) keeps mid alive here — confirm.
		 */
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	/* mid is more than a quarter full, no balancing needed */
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	/* a missing sibling is treated the same as a read failure: NULL */
	left = read_node_slot(fs_info, parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	if (left) {
		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	right = read_node_slot(fs_info, parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	if (right) {
		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		/* orig_slot becomes relative to left in case we move there */
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, fs_info, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, fs_info, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			/* right is now empty, remove it from the tree */
			clean_tree_block(fs_info, right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			/* right's first key changed, fix it in the parent */
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}
		wret = balance_node_right(trans, fs_info, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, fs_info, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		/* mid is now empty, remove it from the tree */
		clean_tree_block(fs_info, mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		ret = tree_mod_log_insert_key(parent, pslot,
				MOD_LOG_KEY_REPLACE, GFP_NOFS);
		BUG_ON(ret < 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			/* our original slot ended up inside left */
			extent_buffer_get(left);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			/* still in mid, re-base the slot after the pushes */
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
2006
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 *
 * Returns 0 if room was made (the path may now point at a sibling),
 * 1 if nothing could be pushed, or a negative errno on COW failure.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/* no parent means mid is the root: nothing to push into */
	if (!parent)
		return 1;

	left = read_node_slot(fs_info, parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* left is itself (nearly) full, can't push there */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, fs_info,
						      left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* items moved: fix mid's key in the parent */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				/* our slot moved into left; follow it */
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				/* slot stayed in mid; re-base it */
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = read_node_slot(fs_info, parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* right is itself (nearly) full, can't push there */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, fs_info,
							  right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* items moved: fix right's key in the parent */
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				/* our slot moved into right; follow it */
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}
2149
2150/*
2151 * readahead one full node of leaves, finding things that are close
2152 * to the block in 'slot', and triggering ra on them.
2153 */
2154static void reada_for_search(struct btrfs_fs_info *fs_info,
2155                             struct btrfs_path *path,
2156                             int level, int slot, u64 objectid)
2157{
2158        struct extent_buffer *node;
2159        struct btrfs_disk_key disk_key;
2160        u32 nritems;
2161        u64 search;
2162        u64 target;
2163        u64 nread = 0;
2164        struct extent_buffer *eb;
2165        u32 nr;
2166        u32 blocksize;
2167        u32 nscan = 0;
2168
2169        if (level != 1)
2170                return;
2171
2172        if (!path->nodes[level])
2173                return;
2174
2175        node = path->nodes[level];
2176
2177        search = btrfs_node_blockptr(node, slot);
2178        blocksize = fs_info->nodesize;
2179        eb = find_extent_buffer(fs_info, search);
2180        if (eb) {
2181                free_extent_buffer(eb);
2182                return;
2183        }
2184
2185        target = search;
2186
2187        nritems = btrfs_header_nritems(node);
2188        nr = slot;
2189
2190        while (1) {
2191                if (path->reada == READA_BACK) {
2192                        if (nr == 0)
2193                                break;
2194                        nr--;
2195                } else if (path->reada == READA_FORWARD) {
2196                        nr++;
2197                        if (nr >= nritems)
2198                                break;
2199                }
2200                if (path->reada == READA_BACK && objectid) {
2201                        btrfs_node_key(node, &disk_key, nr);
2202                        if (btrfs_disk_key_objectid(&disk_key) != objectid)
2203                                break;
2204                }
2205                search = btrfs_node_blockptr(node, nr);
2206                if ((search <= target && target - search <= 65536) ||
2207                    (search > target && search - target <= 65536)) {
2208                        readahead_tree_block(fs_info, search);
2209                        nread += blocksize;
2210                }
2211                nscan++;
2212                if ((nread > 65536 || nscan > 32))
2213                        break;
2214        }
2215}
2216
2217static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2218                                       struct btrfs_path *path, int level)
2219{
2220        int slot;
2221        int nritems;
2222        struct extent_buffer *parent;
2223        struct extent_buffer *eb;
2224        u64 gen;
2225        u64 block1 = 0;
2226        u64 block2 = 0;
2227
2228        parent = path->nodes[level + 1];
2229        if (!parent)
2230                return;
2231
2232        nritems = btrfs_header_nritems(parent);
2233        slot = path->slots[level + 1];
2234
2235        if (slot > 0) {
2236                block1 = btrfs_node_blockptr(parent, slot - 1);
2237                gen = btrfs_node_ptr_generation(parent, slot - 1);
2238                eb = find_extent_buffer(fs_info, block1);
2239                /*
2240                 * if we get -eagain from btrfs_buffer_uptodate, we
2241                 * don't want to return eagain here.  That will loop
2242                 * forever
2243                 */
2244                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2245                        block1 = 0;
2246                free_extent_buffer(eb);
2247        }
2248        if (slot + 1 < nritems) {
2249                block2 = btrfs_node_blockptr(parent, slot + 1);
2250                gen = btrfs_node_ptr_generation(parent, slot + 1);
2251                eb = find_extent_buffer(fs_info, block2);
2252                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2253                        block2 = 0;
2254                free_extent_buffer(eb);
2255        }
2256
2257        if (block1)
2258                readahead_tree_block(fs_info, block1);
2259        if (block2)
2260                readahead_tree_block(fs_info, block2);
2261}
2262
2263
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;	/* keep locks at levels <= skip_level */
	int no_skips = 0;	/* once set, stop raising skip_level */
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		/* slot 0: key changes may propagate up, keep parent locked */
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		/* last slot with keep_locks: caller will advance the slot */
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			/* lower the write-lock watermark past freed levels */
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
2319
2320/*
2321 * This releases any locks held in the path starting at level and
2322 * going all the way up to the root.
2323 *
2324 * btrfs_search_slot will keep the lock held on higher nodes in a few
2325 * corner cases, such as COW of the block at slot zero in the node.  This
2326 * ignores those rules, and it should only be called when there are no
2327 * more updates to be done higher up in the tree.
2328 */
2329noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2330{
2331        int i;
2332
2333        if (path->keep_locks)
2334                return;
2335
2336        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2337                if (!path->nodes[i])
2338                        continue;
2339                if (!path->locks[i])
2340                        continue;
2341                btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2342                path->locks[i] = 0;
2343        }
2344}
2345
/*
 * helper function for btrfs_search_slot.  The goal is to find a block
 * in cache without setting the path to blocking.  If we find the block
 * we return zero and the path is unchanged.
 *
 * If we can't find the block, we set the path blocking and do some
 * reada.  -EAGAIN is returned and the search must be repeated.
 *
 * Returns 0 with *eb_ret set on success.  On -EAGAIN or -EIO the path
 * has been released and the caller must restart the search.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 blocknr;
	u64 gen;
	struct extent_buffer *b = *eb_ret;
	struct extent_buffer *tmp;
	struct btrfs_key first_key;
	int ret;
	int parent_level;

	/* the expected location, generation and first key of the child */
	blocknr = btrfs_node_blockptr(b, slot);
	gen = btrfs_node_ptr_generation(b, slot);
	parent_level = btrfs_header_level(b);
	btrfs_node_key_to_cpu(b, &first_key, slot);

	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			*eb_ret = tmp;
			return 0;
		}

		/* the pages were up to date, but we failed
		 * the generation number check.  Do a full
		 * read for the generation number that is correct.
		 * We must do this without dropping locks so
		 * we can trust our generation number
		 */
		btrfs_set_path_blocking(p);

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
		if (!ret) {
			*eb_ret = tmp;
			return 0;
		}
		free_extent_buffer(tmp);
		btrfs_release_path(p);
		return -EIO;
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);
	btrfs_set_path_blocking(p);

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	ret = -EAGAIN;
	tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
			      &first_key);
	if (!IS_ERR(tmp)) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!extent_buffer_uptodate(tmp))
			ret = -EIO;
		free_extent_buffer(tmp);
	} else {
		ret = PTR_ERR(tmp);
	}

	/* locks were dropped above, so the caller has to search again */
	btrfs_release_path(p);
	return ret;
}
2433
2434/*
2435 * helper function for btrfs_search_slot.  This does all of the checks
2436 * for node-level blocks and does any balancing required based on
2437 * the ins_len.
2438 *
2439 * If no extra work was required, zero is returned.  If we had to
2440 * drop the path, -EAGAIN is returned and btrfs_search_slot must
2441 * start over
2442 */
2443static int
2444setup_nodes_for_search(struct btrfs_trans_handle *trans,
2445                       struct btrfs_root *root, struct btrfs_path *p,
2446                       struct extent_buffer *b, int level, int ins_len,
2447                       int *write_lock_level)
2448{
2449        struct btrfs_fs_info *fs_info = root->fs_info;
2450        int ret;
2451
2452        if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2453            BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2454                int sret;
2455
2456                if (*write_lock_level < level + 1) {
2457                        *write_lock_level = level + 1;
2458                        btrfs_release_path(p);
2459                        goto again;
2460                }
2461
2462                btrfs_set_path_blocking(p);
2463                reada_for_balance(fs_info, p, level);
2464                sret = split_node(trans, root, p, level);
2465
2466                BUG_ON(sret > 0);
2467                if (sret) {
2468                        ret = sret;
2469                        goto done;
2470                }
2471                b = p->nodes[level];
2472        } else if (ins_len < 0 && btrfs_header_nritems(b) <
2473                   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2474                int sret;
2475
2476                if (*write_lock_level < level + 1) {
2477                        *write_lock_level = level + 1;
2478                        btrfs_release_path(p);
2479                        goto again;
2480                }
2481
2482                btrfs_set_path_blocking(p);
2483                reada_for_balance(fs_info, p, level);
2484                sret = balance_level(trans, root, p, level);
2485
2486                if (sret) {
2487                        ret = sret;
2488                        goto done;
2489                }
2490                b = p->nodes[level];
2491                if (!b) {
2492                        btrfs_release_path(p);
2493                        goto again;
2494                }
2495                BUG_ON(btrfs_header_nritems(b) == 1);
2496        }
2497        return 0;
2498
2499again:
2500        ret = -EAGAIN;
2501done:
2502        return ret;
2503}
2504
/*
 * Debug-only sanity check used when key_search() skips the binary
 * search: assert that @key really matches the first key stored in @b.
 * Compiles to nothing unless CONFIG_BTRFS_ASSERT is enabled.
 */
static void key_search_validate(struct extent_buffer *b,
				const struct btrfs_key *key,
				int level)
{
#ifdef CONFIG_BTRFS_ASSERT
	struct btrfs_disk_key disk_key;
	unsigned long key_off;

	btrfs_cpu_key_to_disk(&disk_key, key);

	/* Leaves and nodes keep their first key at different offsets */
	key_off = (level == 0) ?
		offsetof(struct btrfs_leaf, items[0].key) :
		offsetof(struct btrfs_node, ptrs[0].key);

	ASSERT(!memcmp_extent_buffer(b, &disk_key, key_off, sizeof(disk_key)));
#endif
}
2524
/*
 * Find the slot for @key in @b.  Once a previous level produced an
 * exact match (*prev_cmp == 0), the first key of every child on the
 * way down must also match, so the binary search can be skipped and
 * slot 0 returned directly (validated under CONFIG_BTRFS_ASSERT).
 */
static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
		      int level, int *prev_cmp, int *slot)
{
	if (*prev_cmp == 0) {
		/* Exact match above: the answer is always slot 0 */
		key_search_validate(b, key, level);
		*slot = 0;
		return 0;
	}

	*prev_cmp = btrfs_bin_search(b, key, level, slot);
	return *prev_cmp;
}
2538
2539int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2540                u64 iobjectid, u64 ioff, u8 key_type,
2541                struct btrfs_key *found_key)
2542{
2543        int ret;
2544        struct btrfs_key key;
2545        struct extent_buffer *eb;
2546
2547        ASSERT(path);
2548        ASSERT(found_key);
2549
2550        key.type = key_type;
2551        key.objectid = iobjectid;
2552        key.offset = ioff;
2553
2554        ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2555        if (ret < 0)
2556                return ret;
2557
2558        eb = path->nodes[0];
2559        if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2560                ret = btrfs_next_leaf(fs_root, path);
2561                if (ret)
2562                        return ret;
2563                eb = path->nodes[0];
2564        }
2565
2566        btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2567        if (found_key->type != key.type ||
2568                        found_key->objectid != key.objectid)
2569                return 1;
2570
2571        return 0;
2572}
2573
/*
 * Grab the root node for the search described by @p and lock it as
 * required by @write_lock_level.
 *
 * Commit-root and skip_locking searches take no lock at all; otherwise
 * a read lock is preferred and only traded for a write lock when the
 * root's level is at or below @write_lock_level.  The returned buffer
 * is referenced and stored in the path at its level; callers are
 * responsible for dropping that reference.
 */
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int root_lock;
	int level = 0;

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	if (p->search_commit_root) {
		/* The commit roots are read only so we always do read locks */
		if (p->need_commit_sem)
			down_read(&fs_info->commit_root_sem);
		b = root->commit_root;
		extent_buffer_get(b);
		level = btrfs_header_level(b);
		if (p->need_commit_sem)
			up_read(&fs_info->commit_root_sem);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		b = btrfs_read_lock_root_node(root);
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}
2644
2645
/*
 * btrfs_search_slot - look for a key in a tree and perform necessary
 * modifications to preserve tree invariants.
 *
 * @trans:      Handle of transaction, used when modifying the tree
 * @p:          Holds all btree nodes along the search path
 * @root:       The root node of the tree
 * @key:        The key we are looking for
 * @ins_len:    Indicates purpose of search, for inserts it is 1, for
 *              deletions it's -1. 0 for plain searches
 * @cow:        boolean should CoW operations be performed. Must always be 1
 *              when modifying the tree.
 *
 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
 *
 * If @key is found, 0 is returned and you can find the item in the leaf level
 * of the path (level 0)
 *
 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
 * points to the slot where it should be inserted
 *
 * If an error is encountered while searching the tree a negative error number
 * is returned
 */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *key, struct btrfs_path *p,
		      int ins_len, int cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	BUG_ON(!cow && ins_len);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers  Make sure we keep write
		 * for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	if (!cow)
		write_lock_level = -1;

	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

again:
	/* prev_cmp == -1 forces a full binary search at the root level */
	prev_cmp = -1;
	b = btrfs_search_slot_get_root(root, p, write_lock_level);

	while (b) {
		level = btrfs_header_level(b);

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		if (cow) {
			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b)) {
				trans->dirty = true;
				goto cow_done;
			}

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			btrfs_set_path_blocking(p);
			if (last_level)
				err = btrfs_cow_block(trans, root, b, NULL, 0,
						      &b);
			else
				err = btrfs_cow_block(trans, root, b,
						      p->nodes[level + 1],
						      p->slots[level + 1], &b);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;
		/*
		 * Leave path with blocking locks to avoid massive
		 * lock context switch, this is made on purpose.
		 */

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		ret = key_search(b, key, level, &prev_cmp, &slot);
		if (ret < 0)
			goto done;

		if (level != 0) {
			int dec = 0;
			/*
			 * Key not found in a node: the slot to descend
			 * into is the one before the insertion point.
			 */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			err = setup_nodes_for_search(trans, root, p, b, level,
					     ins_len, &write_lock_level);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}
			b = p->nodes[level];
			slot = p->slots[level];

			/*
			 * slot 0 is special, if we change the key
			 * we have to update the parent pointer
			 * which means we must have a write lock
			 * on the parent
			 */
			if (slot == 0 && ins_len &&
			    write_lock_level < level + 1) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			unlock_up(p, level, lowest_unlock,
				  min_write_lock_level, &write_lock_level);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(root, p, &b, level,
						    slot, key);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			if (!p->skip_locking) {
				/*
				 * Lock the child we are descending into,
				 * trying the non-blocking variants first
				 * and only going blocking on contention.
				 */
				level = btrfs_header_level(b);
				if (level <= write_lock_level) {
					err = btrfs_try_tree_write_lock(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_lock(b);
					}
					p->locks[level] = BTRFS_WRITE_LOCK;
				} else {
					err = btrfs_tree_read_lock_atomic(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_read_lock(b);
					}
					p->locks[level] = BTRFS_READ_LOCK;
				}
				p->nodes[level] = b;
			}
		} else {
			/* Level 0: we reached the leaf */
			p->slots[level] = slot;
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(fs_info, b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, NULL);
			goto done;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);
	return ret;
}
2904
/*
 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
 * current state of the tree together with the operations recorded in the tree
 * modification log to search for the key in a previous version of this tree, as
 * denoted by the time_seq parameter.
 *
 * Naturally, there is no support for insert, delete or cow operations.
 *
 * The resulting path and return value will be set up as if we called
 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
 */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;
	int prev_cmp = -1;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	if (p->search_commit_root) {
		/* The commit root is immutable, no rewinding needed */
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = get_old_root(root, time_seq);
	if (!b) {
		ret = -EIO;
		goto done;
	}
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		level = btrfs_header_level(b);
		p->nodes[level] = b;

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		/*
		 * Since we can unwind ebs we want to do a real search every
		 * time.
		 */
		prev_cmp = -1;
		ret = key_search(b, key, level, &prev_cmp, &slot);

		if (level != 0) {
			int dec = 0;
			/* Not found in a node: descend into the prior slot */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(root, p, &b, level,
						    slot, key);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			/* Read lock the child, then rewind it to @time_seq */
			level = btrfs_header_level(b);
			err = btrfs_tree_read_lock_atomic(b);
			if (!err) {
				btrfs_set_path_blocking(p);
				btrfs_tree_read_lock(b);
			}
			b = tree_mod_log_rewind(fs_info, p, b, time_seq);
			/*
			 * NOTE(review): on rewind failure the read lock taken
			 * just above is not recorded in p->locks[level] here —
			 * verify tree_mod_log_rewind releases the original
			 * buffer's lock/ref on its failure paths.
			 */
			if (!b) {
				ret = -ENOMEM;
				goto done;
			}
			p->locks[level] = BTRFS_READ_LOCK;
			p->nodes[level] = b;
		} else {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}
	}
	ret = 1;
done:
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}
3017
3018/*
3019 * helper to use instead of search slot if no exact match is needed but
3020 * instead the next or previous item should be returned.
3021 * When find_higher is true, the next higher item is returned, the next lower
3022 * otherwise.
3023 * When return_any and find_higher are both true, and no higher item is found,
3024 * return the next lower instead.
3025 * When return_any is true and find_higher is false, and no lower item is found,
3026 * return the next higher instead.
3027 * It returns 0 if any item is found, 1 if none is found (tree empty), and
3028 * < 0 on error
3029 */
3030int btrfs_search_slot_for_read(struct btrfs_root *root,
3031                               const struct btrfs_key *key,
3032                               struct btrfs_path *p, int find_higher,
3033                               int return_any)
3034{
3035        int ret;
3036        struct extent_buffer *leaf;
3037
3038again:
3039        ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3040        if (ret <= 0)
3041                return ret;
3042        /*
3043         * a return value of 1 means the path is at the position where the
3044         * item should be inserted. Normally this is the next bigger item,
3045         * but in case the previous item is the last in a leaf, path points
3046         * to the first free slot in the previous leaf, i.e. at an invalid
3047         * item.
3048         */
3049        leaf = p->nodes[0];
3050
3051        if (find_higher) {
3052                if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3053                        ret = btrfs_next_leaf(root, p);
3054                        if (ret <= 0)
3055                                return ret;
3056                        if (!return_any)
3057                                return 1;
3058                        /*
3059                         * no higher item found, return the next
3060                         * lower instead
3061                         */
3062                        return_any = 0;
3063                        find_higher = 0;
3064                        btrfs_release_path(p);
3065                        goto again;
3066                }
3067        } else {
3068                if (p->slots[0] == 0) {
3069                        ret = btrfs_prev_leaf(root, p);
3070                        if (ret < 0)
3071                                return ret;
3072                        if (!ret) {
3073                                leaf = p->nodes[0];
3074                                if (p->slots[0] == btrfs_header_nritems(leaf))
3075                                        p->slots[0]--;
3076                                return 0;
3077                        }
3078                        if (!return_any)
3079                                return 1;
3080                        /*
3081                         * no lower item found, return the next
3082                         * higher instead
3083                         */
3084                        return_any = 0;
3085                        find_higher = 1;
3086                        btrfs_release_path(p);
3087                        goto again;
3088                } else {
3089                        --p->slots[0];
3090                }
3091        }
3092        return 0;
3093}
3094
3095/*
3096 * adjust the pointers going up the tree, starting at level
3097 * making sure the right key of each node is points to 'key'.
3098 * This is used after shifting pointers to the left, so it stops
3099 * fixing up pointers when a given leaf/node is not in slot 0 of the
3100 * higher levels
3101 *
3102 */
3103static void fixup_low_keys(struct btrfs_path *path,
3104                           struct btrfs_disk_key *key, int level)
3105{
3106        int i;
3107        struct extent_buffer *t;
3108        int ret;
3109
3110        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3111                int tslot = path->slots[i];
3112
3113                if (!path->nodes[i])
3114                        break;
3115                t = path->nodes[i];
3116                ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3117                                GFP_ATOMIC);
3118                BUG_ON(ret < 0);
3119                btrfs_set_node_key(t, key, tslot);
3120                btrfs_mark_buffer_dirty(path->nodes[i]);
3121                if (tslot != 0)
3122                        break;
3123        }
3124}
3125
3126/*
3127 * update item key.
3128 *
3129 * This function isn't completely safe. It's the caller's responsibility
3130 * that the new key won't break the order
3131 */
3132void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3133                             struct btrfs_path *path,
3134                             const struct btrfs_key *new_key)
3135{
3136        struct btrfs_disk_key disk_key;
3137        struct extent_buffer *eb;
3138        int slot;
3139
3140        eb = path->nodes[0];
3141        slot = path->slots[0];
3142        if (slot > 0) {
3143                btrfs_item_key(eb, &disk_key, slot - 1);
3144                BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3145        }
3146        if (slot < btrfs_header_nritems(eb) - 1) {
3147                btrfs_item_key(eb, &disk_key, slot + 1);
3148                BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3149        }
3150
3151        btrfs_cpu_key_to_disk(&disk_key, new_key);
3152        btrfs_set_item_key(eb, &disk_key, slot);
3153        btrfs_mark_buffer_dirty(eb);
3154        if (slot == 0)
3155                fixup_low_keys(path, &disk_key, 1);
3156}
3157
/*
 * try to push data from one node into the next node left in the
 * tree.
 *
 * Both @dst and @src must already be CoWed in the current transaction
 * and write locked by the caller.
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
 */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* Free slots left in the destination node */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	/* Not worth pushing from an almost-empty node unless emptying it */
	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/* Record the cross-buffer key move before touching the buffers */
	ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0,
				   push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* Append the first push_items key pointers of src onto dst */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * Don't call tree_mod_log_insert_move here, key removal was
		 * already fully logged by tree_mod_log_eb_copy above.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3230
/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will only push up to 1/2 the contents of the left node over
 */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	/* both buffers must already be CoWed in the current transaction */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* number of free key-pointer slots in the destination node */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	if (push_items <= 0)
		return 1;

	/* not worth balancing a nearly empty source node */
	if (src_nritems < 4)
		return 1;

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* log the shift of dst's existing pointers before moving them */
	ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
	BUG_ON(ret < 0);
	/* slide dst's current pointers right to open room at slot 0 */
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));

	/* log removal of src's tail, then copy it into the front of dst */
	ret = tree_mod_log_eb_copy(fs_info, dst, src, 0,
				   src_nritems - push_items, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3297
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
 *
 * returns zero on success or < 0 on failure.
 *
 * On success path->nodes[level] points at the new root (with an extra
 * reference taken for the path) and is flagged write-lock-blocking.
 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;
	int ret;

	/* the caller must be at the current top of the tree */
	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/* first key of the old root becomes the single key in the new root */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				   &lower_key, level, root->node->start, 0);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, fs_info->nodesize);

	/* new root holds exactly one pointer: the old root block */
	btrfs_set_header_nritems(c, 1);
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	/* old root should have been CoWed in this transaction already */
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	/* log the root replacement, then publish it for RCU readers */
	old = root->node;
	ret = tree_mod_log_insert_root(root->node, c, 0);
	BUG_ON(ret < 0);
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* extra ref for the path's pointer to the new root */
	extent_buffer_get(c);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
	path->slots[level] = 0;
	return 0;
}
3358
/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
 *
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
 *
 * The caller must hold a write lock on path->nodes[level] and have
 * verified there is a free slot; violations trip the BUG_ONs below.
 */
static void insert_ptr(struct btrfs_trans_handle *trans,
		       struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		       struct btrfs_disk_key *key, u64 bytenr,
		       int slot, int level)
{
	struct extent_buffer *lower;
	int nritems;
	int ret;

	BUG_ON(!path->nodes[level]);
	btrfs_assert_tree_locked(path->nodes[level]);
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
	BUG_ON(slot > nritems);
	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
	if (slot != nritems) {
		/* shift existing pointers right; log the move for levels > 0 */
		if (level) {
			ret = tree_mod_log_insert_move(lower, slot + 1, slot,
					nritems - slot);
			BUG_ON(ret < 0);
		}
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
	}
	if (level) {
		ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
				GFP_NOFS);
		BUG_ON(ret < 0);
	}
	btrfs_set_node_key(lower, key, slot);
	btrfs_set_node_blockptr(lower, slot, bytenr);
	WARN_ON(trans->transid == 0);
	/* pointer generation is the transaction that created the child */
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
}
3404
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
 *
 * returns 0 on success and < 0 on failure
 */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, lets make a new one
		 *
		 * tree mod log: We don't log_removal old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with tree_mod_log_eb_copy. We're holding a
		 * tree lock on the buffer, which is why we cannot race with
		 * other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/* try to avoid the split by balancing into siblings first */
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	/* split at the midpoint; upper half moves to the new node */
	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
			&disk_key, level, c->start, 0);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, fs_info->nodesize);
	ASSERT(btrfs_header_level(c) == level);

	/* log the key removals before the pointers physically move */
	ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* link the new node into the parent */
	insert_ptr(trans, fs_info, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	/* point the path at whichever half now holds our slot */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}
3495
3496/*
3497 * how many bytes are required to store the items in a leaf.  start
3498 * and nr indicate which items in the leaf to check.  This totals up the
3499 * space used both by the item structs and the item data
3500 */
3501static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3502{
3503        struct btrfs_item *start_item;
3504        struct btrfs_item *end_item;
3505        struct btrfs_map_token token;
3506        int data_len;
3507        int nritems = btrfs_header_nritems(l);
3508        int end = min(nritems, start + nr) - 1;
3509
3510        if (!nr)
3511                return 0;
3512        btrfs_init_map_token(&token);
3513        start_item = btrfs_item_nr(start);
3514        end_item = btrfs_item_nr(end);
3515        data_len = btrfs_token_item_offset(l, start_item, &token) +
3516                btrfs_token_item_size(l, start_item, &token);
3517        data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3518        data_len += sizeof(struct btrfs_item) * nr;
3519        WARN_ON(data_len < 0);
3520        return data_len;
3521}
3522
3523/*
3524 * The space between the end of the leaf items and
3525 * the start of the leaf data.  IOW, how much room
3526 * the leaf has left for both items and data
3527 */
3528noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
3529                                   struct extent_buffer *leaf)
3530{
3531        int nritems = btrfs_header_nritems(leaf);
3532        int ret;
3533
3534        ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3535        if (ret < 0) {
3536                btrfs_crit(fs_info,
3537                           "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3538                           ret,
3539                           (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3540                           leaf_space_used(leaf, 0, nritems), nritems);
3541        }
3542        return ret;
3543}
3544
/*
 * min slot controls the lowest index we're willing to push to the
 * right.  We'll push up to and including min_slot, but no lower
 *
 * Walks the left leaf from its tail toward min_slot collecting items that
 * fit in @right (which must already be locked and CoWed by the caller),
 * then moves the item data and headers over in one shot and fixes up the
 * parent key and the path.  Returns 0 if anything was pushed, 1 if not
 * (right is unlocked and released in that case).
 */
static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
				      struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	btrfs_init_map_token(&token);

	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/* insertion lands past the leaf's end: reserve room for it too */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	/* scan from the last item down to nr, accumulating what fits */
	i = left_nritems - 1;
	while (i >= nr) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				/* stop once pushing no longer helps the insert */
				int space = btrfs_leaf_free_space(fs_info, left);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		/* i is unsigned, guard before decrementing past zero */
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* total data bytes of the items being moved */
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(fs_info, left);

	/* make room in the right data area */
	data_end = leaf_data_end(fs_info, right);
	memmove_extent_buffer(right,
			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
		     push_space);

	/* slide right's item headers over to open slots at the front */
	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);
		push_space -= btrfs_token_item_size(right, item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		clean_tree_block(fs_info, left);

	btrfs_mark_buffer_dirty(right);

	/* right's first key changed: update the parent's pointer key */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(fs_info, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3686
/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
 *
 * this will push starting from min_slot to the end of the leaf.  It won't
 * push any slot lower than min_slot
 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	/* no parent: this leaf is the root, there is no right sibling */
	if (!path->nodes[1])
		return 1;

	slot = path->slots[1];
	upper = path->nodes[1];
	/* already the last child of the parent: nothing to our right */
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	right = read_node_slot(fs_info, upper, slot + 1);
	/*
	 * slot + 1 is not valid or we fail to read the right node,
	 * no big deal, just return.
	 */
	if (IS_ERR(right))
		return 1;

	btrfs_tree_lock(right);
	btrfs_set_lock_blocking(right);

	/* quick check before paying for the CoW */
	free_space = btrfs_leaf_free_space(fs_info, right);
	if (free_space < data_size)
		goto out_unlock;

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right);
	if (ret)
		goto out_unlock;

	free_space = btrfs_leaf_free_space(fs_info, right);
	if (free_space < data_size)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

	if (path->slots[0] == left_nritems && !empty) {
		/* Key greater than all keys in the leaf, right neighbor has
		 * enough room for it and we're not emptying our leaf to delete
		 * it, therefore use right neighbor to insert the new item and
		 * no need to touch/dirty our left leaf. */
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->nodes[0] = right;
		path->slots[0] = 0;
		path->slots[1]++;
		return 0;
	}

	/* __push_leaf_right owns the lock/ref on right from here on */
	return __push_leaf_right(fs_info, path, min_data_size, empty,
				right, free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3770
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
 * items
 *
 * @left must already be locked and CoWed by the caller; on return its
 * lock/ref has been consumed (either handed to the path or released).
 */
static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
				     struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	/* when not emptying, always keep at least the last item in right */
	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	/* scan from the front of right, accumulating what fits in left */
	for (i = 0; i < nr; i++) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				/* stop once pushing no longer helps the insert */
				int space = btrfs_leaf_free_space(fs_info, right);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	WARN_ON(!empty && push_items == btrfs_header_nritems(right));

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	/* total data bytes of the items being moved */
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
		     btrfs_item_offset_nr(right, push_items - 1);

	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(fs_info, left) - push_space,
		     BTRFS_LEAF_DATA_OFFSET +
		     btrfs_item_offset_nr(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	/* rebase the copied items' data offsets into left's data area */
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(left, item, &token);
		btrfs_set_token_item_offset(left, item,
		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
		      &token);
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems)
		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);

	if (push_items < right_nritems) {
		/* compact right: move remaining data to the end of the leaf */
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(fs_info, right);
		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
				      BTRFS_LEAF_DATA_OFFSET +
				      leaf_data_end(fs_info, right), push_space);

		/* and slide the remaining item headers to slot 0 */
		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);

		push_space = push_space - btrfs_token_item_size(right,
								item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		clean_tree_block(fs_info, right);

	/* right's first key changed: propagate it up the path */
	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
3922
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
 * items
 */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
			  *root, struct btrfs_path *path, int min_data_size,
			  int data_size, int empty, u32 max_slot)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
	int slot;
	int free_space;
	u32 right_nritems;
	int ret = 0;

	/* first child of the parent: no left sibling to push into */
	slot = path->slots[1];
	if (slot == 0)
		return 1;
	/* no parent: this leaf is the root */
	if (!path->nodes[1])
		return 1;

	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	left = read_node_slot(fs_info, path->nodes[1], slot - 1);
	/*
	 * slot - 1 is not valid or we fail to read the left node,
	 * no big deal, just return.
	 */
	if (IS_ERR(left))
		return 1;

	btrfs_tree_lock(left);
	btrfs_set_lock_blocking(left);

	/* quick check before paying for the CoW */
	free_space = btrfs_leaf_free_space(fs_info, left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, left,
			      path->nodes[1], slot - 1, &left);
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
		if (ret == -ENOSPC)
			ret = 1;
		goto out;
	}

	free_space = btrfs_leaf_free_space(fs_info, left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* __push_leaf_left owns the lock/ref on left from here on */
	return __push_leaf_left(fs_info, path, min_data_size,
			       empty, left, free_space, right_nritems,
			       max_slot);
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
3996
/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 *
 * Copies items [mid, nritems) of leaf 'l' into the freshly allocated leaf
 * 'right', links 'right' into the parent node after 'l', and repoints the
 * path at whichever of the two leaves now holds 'slot'.
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_fs_info *fs_info,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	/* cached mapping token to speed up repeated item offset updates */
	btrfs_init_map_token(&token);

	/* 'right' receives the items from mid to the end of 'l' */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l);

	/* copy the item headers */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* copy the item data, packed at the end of the data area */
	copy_extent_buffer(right, l,
		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(fs_info, l), data_copy_size);

	/*
	 * data in 'right' sits closer to the end of the leaf than it did in
	 * 'l'; every copied item offset must be bumped by this delta
	 */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);

	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(i);
		u32 ioff;

		ioff = btrfs_token_item_offset(right, item, &token);
		btrfs_set_token_item_offset(right, item,
					    ioff + rt_data_off, &token);
	}

	/* truncate 'l' and insert a pointer to 'right' into the parent */
	btrfs_set_header_nritems(l, mid);
	btrfs_item_key(right, &disk_key, 0);
	insert_ptr(trans, fs_info, path, &disk_key, right->start,
		   path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/* repoint the path at whichever leaf now contains our slot */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}
4062
4063/*
4064 * double splits happen when we need to insert a big item in the middle
4065 * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4066 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4067 *          A                 B                 C
4068 *
4069 * We avoid this by trying to push the items on either side of our target
4070 * into the adjacent leaves.  If all goes well we can avoid the double split
4071 * completely.
4072 */
4073static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4074                                          struct btrfs_root *root,
4075                                          struct btrfs_path *path,
4076                                          int data_size)
4077{
4078        struct btrfs_fs_info *fs_info = root->fs_info;
4079        int ret;
4080        int progress = 0;
4081        int slot;
4082        u32 nritems;
4083        int space_needed = data_size;
4084
4085        slot = path->slots[0];
4086        if (slot < btrfs_header_nritems(path->nodes[0]))
4087                space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4088
4089        /*
4090         * try to push all the items after our slot into the
4091         * right leaf
4092         */
4093        ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4094        if (ret < 0)
4095                return ret;
4096
4097        if (ret == 0)
4098                progress++;
4099
4100        nritems = btrfs_header_nritems(path->nodes[0]);
4101        /*
4102         * our goal is to get our slot at the start or end of a leaf.  If
4103         * we've done so we're done
4104         */
4105        if (path->slots[0] == 0 || path->slots[0] == nritems)
4106                return 0;
4107
4108        if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4109                return 0;
4110
4111        /* try to push all the items before our slot into the next leaf */
4112        slot = path->slots[0];
4113        space_needed = data_size;
4114        if (slot > 0)
4115                space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4116        ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4117        if (ret < 0)
4118                return ret;
4119
4120        if (ret == 0)
4121                progress++;
4122
4123        if (progress)
4124                return 0;
4125        return 1;
4126}
4127
4128/*
4129 * split the path's leaf in two, making sure there is at least data_size
4130 * available for the resulting leaf level of the path.
4131 *
4132 * returns 0 if all went well and < 0 on failure.
4133 */
4134static noinline int split_leaf(struct btrfs_trans_handle *trans,
4135                               struct btrfs_root *root,
4136                               const struct btrfs_key *ins_key,
4137                               struct btrfs_path *path, int data_size,
4138                               int extend)
4139{
4140        struct btrfs_disk_key disk_key;
4141        struct extent_buffer *l;
4142        u32 nritems;
4143        int mid;
4144        int slot;
4145        struct extent_buffer *right;
4146        struct btrfs_fs_info *fs_info = root->fs_info;
4147        int ret = 0;
4148        int wret;
4149        int split;
4150        int num_doubles = 0;
4151        int tried_avoid_double = 0;
4152
4153        l = path->nodes[0];
4154        slot = path->slots[0];
4155        if (extend && data_size + btrfs_item_size_nr(l, slot) +
4156            sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
4157                return -EOVERFLOW;
4158
4159        /* first try to make some room by pushing left and right */
4160        if (data_size && path->nodes[1]) {
4161                int space_needed = data_size;
4162
4163                if (slot < btrfs_header_nritems(l))
4164                        space_needed -= btrfs_leaf_free_space(fs_info, l);
4165
4166                wret = push_leaf_right(trans, root, path, space_needed,
4167                                       space_needed, 0, 0);
4168                if (wret < 0)
4169                        return wret;
4170                if (wret) {
4171                        space_needed = data_size;
4172                        if (slot > 0)
4173                                space_needed -= btrfs_leaf_free_space(fs_info,
4174                                                                      l);
4175                        wret = push_leaf_left(trans, root, path, space_needed,
4176                                              space_needed, 0, (u32)-1);
4177                        if (wret < 0)
4178                                return wret;
4179                }
4180                l = path->nodes[0];
4181
4182                /* did the pushes work? */
4183                if (btrfs_leaf_free_space(fs_info, l) >= data_size)
4184                        return 0;
4185        }
4186
4187        if (!path->nodes[1]) {
4188                ret = insert_new_root(trans, root, path, 1);
4189                if (ret)
4190                        return ret;
4191        }
4192again:
4193        split = 1;
4194        l = path->nodes[0];
4195        slot = path->slots[0];
4196        nritems = btrfs_header_nritems(l);
4197        mid = (nritems + 1) / 2;
4198
4199        if (mid <= slot) {
4200                if (nritems == 1 ||
4201                    leaf_space_used(l, mid, nritems - mid) + data_size >
4202                        BTRFS_LEAF_DATA_SIZE(fs_info)) {
4203                        if (slot >= nritems) {
4204                                split = 0;
4205                        } else {
4206                                mid = slot;
4207                                if (mid != nritems &&
4208                                    leaf_space_used(l, mid, nritems - mid) +
4209                                    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4210                                        if (data_size && !tried_avoid_double)
4211                                                goto push_for_double;
4212                                        split = 2;
4213                                }
4214                        }
4215                }
4216        } else {
4217                if (leaf_space_used(l, 0, mid) + data_size >
4218                        BTRFS_LEAF_DATA_SIZE(fs_info)) {
4219                        if (!extend && data_size && slot == 0) {
4220                                split = 0;
4221                        } else if ((extend || !data_size) && slot == 0) {
4222                                mid = 1;
4223                        } else {
4224                                mid = slot;
4225                                if (mid != nritems &&
4226                                    leaf_space_used(l, mid, nritems - mid) +
4227                                    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4228                                        if (data_size && !tried_avoid_double)
4229                                                goto push_for_double;
4230                                        split = 2;
4231                                }
4232                        }
4233                }
4234        }
4235
4236        if (split == 0)
4237                btrfs_cpu_key_to_disk(&disk_key, ins_key);
4238        else
4239                btrfs_item_key(l, &disk_key, mid);
4240
4241        right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
4242                        &disk_key, 0, l->start, 0);
4243        if (IS_ERR(right))
4244                return PTR_ERR(right);
4245
4246        root_add_used(root, fs_info->nodesize);
4247
4248        if (split == 0) {
4249                if (mid <= slot) {
4250                        btrfs_set_header_nritems(right, 0);
4251                        insert_ptr(trans, fs_info, path, &disk_key,
4252                                   right->start, path->slots[1] + 1, 1);
4253                        btrfs_tree_unlock(path->nodes[0]);
4254                        free_extent_buffer(path->nodes[0]);
4255                        path->nodes[0] = right;
4256                        path->slots[0] = 0;
4257                        path->slots[1] += 1;
4258                } else {
4259                        btrfs_set_header_nritems(right, 0);
4260                        insert_ptr(trans, fs_info, path, &disk_key,
4261                                   right->start, path->slots[1], 1);
4262                        btrfs_tree_unlock(path->nodes[0]);
4263                        free_extent_buffer(path->nodes[0]);
4264                        path->nodes[0] = right;
4265                        path->slots[0] = 0;
4266                        if (path->slots[1] == 0)
4267                                fixup_low_keys(path, &disk_key, 1);
4268                }
4269                /*
4270                 * We create a new leaf 'right' for the required ins_len and
4271                 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
4272                 * the content of ins_len to 'right'.
4273                 */
4274                return ret;
4275        }
4276
4277        copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems);
4278
4279        if (split == 2) {
4280                BUG_ON(num_doubles != 0);
4281                num_doubles++;
4282                goto again;
4283        }
4284
4285        return 0;
4286
4287push_for_double:
4288        push_for_double_split(trans, root, path, data_size);
4289        tried_avoid_double = 1;
4290        if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4291                return 0;
4292        goto again;
4293}
4294
/*
 * Make sure the leaf at path->slots[0] has at least ins_len bytes free,
 * splitting it if necessary while keeping the target item in the leaf.
 *
 * Returns 0 on success with the path locked and pointing at the item,
 * -EAGAIN if the item changed under us after the path was re-searched
 * (the caller must retry), or another negative error.
 */
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path, int ins_len)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	u64 extent_len = 0;
	u32 item_size;
	int ret;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	/* only extent data and csum items are ever split in place */
	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
	       key.type != BTRFS_EXTENT_CSUM_KEY);

	if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len)
		return 0;

	/*
	 * remember the item's size (and extent length for file extents) so
	 * we can detect if it changed while the path was dropped
	 */
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
	}
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->search_for_split = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	path->search_for_split = 0;
	/* the item we had is gone; let the caller retry */
	if (ret > 0)
		ret = -EAGAIN;
	if (ret < 0)
		goto err;

	ret = -EAGAIN;
	leaf = path->nodes[0];
	/* if our item isn't there, return now */
	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
		goto err;

	/* the leaf has  changed, it now has room.  return now */
	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	btrfs_set_path_blocking(path);
	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}
4362
/*
 * Worker for btrfs_split_item(): splits the item at path->slots[0] in two
 * at split_offset bytes, giving 'new_key' to the second half.  The caller
 * guarantees the leaf has room for one more struct btrfs_item.
 */
static noinline int split_item(struct btrfs_fs_info *fs_info,
			       struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* stage the original item data so we can rewrite it in two pieces */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	/* the new item goes right after the original one */
	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(slot);

	/* second half: starts at the original data offset */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	/* first half: keeps the original key, shrinks to split_offset bytes */
	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0);
	kfree(buf);
	return 0;
}
4432
4433/*
4434 * This function splits a single item into two items,
4435 * giving 'new_key' to the new item and splitting the
4436 * old one at split_offset (from the start of the item).
4437 *
4438 * The path may be released by this operation.  After
4439 * the split, the path is pointing to the old item.  The
4440 * new item is going to be in the same node as the old one.
4441 *
4442 * Note, the item being split must be smaller enough to live alone on
4443 * a tree block with room for one extra struct btrfs_item
4444 *
4445 * This allows us to split the item in place, keeping a lock on the
4446 * leaf the entire time.
4447 */
4448int btrfs_split_item(struct btrfs_trans_handle *trans,
4449                     struct btrfs_root *root,
4450                     struct btrfs_path *path,
4451                     const struct btrfs_key *new_key,
4452                     unsigned long split_offset)
4453{
4454        int ret;
4455        ret = setup_leaf_for_split(trans, root, path,
4456                                   sizeof(struct btrfs_item));
4457        if (ret)
4458                return ret;
4459
4460        ret = split_item(root->fs_info, path, new_key, split_offset);
4461        return ret;
4462}
4463
4464/*
4465 * This function duplicate a item, giving 'new_key' to the new item.
4466 * It guarantees both items live in the same tree leaf and the new item
4467 * is contiguous with the original item.
4468 *
4469 * This allows us to split file extent in place, keeping a lock on the
4470 * leaf the entire time.
4471 */
4472int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4473                         struct btrfs_root *root,
4474                         struct btrfs_path *path,
4475                         const struct btrfs_key *new_key)
4476{
4477        struct extent_buffer *leaf;
4478        int ret;
4479        u32 item_size;
4480
4481        leaf = path->nodes[0];
4482        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4483        ret = setup_leaf_for_split(trans, root, path,
4484                                   item_size + sizeof(struct btrfs_item));
4485        if (ret)
4486                return ret;
4487
4488        path->slots[0]++;
4489        setup_items_for_insert(root, path, new_key, &item_size,
4490                               item_size, item_size +
4491                               sizeof(struct btrfs_item), 1);
4492        leaf = path->nodes[0];
4493        memcpy_extent_buffer(leaf,
4494                             btrfs_item_ptr_offset(leaf, path->slots[0]),
4495                             btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4496                             item_size);
4497        return 0;
4498}
4499
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
			 struct btrfs_path *path, u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	/* cached mapping token to speed up repeated item offset updates */
	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(fs_info, leaf);

	old_data_start = btrfs_item_offset_nr(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff + size_diff, &token);
	}

	/* shift the data */
	if (from_end) {
		/* dropping the tail: slide the data up by size_diff bytes */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				/*
				 * inline extents keep a fixed header at the
				 * start of the item; move it to the new item
				 * start before the data is shifted
				 */
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start - data_end);

		/* chopping the front moves the key offset forward too */
		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(path, &disk_key, 1);
	}

	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4601
/*
 * make the item pointed to by the path bigger, data_size is the added size.
 * The caller must have checked there is enough free space; this BUGs if not.
 */
void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		       u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	/* cached mapping token to speed up repeated item offset updates */
	btrfs_init_map_token(&token);

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(fs_info, leaf);

	if (btrfs_leaf_free_space(fs_info, leaf) < data_size) {
		btrfs_print_leaf(leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_end_nr(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "slot %d too large, nritems %d",
			   slot, nritems);
		BUG_ON(1);
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff - data_size, &token);
	}

	/* shift the data down to open up data_size bytes for this item */
	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
		      data_end, old_data - data_end);

	data_end = old_data;
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4669
/*
 * this is a helper for btrfs_insert_empty_items, the main goal here is
 * to save stack depth by doing the bulk of the work in a function
 * that doesn't call btrfs_search_slot
 *
 * Inserts 'nr' empty items of sizes data_size[0..nr-1] at path->slots[0],
 * giving them the keys cpu_key[0..nr-1].  total_data is the sum of the
 * data sizes, total_size additionally includes the item headers.  The
 * leaf must already have total_size bytes free.
 */
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
			    const struct btrfs_key *cpu_key, u32 *data_size,
			    u32 total_data, u32 total_size, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_item *item;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;
	struct btrfs_map_token token;

	/* inserting at slot 0 changes the leaf's lowest key, fix parents */
	if (path->slots[0] == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
		fixup_low_keys(path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(fs_info, leaf);

	if (btrfs_leaf_free_space(fs_info, leaf) < total_size) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "not enough freespace need %u have %d",
			   total_size, btrfs_leaf_free_space(fs_info, leaf));
		BUG();
	}

	/* inserting in the middle: make room in the headers and the data */
	if (slot != nritems) {
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);

		if (old_data < data_end) {
			btrfs_print_leaf(leaf);
			btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
				   slot, old_data, data_end);
			BUG_ON(1);
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(leaf, item, &token);
			btrfs_set_token_item_offset(leaf, item,
						    ioff - total_data, &token);
		}
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data - data_end);
		data_end = old_data;
	}

	/* setup the item for the new data */
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(slot + i);
		btrfs_set_token_item_offset(leaf, item,
					    data_end - data_size[i], &token);
		data_end -= data_size[i];
		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
	}

	btrfs_set_header_nritems(leaf, nritems + nr);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4762
4763/*
4764 * Given a key and some data, insert items into the tree.
4765 * This does all the path init required, making room in the tree if needed.
4766 */
4767int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4768                            struct btrfs_root *root,
4769                            struct btrfs_path *path,
4770                            const struct btrfs_key *cpu_key, u32 *data_size,
4771                            int nr)
4772{
4773        int ret = 0;
4774        int slot;
4775        int i;
4776        u32 total_size = 0;
4777        u32 total_data = 0;
4778
4779        for (i = 0; i < nr; i++)
4780                total_data += data_size[i];
4781
4782        total_size = total_data + (nr * sizeof(struct btrfs_item));
4783        ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4784        if (ret == 0)
4785                return -EEXIST;
4786        if (ret < 0)
4787                return ret;
4788
4789        slot = path->slots[0];
4790        BUG_ON(slot < 0);
4791
4792        setup_items_for_insert(root, path, cpu_key, data_size,
4793                               total_data, total_size, nr);
4794        return 0;
4795}
4796
4797/*
4798 * Given a key and some data, insert an item into the tree.
4799 * This does all the path init required, making room in the tree if needed.
4800 */
4801int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4802                      const struct btrfs_key *cpu_key, void *data,
4803                      u32 data_size)
4804{
4805        int ret = 0;
4806        struct btrfs_path *path;
4807        struct extent_buffer *leaf;
4808        unsigned long ptr;
4809
4810        path = btrfs_alloc_path();
4811        if (!path)
4812                return -ENOMEM;
4813        ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4814        if (!ret) {
4815                leaf = path->nodes[0];
4816                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4817                write_extent_buffer(leaf, data, ptr, data_size);
4818                btrfs_mark_buffer_dirty(leaf);
4819        }
4820        btrfs_free_path(path);
4821        return ret;
4822}
4823
4824/*
4825 * delete the pointer from a given node.
4826 *
4827 * the tree should have been previously balanced so the deletion does not
4828 * empty a node.
4829 */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		/*
		 * Removing an interior slot: record the move in the tree
		 * mod log (nodes only, level > 0) and shift the key
		 * pointers after @slot one position to the left.
		 */
		if (level) {
			ret = tree_mod_log_insert_move(parent, slot, slot + 1,
					nritems - slot - 1);
			BUG_ON(ret < 0);
		}
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		/* last slot: nothing to shift, just log the key removal */
		ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
				GFP_NOFS);
		BUG_ON(ret < 0);
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		/* the node's smallest key changed; propagate it upwards */
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(parent);
}
4869
4870/*
4871 * a helper function to delete the leaf pointed to by path->slots[1] and
4872 * path->nodes[1].
4873 *
4874 * This deletes the pointer in path->nodes[1] and frees the leaf
4875 * block extent.  zero is returned if it all worked out, < 0 otherwise.
4876 *
4877 * The path must have already been setup for deleting the leaf, including
4878 * all the proper balancing.  path->nodes[1] must be locked.
4879 */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *leaf)
{
	/* the leaf must have been COWed in the running transaction */
	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	del_ptr(root, path, 1, path->slots[1]);

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	/*
	 * Take an extra ref so the buffer stays valid while we free the
	 * block, then drop it and mark the buffer stale.
	 */
	extent_buffer_get(leaf);
	btrfs_free_tree_block(trans, root, leaf, 0, 1);
	free_extent_buffer_stale(leaf);
}
4900/*
4901 * delete the item at the leaf level in path.  If that empties
4902 * the leaf, remove it from the tree
4903 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 last_off;
	u32 dsize = 0;		/* total data bytes of the items we remove */
	int ret = 0;
	int wret;
	int i;
	u32 nritems;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

	nritems = btrfs_header_nritems(leaf);

	if (slot + nr != nritems) {
		int data_end = leaf_data_end(fs_info, leaf);

		/*
		 * Not deleting from the tail: close the hole in the data
		 * area by sliding the remaining data towards the end of
		 * the leaf, ...
		 */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + dsize,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      last_off - data_end);

		/* ... fix up the data offsets of the items we kept, ... */
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(leaf, item, &token);
			btrfs_set_token_item_offset(leaf, item,
						    ioff + dsize, &token);
		}

		/* ... and close the gap in the item header array. */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
			      btrfs_item_nr_offset(slot + nr),
			      sizeof(struct btrfs_item) *
			      (nritems - slot - nr));
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_set_path_blocking(path);
			clean_tree_block(fs_info, leaf);
			btrfs_del_leaf(trans, root, path, leaf);
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);
		if (slot == 0) {
			/* smallest key in the leaf changed, fix parents */
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(path, &disk_key, 1);
		}

		/* delete the leaf if it is mostly empty */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
			slot = path->slots[1];
			extent_buffer_get(leaf);

			btrfs_set_path_blocking(path);
			wret = push_leaf_left(trans, root, path, 1, 1,
					      1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				wret = push_leaf_right(trans, root, path, 1,
						       1, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				/* pushing emptied the leaf; remove it */
				path->slots[1] = slot;
				btrfs_del_leaf(trans, root, path, leaf);
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(leaf);
		}
	}
	return ret;
}
5015
5016/*
5017 * search the tree again to find a leaf with lesser keys
5018 * returns 0 if it found something or 1 if there are no lesser leaves.
5019 * returns < 0 on io errors.
5020 *
5021 * This may release the path, and so you may lose any locks held at the
5022 * time you call it.
5023 */
5024int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5025{
5026        struct btrfs_key key;
5027        struct btrfs_disk_key found_key;
5028        int ret;
5029
5030        btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5031
5032        if (key.offset > 0) {
5033                key.offset--;
5034        } else if (key.type > 0) {
5035                key.type--;
5036                key.offset = (u64)-1;
5037        } else if (key.objectid > 0) {
5038                key.objectid--;
5039                key.type = (u8)-1;
5040                key.offset = (u64)-1;
5041        } else {
5042                return 1;
5043        }
5044
5045        btrfs_release_path(path);
5046        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5047        if (ret < 0)
5048                return ret;
5049        btrfs_item_key(path->nodes[0], &found_key, 0);
5050        ret = comp_keys(&found_key, &key);
5051        /*
5052         * We might have had an item with the previous key in the tree right
5053         * before we released our path. And after we released our path, that
5054         * item might have been pushed to the first slot (0) of the leaf we
5055         * were holding due to a tree balance. Alternatively, an item with the
5056         * previous key can exist as the only element of a leaf (big fat item).
5057         * Therefore account for these 2 cases, so that our callers (like
5058         * btrfs_previous_item) don't miss an existing item with a key matching
5059         * the previous key we computed above.
5060         */
5061        if (ret <= 0)
5062                return 0;
5063        return 1;
5064}
5065
5066/*
5067 * A helper function to walk down the tree starting at min_key, and looking
5068 * for nodes or leaves that are have a minimum transaction id.
5069 * This is used by the btree defrag code, and tree logging
5070 *
5071 * This does not cow, but it does stuff the starting key it finds back
5072 * into min_key, so you can call btrfs_search_slot with cow=1 on the
5073 * key and get a writable path.
5074 *
5075 * This honors path->lowest_level to prevent descent past a given level
5076 * of the tree.
5077 *
5078 * min_trans indicates the oldest transaction that you are interested
5079 * in walking through.  Any nodes or leaves older than min_trans are
5080 * skipped over (without reading them).
5081 *
5082 * returns zero if something useful was found, < 0 on error and 1 if there
5083 * was nothing in the tree that matched the search criteria.
5084 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	/* remember the caller's keep_locks setting so we can restore it */
	int keep_locks = path->keep_locks;

	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	/* the whole tree is older than min_trans, nothing to find */
	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, min_key, level, &slot);

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameters.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			btrfs_set_path_blocking(path);
			sret = btrfs_find_next_key(root, path, min_key, level,
						  min_trans);
			if (sret == 0) {
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		/* descend one level, read-locking the child */
		btrfs_set_path_blocking(path);
		cur = read_node_slot(fs_info, cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		btrfs_tree_read_lock(cur);

		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		btrfs_set_path_blocking(path);
		/* hand the key we found back to the caller */
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}
5187
5188static int tree_move_down(struct btrfs_fs_info *fs_info,
5189                           struct btrfs_path *path,
5190                           int *level)
5191{
5192        struct extent_buffer *eb;
5193
5194        BUG_ON(*level == 0);
5195        eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]);
5196        if (IS_ERR(eb))
5197                return PTR_ERR(eb);
5198
5199        path->nodes[*level - 1] = eb;
5200        path->slots[*level - 1] = 0;
5201        (*level)--;
5202        return 0;
5203}
5204
5205static int tree_move_next_or_upnext(struct btrfs_path *path,
5206                                    int *level, int root_level)
5207{
5208        int ret = 0;
5209        int nritems;
5210        nritems = btrfs_header_nritems(path->nodes[*level]);
5211
5212        path->slots[*level]++;
5213
5214        while (path->slots[*level] >= nritems) {
5215                if (*level == root_level)
5216                        return -1;
5217
5218                /* move upnext */
5219                path->slots[*level] = 0;
5220                free_extent_buffer(path->nodes[*level]);
5221                path->nodes[*level] = NULL;
5222                (*level)++;
5223                path->slots[*level]++;
5224
5225                nritems = btrfs_header_nritems(path->nodes[*level]);
5226                ret = 1;
5227        }
5228        return ret;
5229}
5230
5231/*
5232 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5233 * or down.
5234 */
5235static int tree_advance(struct btrfs_fs_info *fs_info,
5236                        struct btrfs_path *path,
5237                        int *level, int root_level,
5238                        int allow_down,
5239                        struct btrfs_key *key)
5240{
5241        int ret;
5242
5243        if (*level == 0 || !allow_down) {
5244                ret = tree_move_next_or_upnext(path, level, root_level);
5245        } else {
5246                ret = tree_move_down(fs_info, path, level);
5247        }
5248        if (ret >= 0) {
5249                if (*level == 0)
5250                        btrfs_item_key_to_cpu(path->nodes[*level], key,
5251                                        path->slots[*level]);
5252                else
5253                        btrfs_node_key_to_cpu(path->nodes[*level], key,
5254                                        path->slots[*level]);
5255        }
5256        return ret;
5257}
5258
5259static int tree_compare_item(struct btrfs_path *left_path,
5260                             struct btrfs_path *right_path,
5261                             char *tmp_buf)
5262{
5263        int cmp;
5264        int len1, len2;
5265        unsigned long off1, off2;
5266
5267        len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5268        len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5269        if (len1 != len2)
5270                return 1;
5271
5272        off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5273        off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5274                                right_path->slots[0]);
5275
5276        read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5277
5278        cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5279        if (cmp)
5280                return 1;
5281        return 0;
5282}
5283
5284#define ADVANCE 1
5285#define ADVANCE_ONLY_NEXT -1
5286
5287/*
5288 * This function compares two trees and calls the provided callback for
5289 * every changed/new/deleted item it finds.
5290 * If shared tree blocks are encountered, whole subtrees are skipped, making
5291 * the compare pretty fast on snapshotted subvolumes.
5292 *
5293 * This currently works on commit roots only. As commit roots are read only,
5294 * we don't do any locking. The commit roots are protected with transactions.
5295 * Transactions are ended and rejoined when a commit is tried in between.
5296 *
5297 * This function checks for modifications done to the trees while comparing.
5298 * If it detects a change, it aborts immediately.
5299 */
int btrfs_compare_trees(struct btrfs_root *left_root,
			struct btrfs_root *right_root,
			btrfs_changed_cb_t changed_cb, void *ctx)
{
	struct btrfs_fs_info *fs_info = left_root->fs_info;
	int ret;
	int cmp;
	struct btrfs_path *left_path = NULL;
	struct btrfs_path *right_path = NULL;
	struct btrfs_key left_key;
	struct btrfs_key right_key;
	char *tmp_buf = NULL;		/* scratch space for tree_compare_item */
	int left_root_level;
	int right_root_level;
	int left_level;
	int right_level;
	int left_end_reached;
	int right_end_reached;
	int advance_left;
	int advance_right;
	u64 left_blockptr;
	u64 right_blockptr;
	u64 left_gen;
	u64 right_gen;

	left_path = btrfs_alloc_path();
	if (!left_path) {
		ret = -ENOMEM;
		goto out;
	}
	right_path = btrfs_alloc_path();
	if (!right_path) {
		ret = -ENOMEM;
		goto out;
	}

	/* one tree block is the upper bound for any single item's data */
	tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
	if (!tmp_buf) {
		ret = -ENOMEM;
		goto out;
	}

	left_path->search_commit_root = 1;
	left_path->skip_locking = 1;
	right_path->search_commit_root = 1;
	right_path->skip_locking = 1;

	/*
	 * Strategy: Go to the first items of both trees. Then do
	 *
	 * If both trees are at level 0
	 *   Compare keys of current items
	 *     If left < right treat left item as new, advance left tree
	 *       and repeat
	 *     If left > right treat right item as deleted, advance right tree
	 *       and repeat
	 *     If left == right do deep compare of items, treat as changed if
	 *       needed, advance both trees and repeat
	 * If both trees are at the same level but not at level 0
	 *   Compare keys of current nodes/leafs
	 *     If left < right advance left tree and repeat
	 *     If left > right advance right tree and repeat
	 *     If left == right compare blockptrs of the next nodes/leafs
	 *       If they match advance both trees but stay at the same level
	 *         and repeat
	 *       If they don't match advance both trees while allowing to go
	 *         deeper and repeat
	 * If tree levels are different
	 *   Advance the tree that needs it and repeat
	 *
	 * Advancing a tree means:
	 *   If we are at level 0, try to go to the next slot. If that's not
	 *   possible, go one level up and repeat. Stop when we found a level
	 *   where we could go to the next slot. We may at this point be on a
	 *   node or a leaf.
	 *
	 *   If we are not at level 0 and not on shared tree blocks, go one
	 *   level deeper.
	 *
	 *   If we are not at level 0 and on shared tree blocks, go one slot to
	 *   the right if possible or go up and right.
	 */

	/* clone the commit roots under the sem so they can't change under us */
	down_read(&fs_info->commit_root_sem);
	left_level = btrfs_header_level(left_root->commit_root);
	left_root_level = left_level;
	left_path->nodes[left_level] =
			btrfs_clone_extent_buffer(left_root->commit_root);
	if (!left_path->nodes[left_level]) {
		up_read(&fs_info->commit_root_sem);
		ret = -ENOMEM;
		goto out;
	}
	extent_buffer_get(left_path->nodes[left_level]);

	right_level = btrfs_header_level(right_root->commit_root);
	right_root_level = right_level;
	right_path->nodes[right_level] =
			btrfs_clone_extent_buffer(right_root->commit_root);
	if (!right_path->nodes[right_level]) {
		up_read(&fs_info->commit_root_sem);
		ret = -ENOMEM;
		goto out;
	}
	extent_buffer_get(right_path->nodes[right_level]);
	up_read(&fs_info->commit_root_sem);

	/* prime the compare keys from the first slot of each root */
	if (left_level == 0)
		btrfs_item_key_to_cpu(left_path->nodes[left_level],
				&left_key, left_path->slots[left_level]);
	else
		btrfs_node_key_to_cpu(left_path->nodes[left_level],
				&left_key, left_path->slots[left_level]);
	if (right_level == 0)
		btrfs_item_key_to_cpu(right_path->nodes[right_level],
				&right_key, right_path->slots[right_level]);
	else
		btrfs_node_key_to_cpu(right_path->nodes[right_level],
				&right_key, right_path->slots[right_level]);

	left_end_reached = right_end_reached = 0;
	advance_left = advance_right = 0;

	while (1) {
		if (advance_left && !left_end_reached) {
			ret = tree_advance(fs_info, left_path, &left_level,
					left_root_level,
					advance_left != ADVANCE_ONLY_NEXT,
					&left_key);
			if (ret == -1)
				left_end_reached = ADVANCE;
			else if (ret < 0)
				goto out;
			advance_left = 0;
		}
		if (advance_right && !right_end_reached) {
			ret = tree_advance(fs_info, right_path, &right_level,
					right_root_level,
					advance_right != ADVANCE_ONLY_NEXT,
					&right_key);
			if (ret == -1)
				right_end_reached = ADVANCE;
			else if (ret < 0)
				goto out;
			advance_right = 0;
		}

		if (left_end_reached && right_end_reached) {
			ret = 0;
			goto out;
		} else if (left_end_reached) {
			/* left exhausted: everything remaining on the right
			 * was deleted */
			if (right_level == 0) {
				ret = changed_cb(left_path, right_path,
						&right_key,
						BTRFS_COMPARE_TREE_DELETED,
						ctx);
				if (ret < 0)
					goto out;
			}
			advance_right = ADVANCE;
			continue;
		} else if (right_end_reached) {
			/* right exhausted: everything remaining on the left
			 * is new */
			if (left_level == 0) {
				ret = changed_cb(left_path, right_path,
						&left_key,
						BTRFS_COMPARE_TREE_NEW,
						ctx);
				if (ret < 0)
					goto out;
			}
			advance_left = ADVANCE;
			continue;
		}

		if (left_level == 0 && right_level == 0) {
			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
			if (cmp < 0) {
				ret = changed_cb(left_path, right_path,
						&left_key,
						BTRFS_COMPARE_TREE_NEW,
						ctx);
				if (ret < 0)
					goto out;
				advance_left = ADVANCE;
			} else if (cmp > 0) {
				ret = changed_cb(left_path, right_path,
						&right_key,
						BTRFS_COMPARE_TREE_DELETED,
						ctx);
				if (ret < 0)
					goto out;
				advance_right = ADVANCE;
			} else {
				/* same key on both sides: deep compare the
				 * item data */
				enum btrfs_compare_tree_result result;

				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
				ret = tree_compare_item(left_path, right_path,
							tmp_buf);
				if (ret)
					result = BTRFS_COMPARE_TREE_CHANGED;
				else
					result = BTRFS_COMPARE_TREE_SAME;
				ret = changed_cb(left_path, right_path,
						 &left_key, result, ctx);
				if (ret < 0)
					goto out;
				advance_left = ADVANCE;
				advance_right = ADVANCE;
			}
		} else if (left_level == right_level) {
			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
			if (cmp < 0) {
				advance_left = ADVANCE;
			} else if (cmp > 0) {
				advance_right = ADVANCE;
			} else {
				left_blockptr = btrfs_node_blockptr(
						left_path->nodes[left_level],
						left_path->slots[left_level]);
				right_blockptr = btrfs_node_blockptr(
						right_path->nodes[right_level],
						right_path->slots[right_level]);
				left_gen = btrfs_node_ptr_generation(
						left_path->nodes[left_level],
						left_path->slots[left_level]);
				right_gen = btrfs_node_ptr_generation(
						right_path->nodes[right_level],
						right_path->slots[right_level]);
				if (left_blockptr == right_blockptr &&
				    left_gen == right_gen) {
					/*
					 * As we're on a shared block, don't
					 * allow to go deeper.
					 */
					advance_left = ADVANCE_ONLY_NEXT;
					advance_right = ADVANCE_ONLY_NEXT;
				} else {
					advance_left = ADVANCE;
					advance_right = ADVANCE;
				}
			}
		} else if (left_level < right_level) {
			advance_right = ADVANCE;
		} else {
			advance_left = ADVANCE;
		}
	}

out:
	btrfs_free_path(left_path);
	btrfs_free_path(right_path);
	kvfree(tmp_buf);
	return ret;
}
5554
5555/*
5556 * this is similar to btrfs_next_leaf, but does not try to preserve
5557 * and fixup the path.  It looks for and returns the next key in the
5558 * tree based on the current path and the min_trans parameters.
5559 *
5560 * 0 is returned if another key is found, < 0 if there are any errors
5561 * and 1 is returned if there are no higher keys in the tree
5562 *
5563 * path->keep_locks should be set to 1 on the search made before
5564 * calling this function.
5565 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	/* Without keep_locks the nodes we walk up through could change. */
	WARN_ON(!path->keep_locks);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		/* Look at the slot just past the one the path points to. */
		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			if (path->locks[level + 1]) {
				/*
				 * The parent is locked, so it cannot change
				 * under us; just retry one level up.
				 */
				level++;
				continue;
			}

			/*
			 * The parent is unlocked and may be stale.  Re-search
			 * from the last key of this node to rebuild a
			 * consistent path down to this level.
			 */
			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			/* Exact match found again: advance past it. */
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			/* Skip pointers older than min_trans. */
			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
5630
5631/*
5632 * search the tree again to find a leaf with greater keys
5633 * returns 0 if it found something or 1 if there are no greater leaves.
5634 * returns < 0 on io errors.
5635 */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	/* time_seq == 0 means operate on the current tree, no old state. */
	return btrfs_next_old_leaf(root, path, 0);
}
5640
/*
 * Move the path to the first slot of the leaf following the current one.
 * Because all locks are dropped and the tree is re-searched from the last
 * key of the current leaf, the path may land on a different leaf than a
 * simple "one to the right" if the tree changed in between.  A non-zero
 * time_seq searches an older tree state via btrfs_search_old_slot().
 *
 * Returns 0 on success, 1 if there is no next leaf, < 0 on error.
 */
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;
	int next_rw_lock = 0;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	/* Remember the last key of the current leaf; we re-search from it. */
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	next_rw_lock = 0;
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->leave_spinning = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leafs
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later(esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check,
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	/* Walk up until some level has a slot to the right of the path. */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		if (next) {
			btrfs_tree_unlock_rw(next, next_rw_lock);
			free_extent_buffer(next);
		}

		next = c;
		next_rw_lock = path->locks[level];
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
		break;
	}
	path->slots[level] = slot;
	/* Walk back down to level 0, taking the left-most child each time. */
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock_rw(c, path->locks[level]);

		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = next_rw_lock;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}
5809
5810/*
5811 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5812 * searching until it gets past min_objectid or finds an item of 'type'
5813 *
5814 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5815 */
5816int btrfs_previous_item(struct btrfs_root *root,
5817                        struct btrfs_path *path, u64 min_objectid,
5818                        int type)
5819{
5820        struct btrfs_key found_key;
5821        struct extent_buffer *leaf;
5822        u32 nritems;
5823        int ret;
5824
5825        while (1) {
5826                if (path->slots[0] == 0) {
5827                        btrfs_set_path_blocking(path);
5828                        ret = btrfs_prev_leaf(root, path);
5829                        if (ret != 0)
5830                                return ret;
5831                } else {
5832                        path->slots[0]--;
5833                }
5834                leaf = path->nodes[0];
5835                nritems = btrfs_header_nritems(leaf);
5836                if (nritems == 0)
5837                        return 1;
5838                if (path->slots[0] == nritems)
5839                        path->slots[0]--;
5840
5841                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5842                if (found_key.objectid < min_objectid)
5843                        break;
5844                if (found_key.type == type)
5845                        return 0;
5846                if (found_key.objectid == min_objectid &&
5847                    found_key.type < type)
5848                        break;
5849        }
5850        return 1;
5851}
5852
5853/*
5854 * search in extent tree to find a previous Metadata/Data extent item with
 * min objectid.
5856 *
5857 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5858 */
5859int btrfs_previous_extent_item(struct btrfs_root *root,
5860                        struct btrfs_path *path, u64 min_objectid)
5861{
5862        struct btrfs_key found_key;
5863        struct extent_buffer *leaf;
5864        u32 nritems;
5865        int ret;
5866
5867        while (1) {
5868                if (path->slots[0] == 0) {
5869                        btrfs_set_path_blocking(path);
5870                        ret = btrfs_prev_leaf(root, path);
5871                        if (ret != 0)
5872                                return ret;
5873                } else {
5874                        path->slots[0]--;
5875                }
5876                leaf = path->nodes[0];
5877                nritems = btrfs_header_nritems(leaf);
5878                if (nritems == 0)
5879                        return 1;
5880                if (path->slots[0] == nritems)
5881                        path->slots[0]--;
5882
5883                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5884                if (found_key.objectid < min_objectid)
5885                        break;
5886                if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5887                    found_key.type == BTRFS_METADATA_ITEM_KEY)
5888                        return 0;
5889                if (found_key.objectid == min_objectid &&
5890                    found_key.type < BTRFS_EXTENT_ITEM_KEY)
5891                        break;
5892        }
5893        return 1;
5894}
5895