/* linux/fs/btrfs/ctree.c */
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
   4 */
   5
   6#include <linux/sched.h>
   7#include <linux/slab.h>
   8#include <linux/rbtree.h>
   9#include <linux/mm.h>
  10#include "ctree.h"
  11#include "disk-io.h"
  12#include "transaction.h"
  13#include "print-tree.h"
  14#include "locking.h"
  15#include "volumes.h"
  16
  17static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
  18                      *root, struct btrfs_path *path, int level);
  19static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
  20                      const struct btrfs_key *ins_key, struct btrfs_path *path,
  21                      int data_size, int extend);
  22static int push_node_left(struct btrfs_trans_handle *trans,
  23                          struct btrfs_fs_info *fs_info,
  24                          struct extent_buffer *dst,
  25                          struct extent_buffer *src, int empty);
  26static int balance_node_right(struct btrfs_trans_handle *trans,
  27                              struct btrfs_fs_info *fs_info,
  28                              struct extent_buffer *dst_buf,
  29                              struct extent_buffer *src_buf);
  30static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
  31                    int level, int slot);
  32
  33struct btrfs_path *btrfs_alloc_path(void)
  34{
  35        return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
  36}
  37
/*
 * set all locked nodes in the path to blocking locks.  This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
        int i;
        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
                /* Skip levels with no cached node or where we hold no lock. */
                if (!p->nodes[i] || !p->locks[i])
                        continue;
                btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
                /*
                 * Record the new blocking lock type so a later
                 * btrfs_tree_unlock_rw() releases the right kind of lock.
                 */
                if (p->locks[i] == BTRFS_READ_LOCK)
                        p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
                else if (p->locks[i] == BTRFS_WRITE_LOCK)
                        p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
        }
}
  55
  56/* this also releases the path */
  57void btrfs_free_path(struct btrfs_path *p)
  58{
  59        if (!p)
  60                return;
  61        btrfs_release_path(p);
  62        kmem_cache_free(btrfs_path_cachep, p);
  63}
  64
  65/*
  66 * path release drops references on the extent buffers in the path
  67 * and it drops any locks held by this path
  68 *
  69 * It is safe to call this on paths that no locks or extent buffers held.
  70 */
  71noinline void btrfs_release_path(struct btrfs_path *p)
  72{
  73        int i;
  74
  75        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
  76                p->slots[i] = 0;
  77                if (!p->nodes[i])
  78                        continue;
  79                if (p->locks[i]) {
  80                        btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
  81                        p->locks[i] = 0;
  82                }
  83                free_extent_buffer(p->nodes[i]);
  84                p->nodes[i] = NULL;
  85        }
  86}
  87
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        while (1) {
                rcu_read_lock();
                eb = rcu_dereference(root->node);

                /*
                 * RCU really hurts here, we could free up the root node because
                 * it was COWed but we may not get the new root node yet so do
                 * the inc_not_zero dance and if it doesn't work then
                 * synchronize_rcu and try again.
                 */
                if (atomic_inc_not_zero(&eb->refs)) {
                        rcu_read_unlock();
                        break;
                }
                rcu_read_unlock();
                /*
                 * Refcount already hit zero: the node is being freed under
                 * us.  Wait a grace period so root->node can be republished,
                 * then retry.
                 */
                synchronize_rcu();
        }
        return eb;
}
 121
 122/* loop around taking references on and locking the root node of the
 123 * tree until you end up with a lock on the root.  A locked buffer
 124 * is returned, with a reference held.
 125 */
 126struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 127{
 128        struct extent_buffer *eb;
 129
 130        while (1) {
 131                eb = btrfs_root_node(root);
 132                btrfs_tree_lock(eb);
 133                if (eb == root->node)
 134                        break;
 135                btrfs_tree_unlock(eb);
 136                free_extent_buffer(eb);
 137        }
 138        return eb;
 139}
 140
 141/* loop around taking references on and locking the root node of the
 142 * tree until you end up with a lock on the root.  A locked buffer
 143 * is returned, with a reference held.
 144 */
 145struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
 146{
 147        struct extent_buffer *eb;
 148
 149        while (1) {
 150                eb = btrfs_root_node(root);
 151                btrfs_tree_read_lock(eb);
 152                if (eb == root->node)
 153                        break;
 154                btrfs_tree_read_unlock(eb);
 155                free_extent_buffer(eb);
 156        }
 157        return eb;
 158}
 159
/* cowonly root (everything not a reference counted cow subvolume), just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;

        /*
         * Unlocked fast path: already dirty, or this root's dirtiness is not
         * tracked at all.  Raced transitions are caught again under the lock
         * by the test_and_set_bit below.
         */
        if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
            !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
                return;

        spin_lock(&fs_info->trans_lock);
        if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
                /* Want the extent tree to be the last on the list */
                if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
                        list_move_tail(&root->dirty_list,
                                       &fs_info->dirty_cowonly_roots);
                else
                        list_move(&root->dirty_list,
                                  &fs_info->dirty_cowonly_roots);
        }
        spin_unlock(&fs_info->trans_lock);
}
 184
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
                      struct extent_buffer *buf,
                      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *cow;
        int ret = 0;
        int level;
        struct btrfs_disk_key disk_key;

        /* Reference-counted roots may only be copied inside their own
         * running transaction.
         */
        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
                trans->transid != fs_info->running_transaction->transid);
        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
                trans->transid != root->last_trans);

        /* The first key differs in encoding between leaves and nodes. */
        level = btrfs_header_level(buf);
        if (level == 0)
                btrfs_item_key(buf, &disk_key, 0);
        else
                btrfs_node_key(buf, &disk_key, 0);

        cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
                        &disk_key, level, buf->start, 0);
        if (IS_ERR(cow))
                return PTR_ERR(cow);

        /* Clone the whole block, then fix up the header for the copy. */
        copy_extent_buffer_full(cow, buf);
        btrfs_set_header_bytenr(cow, cow->start);
        btrfs_set_header_generation(cow, trans->transid);
        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
                                     BTRFS_HEADER_FLAG_RELOC);
        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
        else
                btrfs_set_header_owner(cow, new_root_objectid);

        write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

        WARN_ON(btrfs_header_generation(buf) > trans->transid);
        /*
         * Bump refs on everything the new root points to; reloc trees pass
         * 1 (full backref), everything else 0 — see btrfs_inc_ref().
         */
        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
                ret = btrfs_inc_ref(trans, root, cow, 1);
        else
                ret = btrfs_inc_ref(trans, root, cow, 0);

        if (ret)
                return ret;

        btrfs_mark_buffer_dirty(cow);
        *cow_ret = cow;
        return 0;
}
 243
/* Operation types recorded in the tree modification log. */
enum mod_log_op {
        MOD_LOG_KEY_REPLACE,
        MOD_LOG_KEY_ADD,
        MOD_LOG_KEY_REMOVE,
        /* key removed because the whole node is being freed */
        MOD_LOG_KEY_REMOVE_WHILE_FREEING,
        /* key overwritten by a move within the same node */
        MOD_LOG_KEY_REMOVE_WHILE_MOVING,
        MOD_LOG_MOVE_KEYS,
        MOD_LOG_ROOT_REPLACE,
};
 253
/* Snapshot of the old root recorded by MOD_LOG_ROOT_REPLACE. */
struct tree_mod_root {
        u64 logical;
        u8 level;
};
 258
/*
 * One entry in the tree mod log rb-tree, keyed by (logical, seq);
 * see __tree_mod_log_insert().
 */
struct tree_mod_elem {
        struct rb_node node;
        /* logical address of the affected block (new root for ROOT_REPLACE) */
        u64 logical;
        /* sequence number assigned at insertion time */
        u64 seq;
        enum mod_log_op op;

        /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
        int slot;

        /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
        u64 generation;

        /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
        struct btrfs_disk_key key;
        u64 blockptr;

        /* this is used for op == MOD_LOG_MOVE_KEYS */
        struct {
                int dst_slot;
                int nr_items;
        } move;

        /* this is used for op == MOD_LOG_ROOT_REPLACE */
        struct tree_mod_root old_root;
};
 284
/*
 * Pull a new tree mod seq number for our operation.  Atomic, so callers
 * need no extra locking just to obtain a fresh sequence number.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
        return atomic64_inc_return(&fs_info->tree_mod_seq);
}
 292
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
                           struct seq_list *elem)
{
        /* Lock order: tree_mod_log_lock outside tree_mod_seq_lock. */
        write_lock(&fs_info->tree_mod_log_lock);
        spin_lock(&fs_info->tree_mod_seq_lock);
        if (!elem->seq) {
                elem->seq = btrfs_inc_tree_mod_seq(fs_info);
                list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
        }
        spin_unlock(&fs_info->tree_mod_seq_lock);
        write_unlock(&fs_info->tree_mod_log_lock);

        return elem->seq;
}
 315
/*
 * Drop the blocker @elem and garbage-collect every tree mod log entry whose
 * sequence number is no longer protected by any remaining blocker.
 *
 * NOTE(review): the blocker list is walked under tree_mod_seq_lock but the
 * rb-tree is pruned later under tree_mod_log_lock; insertion
 * (btrfs_get_tree_mod_seq) takes both.  Confirm no window exists where an
 * element inserted between the two critical sections here can be freed —
 * upstream later unified this under tree_mod_log_lock.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
                            struct seq_list *elem)
{
        struct rb_root *tm_root;
        struct rb_node *node;
        struct rb_node *next;
        struct seq_list *cur_elem;
        struct tree_mod_elem *tm;
        u64 min_seq = (u64)-1;
        u64 seq_putting = elem->seq;

        /* A zero seq means this elem was never registered as a blocker. */
        if (!seq_putting)
                return;

        spin_lock(&fs_info->tree_mod_seq_lock);
        list_del(&elem->list);
        elem->seq = 0;

        list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
                if (cur_elem->seq < min_seq) {
                        if (seq_putting > cur_elem->seq) {
                                /*
                                 * blocker with lower sequence number exists, we
                                 * cannot remove anything from the log
                                 */
                                spin_unlock(&fs_info->tree_mod_seq_lock);
                                return;
                        }
                        min_seq = cur_elem->seq;
                }
        }
        spin_unlock(&fs_info->tree_mod_seq_lock);

        /*
         * anything that's lower than the lowest existing (read: blocked)
         * sequence number can be removed from the tree.
         */
        write_lock(&fs_info->tree_mod_log_lock);
        tm_root = &fs_info->tree_mod_log;
        for (node = rb_first(tm_root); node; node = next) {
                next = rb_next(node);
                tm = rb_entry(node, struct tree_mod_elem, node);
                if (tm->seq > min_seq)
                        continue;
                rb_erase(node, tm_root);
                kfree(tm);
        }
        write_unlock(&fs_info->tree_mod_log_lock);
}
 365
/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Note: must be called with write lock for fs_info::tree_mod_log_lock.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
        struct rb_root *tm_root;
        struct rb_node **new;
        struct rb_node *parent = NULL;
        struct tree_mod_elem *cur;

        /* Assign the sequence only once the entry is really being logged. */
        tm->seq = btrfs_inc_tree_mod_seq(fs_info);

        tm_root = &fs_info->tree_mod_log;
        new = &tm_root->rb_node;
        while (*new) {
                cur = rb_entry(*new, struct tree_mod_elem, node);
                parent = *new;
                /*
                 * Smaller keys descend right: inverted from the common rb
                 * idiom, but consistent with __tree_mod_log_search().
                 */
                if (cur->logical < tm->logical)
                        new = &((*new)->rb_left);
                else if (cur->logical > tm->logical)
                        new = &((*new)->rb_right);
                else if (cur->seq < tm->seq)
                        new = &((*new)->rb_left);
                else if (cur->seq > tm->seq)
                        new = &((*new)->rb_right);
                else
                        return -EEXIST;
        }

        rb_link_node(&tm->node, parent, new);
        rb_insert_color(&tm->node, tm_root);
        return 0;
}
 407
 408/*
 409 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 410 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 411 * this until all tree mod log insertions are recorded in the rb tree and then
 412 * write unlock fs_info::tree_mod_log_lock.
 413 */
 414static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
 415                                    struct extent_buffer *eb) {
 416        smp_mb();
 417        if (list_empty(&(fs_info)->tree_mod_seq_list))
 418                return 1;
 419        if (eb && btrfs_header_level(eb) == 0)
 420                return 1;
 421
 422        write_lock(&fs_info->tree_mod_log_lock);
 423        if (list_empty(&(fs_info)->tree_mod_seq_list)) {
 424                write_unlock(&fs_info->tree_mod_log_lock);
 425                return 1;
 426        }
 427
 428        return 0;
 429}
 430
 431/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
 432static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
 433                                    struct extent_buffer *eb)
 434{
 435        smp_mb();
 436        if (list_empty(&(fs_info)->tree_mod_seq_list))
 437                return 0;
 438        if (eb && btrfs_header_level(eb) == 0)
 439                return 0;
 440
 441        return 1;
 442}
 443
/*
 * Allocate and fill a log entry describing @op on @slot of node @eb.
 * Returns NULL on allocation failure; the entry is not yet linked into
 * the rb-tree (RB_CLEAR_NODE), that happens in __tree_mod_log_insert().
 */
static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
                    enum mod_log_op op, gfp_t flags)
{
        struct tree_mod_elem *tm;

        tm = kzalloc(sizeof(*tm), flags);
        if (!tm)
                return NULL;

        tm->logical = eb->start;
        /* For ADD the slot holds no old data; key/blockptr stay zeroed. */
        if (op != MOD_LOG_KEY_ADD) {
                btrfs_node_key(eb, &tm->key, slot);
                tm->blockptr = btrfs_node_blockptr(eb, slot);
        }
        tm->op = op;
        tm->slot = slot;
        /*
         * NOTE(review): generation is read unconditionally, so for
         * MOD_LOG_KEY_ADD it reflects whatever is in the slot before the
         * insertion — confirm consumers never rely on it for ADD entries.
         */
        tm->generation = btrfs_node_ptr_generation(eb, slot);
        RB_CLEAR_NODE(&tm->node);

        return tm;
}
 466
/*
 * Log a single-key operation (@op on @slot of @eb).  Returns 0 on success
 * (including the cases where no logging is needed) or a negative errno.
 */
static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
                enum mod_log_op op, gfp_t flags)
{
        struct tree_mod_elem *tm;
        int ret;

        /* Lockless fast path: nobody is watching the mod log. */
        if (!tree_mod_need_log(eb->fs_info, eb))
                return 0;

        /* Allocate before taking tree_mod_log_lock (kzalloc may sleep). */
        tm = alloc_tree_mod_elem(eb, slot, op, flags);
        if (!tm)
                return -ENOMEM;

        /* Re-check under the lock; the last blocker may have gone away. */
        if (tree_mod_dont_log(eb->fs_info, eb)) {
                kfree(tm);
                return 0;
        }

        /* tree_mod_dont_log() returned with tree_mod_log_lock held. */
        ret = __tree_mod_log_insert(eb->fs_info, tm);
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        if (ret)
                kfree(tm);

        return ret;
}
 492
/*
 * Log a move of @nr_items pointers from @src_slot to @dst_slot within @eb.
 * Slots that get overwritten by the move are additionally logged as
 * MOD_LOG_KEY_REMOVE_WHILE_MOVING so they can be rewound.  Returns 0 on
 * success or a negative errno.
 */
static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
                int dst_slot, int src_slot, int nr_items)
{
        struct tree_mod_elem *tm = NULL;
        struct tree_mod_elem **tm_list = NULL;
        int ret = 0;
        int i;
        int locked = 0;

        if (!tree_mod_need_log(eb->fs_info, eb))
                return 0;

        /* Zeroed: slots not filled below stay NULL, kfree(NULL) is fine. */
        tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
        if (!tm_list)
                return -ENOMEM;

        tm = kzalloc(sizeof(*tm), GFP_NOFS);
        if (!tm) {
                ret = -ENOMEM;
                goto free_tms;
        }

        tm->logical = eb->start;
        tm->slot = src_slot;
        tm->move.dst_slot = dst_slot;
        tm->move.nr_items = nr_items;
        tm->op = MOD_LOG_MOVE_KEYS;

        /* Only slots in [dst_slot, src_slot) are overwritten by the move. */
        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
                tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
                    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
                if (!tm_list[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
        }

        /* On success this leaves tree_mod_log_lock write-held. */
        if (tree_mod_dont_log(eb->fs_info, eb))
                goto free_tms;
        locked = 1;

        /*
         * When we override something during the move, we log these removals.
         * This can only happen when we move towards the beginning of the
         * buffer, i.e. dst_slot < src_slot.
         */
        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
                ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
                if (ret)
                        goto free_tms;
        }

        ret = __tree_mod_log_insert(eb->fs_info, tm);
        if (ret)
                goto free_tms;
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        kfree(tm_list);

        return 0;
free_tms:
        /* Unlink anything already inserted, then free everything. */
        for (i = 0; i < nr_items; i++) {
                if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
                        rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
                kfree(tm_list[i]);
        }
        if (locked)
                write_unlock(&eb->fs_info->tree_mod_log_lock);
        kfree(tm_list);
        kfree(tm);

        return ret;
}
 565
/*
 * Insert @nritems pre-allocated removal entries, walking backwards so the
 * rewind code replays them in slot order.  On failure, every entry already
 * inserted by this call is unlinked again; the caller frees the entries.
 * Caller must hold fs_info::tree_mod_log_lock for writing.
 */
static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
                       struct tree_mod_elem **tm_list,
                       int nritems)
{
        int i, j;
        int ret;

        for (i = nritems - 1; i >= 0; i--) {
                ret = __tree_mod_log_insert(fs_info, tm_list[i]);
                if (ret) {
                        /* Unwind only what this loop inserted (j > i). */
                        for (j = nritems - 1; j > i; j--)
                                rb_erase(&tm_list[j]->node,
                                         &fs_info->tree_mod_log);
                        return ret;
                }
        }

        return 0;
}
 586
/*
 * Log the replacement of @old_root by @new_root.  If @log_removal is set and
 * the old root is not a leaf, all of its pointers are additionally logged as
 * MOD_LOG_KEY_REMOVE_WHILE_FREEING.  Returns 0 on success or a negative
 * errno.
 */
static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
                         struct extent_buffer *new_root, int log_removal)
{
        struct btrfs_fs_info *fs_info = old_root->fs_info;
        struct tree_mod_elem *tm = NULL;
        struct tree_mod_elem **tm_list = NULL;
        int nritems = 0;
        int ret = 0;
        int i;

        if (!tree_mod_need_log(fs_info, NULL))
                return 0;

        /* Pre-allocate per-slot removal entries before taking the lock. */
        if (log_removal && btrfs_header_level(old_root) > 0) {
                nritems = btrfs_header_nritems(old_root);
                tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
                                  GFP_NOFS);
                if (!tm_list) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
                for (i = 0; i < nritems; i++) {
                        tm_list[i] = alloc_tree_mod_elem(old_root, i,
                            MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
                        if (!tm_list[i]) {
                                ret = -ENOMEM;
                                goto free_tms;
                        }
                }
        }

        tm = kzalloc(sizeof(*tm), GFP_NOFS);
        if (!tm) {
                ret = -ENOMEM;
                goto free_tms;
        }

        /* ROOT_REPLACE entries are keyed by the *new* root's address. */
        tm->logical = new_root->start;
        tm->old_root.logical = old_root->start;
        tm->old_root.level = btrfs_header_level(old_root);
        tm->generation = btrfs_header_generation(old_root);
        tm->op = MOD_LOG_ROOT_REPLACE;

        /* On success this leaves tree_mod_log_lock write-held. */
        if (tree_mod_dont_log(fs_info, NULL))
                goto free_tms;

        if (tm_list)
                ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
        if (!ret)
                ret = __tree_mod_log_insert(fs_info, tm);

        write_unlock(&fs_info->tree_mod_log_lock);
        if (ret)
                goto free_tms;
        kfree(tm_list);

        return ret;

free_tms:
        /* __tree_mod_log_free_eb already unlinked anything it inserted. */
        if (tm_list) {
                for (i = 0; i < nritems; i++)
                        kfree(tm_list[i]);
                kfree(tm_list);
        }
        kfree(tm);

        return ret;
}
 655
/*
 * Find a log entry for block @start with seq >= @min_seq.  If @smallest is
 * set, return the entry with the smallest such seq, otherwise the one with
 * the largest.  Returns NULL when no matching entry exists.
 */
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
                      int smallest)
{
        struct rb_root *tm_root;
        struct rb_node *node;
        struct tree_mod_elem *cur = NULL;
        struct tree_mod_elem *found = NULL;

        read_lock(&fs_info->tree_mod_log_lock);
        tm_root = &fs_info->tree_mod_log;
        node = tm_root->rb_node;
        while (node) {
                cur = rb_entry(node, struct tree_mod_elem, node);
                /* Ordering matches __tree_mod_log_insert: smaller goes left. */
                if (cur->logical < start) {
                        node = node->rb_left;
                } else if (cur->logical > start) {
                        node = node->rb_right;
                } else if (cur->seq < min_seq) {
                        node = node->rb_left;
                } else if (!smallest) {
                        /* we want the node with the highest seq */
                        if (found)
                                BUG_ON(found->seq > cur->seq);
                        found = cur;
                        node = node->rb_left;
                } else if (cur->seq > min_seq) {
                        /* we want the node with the smallest seq */
                        if (found)
                                BUG_ON(found->seq < cur->seq);
                        found = cur;
                        node = node->rb_right;
                } else {
                        /* Exact match on (start, min_seq). */
                        found = cur;
                        break;
                }
        }
        read_unlock(&fs_info->tree_mod_log_lock);

        return found;
}
 697
 698/*
 699 * this returns the element from the log with the smallest time sequence
 700 * value that's in the log (the oldest log item). any element with a time
 701 * sequence lower than min_seq will be ignored.
 702 */
 703static struct tree_mod_elem *
 704tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
 705                           u64 min_seq)
 706{
 707        return __tree_mod_log_search(fs_info, start, min_seq, 1);
 708}
 709
 710/*
 711 * this returns the element from the log with the largest time sequence
 712 * value that's in the log (the most recent log item). any element with
 713 * a time sequence lower than min_seq will be ignored.
 714 */
 715static struct tree_mod_elem *
 716tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
 717{
 718        return __tree_mod_log_search(fs_info, start, min_seq, 0);
 719}
 720
/*
 * Log the copy of @nr_items pointers from @src (at @src_offset) to @dst
 * (at @dst_offset): each pointer is recorded as a REMOVE from src paired
 * with an ADD into dst.  Returns 0 on success or a negative errno.
 */
static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
                     struct extent_buffer *src, unsigned long dst_offset,
                     unsigned long src_offset, int nr_items)
{
        int ret = 0;
        struct tree_mod_elem **tm_list = NULL;
        struct tree_mod_elem **tm_list_add, **tm_list_rem;
        int i;
        int locked = 0;

        if (!tree_mod_need_log(fs_info, NULL))
                return 0;

        /* Leaf-to-leaf copies carry no node pointers, nothing to log. */
        if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
                return 0;

        /* One array, first half for ADD entries, second half for REMOVE. */
        tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
                          GFP_NOFS);
        if (!tm_list)
                return -ENOMEM;

        tm_list_add = tm_list;
        tm_list_rem = tm_list + nr_items;
        for (i = 0; i < nr_items; i++) {
                tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
                    MOD_LOG_KEY_REMOVE, GFP_NOFS);
                if (!tm_list_rem[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }

                tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
                    MOD_LOG_KEY_ADD, GFP_NOFS);
                if (!tm_list_add[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
        }

        /* On success this leaves tree_mod_log_lock write-held. */
        if (tree_mod_dont_log(fs_info, NULL))
                goto free_tms;
        locked = 1;

        for (i = 0; i < nr_items; i++) {
                ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
                if (ret)
                        goto free_tms;
                ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
                if (ret)
                        goto free_tms;
        }

        write_unlock(&fs_info->tree_mod_log_lock);
        kfree(tm_list);

        return 0;

free_tms:
        /* Unlink anything already inserted, then free all entries. */
        for (i = 0; i < nr_items * 2; i++) {
                if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
                        rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
                kfree(tm_list[i]);
        }
        if (locked)
                write_unlock(&fs_info->tree_mod_log_lock);
        kfree(tm_list);

        return ret;
}
 791
/*
 * Log the freeing of node @eb by recording a REMOVE_WHILE_FREEING entry for
 * each of its pointers.  Leaves carry no pointers and are skipped.  Returns
 * 0 on success or a negative errno.
 */
static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
        struct tree_mod_elem **tm_list = NULL;
        int nritems = 0;
        int i;
        int ret = 0;

        if (btrfs_header_level(eb) == 0)
                return 0;

        if (!tree_mod_need_log(eb->fs_info, NULL))
                return 0;

        nritems = btrfs_header_nritems(eb);
        tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
        if (!tm_list)
                return -ENOMEM;

        for (i = 0; i < nritems; i++) {
                tm_list[i] = alloc_tree_mod_elem(eb, i,
                    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
                if (!tm_list[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
        }

        /* On success this leaves tree_mod_log_lock write-held. */
        if (tree_mod_dont_log(eb->fs_info, eb))
                goto free_tms;

        ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        if (ret)
                goto free_tms;
        kfree(tm_list);

        return 0;

free_tms:
        /* __tree_mod_log_free_eb already unlinked what it had inserted. */
        for (i = 0; i < nritems; i++)
                kfree(tm_list[i]);
        kfree(tm_list);

        return ret;
}
 837
 838/*
 839 * check if the tree block can be shared by multiple trees
 840 */
 841int btrfs_block_can_be_shared(struct btrfs_root *root,
 842                              struct extent_buffer *buf)
 843{
 844        /*
 845         * Tree blocks not in reference counted trees and tree roots
 846         * are never shared. If a block was allocated after the last
 847         * snapshot and the block was not allocated by tree relocation,
 848         * we know the block is not shared.
 849         */
 850        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 851            buf != root->node && buf != root->commit_root &&
 852            (btrfs_header_generation(buf) <=
 853             btrfs_root_last_snapshot(&root->root_item) ||
 854             btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
 855                return 1;
 856
 857        return 0;
 858}
 859
/*
 * Adjust extent backrefs when @buf is being COWed into @cow.
 *
 * Depending on whether the old block is shared (refcount > 1), references
 * are added to the new block and/or dropped from the old one, and the
 * FULL_BACKREF flag is set on the old extent when required by the rules
 * documented below.
 *
 * @trans:    running transaction handle
 * @root:     tree the block belongs to
 * @buf:      the block being COWed (source)
 * @cow:      the freshly allocated copy (destination)
 * @last_ref: out parameter, set to 1 when @buf lost its last reference
 *            and can be freed by the caller
 *
 * Returns 0 on success or a negative errno (-EROFS if the extent item
 * reports zero references, which indicates corruption).
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is been relocating
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		/* Possibly shared: read the real refcount and flags. */
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			/* Zero refs on a live block means corruption. */
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		/* Not shareable: exactly one reference by definition. */
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	/* Relocation tree blocks must always carry full backrefs. */
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			/* Shared block still using normal backrefs: convert. */
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			/* Persist the FULL_BACKREF flag on the extent item. */
			ret = btrfs_set_disk_extent_flags(trans, fs_info,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		/* Last reference: move refs to the copy, drop the original. */
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		clean_tree_block(fs_info, buf);
		*last_ref = 1;
	}
	return 0;
}
 970
 971static struct extent_buffer *alloc_tree_block_no_bg_flush(
 972                                          struct btrfs_trans_handle *trans,
 973                                          struct btrfs_root *root,
 974                                          u64 parent_start,
 975                                          const struct btrfs_disk_key *disk_key,
 976                                          int level,
 977                                          u64 hint,
 978                                          u64 empty_size)
 979{
 980        struct btrfs_fs_info *fs_info = root->fs_info;
 981        struct extent_buffer *ret;
 982
 983        /*
 984         * If we are COWing a node/leaf from the extent, chunk, device or free
 985         * space trees, make sure that we do not finish block group creation of
 986         * pending block groups. We do this to avoid a deadlock.
 987         * COWing can result in allocation of a new chunk, and flushing pending
 988         * block groups (btrfs_create_pending_block_groups()) can be triggered
 989         * when finishing allocation of a new chunk. Creation of a pending block
 990         * group modifies the extent, chunk, device and free space trees,
 991         * therefore we could deadlock with ourselves since we are holding a
 992         * lock on an extent buffer that btrfs_create_pending_block_groups() may
 993         * try to COW later.
 994         * For similar reasons, we also need to delay flushing pending block
 995         * groups when splitting a leaf or node, from one of those trees, since
 996         * we are holding a write lock on it and its parent or when inserting a
 997         * new root node for one of those trees.
 998         */
 999        if (root == fs_info->extent_root ||
1000            root == fs_info->chunk_root ||
1001            root == fs_info->dev_root ||
1002            root == fs_info->free_space_root)
1003                trans->can_flush_pending_bgs = false;
1004
1005        ret = btrfs_alloc_tree_block(trans, root, parent_start,
1006                                     root->root_key.objectid, disk_key, level,
1007                                     hint, empty_size);
1008        trans->can_flush_pending_bgs = true;
1009
1010        return ret;
1011}
1012
1013/*
1014 * does the dirty work in cow of a single block.  The parent block (if
1015 * supplied) is updated to point to the new cow copy.  The new buffer is marked
1016 * dirty and returned locked.  If you modify the block it needs to be marked
1017 * dirty again.
1018 *
1019 * search_start -- an allocation hint for the new block
1020 *
1021 * empty_size -- a hint that you plan on doing more cow.  This is the size in
1022 * bytes the allocator should try to find free next to the block it returns.
1023 * This is just a hint and may be ignored by the allocator.
1024 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	/*
	 * If the caller passed the buffer itself as *cow_ret, it expects us
	 * to hand back the lock it holds on @buf once the copy is installed.
	 */
	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	/* Reference counted trees may only be COWed in their owning transaction. */
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	/* First key of the block, used as an allocation hint key. */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
					   level, search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	/* Copy contents, then fix up the header for the new location. */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	/* Move extent references from @buf to @cow as needed. */
	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		/* COWing the root: swap it in under RCU. */
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		/* Extra ref for root->node; logged before the pointer flips. */
		extent_buffer_get(cow);
		ret = tree_mod_log_insert_root(root->node, cow, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		/* Interior block: redirect the parent slot to the copy. */
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			/* Record removal of the old block in the mod log. */
			ret = tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	/* Return the new block locked and dirty. */
	*cow_ret = cow;
	return 0;
}
1135
1136/*
1137 * returns the logical address of the oldest predecessor of the given root.
1138 * entries older than time_seq are ignored.
1139 */
1140static struct tree_mod_elem *__tree_mod_log_oldest_root(
1141                struct extent_buffer *eb_root, u64 time_seq)
1142{
1143        struct tree_mod_elem *tm;
1144        struct tree_mod_elem *found = NULL;
1145        u64 root_logical = eb_root->start;
1146        int looped = 0;
1147
1148        if (!time_seq)
1149                return NULL;
1150
1151        /*
1152         * the very last operation that's logged for a root is the
1153         * replacement operation (if it is replaced at all). this has
1154         * the logical address of the *new* root, making it the very
1155         * first operation that's logged for this root.
1156         */
1157        while (1) {
1158                tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
1159                                                time_seq);
1160                if (!looped && !tm)
1161                        return NULL;
1162                /*
1163                 * if there are no tree operation for the oldest root, we simply
1164                 * return it. this should only happen if that (old) root is at
1165                 * level 0.
1166                 */
1167                if (!tm)
1168                        break;
1169
1170                /*
1171                 * if there's an operation that's not a root replacement, we
1172                 * found the oldest version of our root. normally, we'll find a
1173                 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
1174                 */
1175                if (tm->op != MOD_LOG_ROOT_REPLACE)
1176                        break;
1177
1178                found = tm;
1179                root_logical = tm->old_root.logical;
1180                looped = 1;
1181        }
1182
1183        /* if there's no old root to return, return what we found instead */
1184        if (!found)
1185                found = tm;
1186
1187        return found;
1188}
1189
1190/*
1191 * tm is a pointer to the first operation to rewind within eb. then, all
1192 * previous operations will be rewound (until we reach something older than
1193 * time_seq).
1194 */
/*
 * Rewind @eb in place by undoing logged operations, newest first, starting
 * at @first_tm and stopping once an entry is older than @time_seq or belongs
 * to a different block.  Called with @eb write-accessible and not shared
 * (a dummy or cloned buffer).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	/* Hold the mod log lock so the rbtree cannot change under us. */
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			/* Slots freed with the block must lie past n. */
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			/* Undo a removal: restore the key/ptr and grow n. */
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			/* Undo a replace: put the old key/ptr back in place. */
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			/* Undo a move by shifting the key range back. */
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is no root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		/* Entries for other blocks mean we are done with @eb. */
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	/* Publish the rewound item count. */
	btrfs_set_header_nritems(eb, n);
}
1265
1266/*
1267 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
1268 * is returned. If rewind operations happen, a fresh buffer is returned. The
1269 * returned buffer is always read-locked. If the returned buffer is not the
1270 * input buffer, the lock on the input buffer is released and the input buffer
1271 * is freed (its refcount is decremented).
1272 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	/* No time_seq means no rewinding is wanted. */
	if (!time_seq)
		return eb;

	/* Leaves are never logged in the tree mod log; nothing to rewind. */
	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	/* We may block below; convert the path's spinning locks first. */
	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/*
		 * The block was freed: rebuild it from scratch on a dummy
		 * buffer, copying over the header fields we need.
		 */
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		/* Otherwise rewind on a private copy of the live block. */
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	/* Hand the input buffer back; the caller keeps only eb_rewin. */
	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}
1325
1326/*
1327 * get_old_root() rewinds the state of @root's root node to the given @time_seq
1328 * value. If there are no changes, the current root->root_node is returned. If
1329 * anything changed in between, there's a fresh buffer allocated on which the
1330 * rewind operations are done. In any case, the returned buffer is read locked.
1331 * Returns NULL on error (with no locks held).
1332 */
1333static inline struct extent_buffer *
1334get_old_root(struct btrfs_root *root, u64 time_seq)
1335{
1336        struct btrfs_fs_info *fs_info = root->fs_info;
1337        struct tree_mod_elem *tm;
1338        struct extent_buffer *eb = NULL;
1339        struct extent_buffer *eb_root;
1340        struct extent_buffer *old;
1341        struct tree_mod_root *old_root = NULL;
1342        u64 old_generation = 0;
1343        u64 logical;
1344        int level;
1345
1346        eb_root = btrfs_read_lock_root_node(root);
1347        tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1348        if (!tm)
1349                return eb_root;
1350
1351        if (tm->op == MOD_LOG_ROOT_REPLACE) {
1352                old_root = &tm->old_root;
1353                old_generation = tm->generation;
1354                logical = old_root->logical;
1355                level = old_root->level;
1356        } else {
1357                logical = eb_root->start;
1358                level = btrfs_header_level(eb_root);
1359        }
1360
1361        tm = tree_mod_log_search(fs_info, logical, time_seq);
1362        if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1363                btrfs_tree_read_unlock(eb_root);
1364                free_extent_buffer(eb_root);
1365                old = read_tree_block(fs_info, logical, 0, level, NULL);
1366                if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
1367                        if (!IS_ERR(old))
1368                                free_extent_buffer(old);
1369                        btrfs_warn(fs_info,
1370                                   "failed to read tree block %llu from get_old_root",
1371                                   logical);
1372                } else {
1373                        eb = btrfs_clone_extent_buffer(old);
1374                        free_extent_buffer(old);
1375                }
1376        } else if (old_root) {
1377                btrfs_tree_read_unlock(eb_root);
1378                free_extent_buffer(eb_root);
1379                eb = alloc_dummy_extent_buffer(fs_info, logical);
1380        } else {
1381                btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
1382                eb = btrfs_clone_extent_buffer(eb_root);
1383                btrfs_tree_read_unlock_blocking(eb_root);
1384                free_extent_buffer(eb_root);
1385        }
1386
1387        if (!eb)
1388                return NULL;
1389        btrfs_tree_read_lock(eb);
1390        if (old_root) {
1391                btrfs_set_header_bytenr(eb, eb->start);
1392                btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1393                btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
1394                btrfs_set_header_level(eb, old_root->level);
1395                btrfs_set_header_generation(eb, old_generation);
1396        }
1397        if (tm)
1398                __tree_mod_log_rewind(fs_info, eb, time_seq, tm);
1399        else
1400                WARN_ON(btrfs_header_level(eb) != 0);
1401        WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1402
1403        return eb;
1404}
1405
1406int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1407{
1408        struct tree_mod_elem *tm;
1409        int level;
1410        struct extent_buffer *eb_root = btrfs_root_node(root);
1411
1412        tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1413        if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1414                level = tm->old_root.level;
1415        } else {
1416                level = btrfs_header_level(eb_root);
1417        }
1418        free_extent_buffer(eb_root);
1419
1420        return level;
1421}
1422
1423static inline int should_cow_block(struct btrfs_trans_handle *trans,
1424                                   struct btrfs_root *root,
1425                                   struct extent_buffer *buf)
1426{
1427        if (btrfs_is_testing(root->fs_info))
1428                return 0;
1429
1430        /* Ensure we can see the FORCE_COW bit */
1431        smp_mb__before_atomic();
1432
1433        /*
1434         * We do not need to cow a block if
1435         * 1) this block is not created or changed in this transaction;
1436         * 2) this block does not belong to TREE_RELOC tree;
1437         * 3) the root is not forced COW.
1438         *
1439         * What is forced COW:
1440         *    when we create snapshot during committing the transaction,
1441         *    after we've finished copying src root, we must COW the shared
1442         *    block to ensure the metadata consistency.
1443         */
1444        if (btrfs_header_generation(buf) == trans->transid &&
1445            !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1446            !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1447              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1448            !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1449                return 0;
1450        return 1;
1451}
1452
1453/*
1454 * cows a single block, see __btrfs_cow_block for the real work.
1455 * This version of it has extra checks so that a block isn't COWed more than
1456 * once per transaction, as long as it hasn't been written yet
1457 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	/* COWing a block of a root being deleted indicates a logic bug. */
	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
		btrfs_err(fs_info,
			"COW'ing blocks on a fs root that's being dropped");

	/* Sanity: the handle must belong to the running transaction. */
	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, fs_info->generation);

	/* Already COWed in this transaction and unwritten: reuse in place. */
	if (!should_cow_block(trans, root, buf)) {
		trans->dirty = true;
		*cow_ret = buf;
		return 0;
	}

	/* Allocation hint: keep the copy within the same 1GiB chunk region. */
	search_start = buf->start & ~((u64)SZ_1G - 1);

	/* __btrfs_cow_block may block; switch held locks to blocking mode. */
	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
1499
1500/*
1501 * helper function for defrag to decide if two blocks pointed to by a
1502 * node are actually close by
1503 */
1504static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1505{
1506        if (blocknr < other && other - (blocknr + blocksize) < 32768)
1507                return 1;
1508        if (blocknr > other && blocknr - (other + blocksize) < 32768)
1509                return 1;
1510        return 0;
1511}
1512
1513/*
1514 * compare two keys in a memcmp fashion
1515 */
1516static int comp_keys(const struct btrfs_disk_key *disk,
1517                     const struct btrfs_key *k2)
1518{
1519        struct btrfs_key k1;
1520
1521        btrfs_disk_key_to_cpu(&k1, disk);
1522
1523        return btrfs_comp_cpu_keys(&k1, k2);
1524}
1525
1526/*
1527 * same as comp_keys only with two btrfs_key's
1528 */
1529int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1530{
1531        if (k1->objectid > k2->objectid)
1532                return 1;
1533        if (k1->objectid < k2->objectid)
1534                return -1;
1535        if (k1->type > k2->type)
1536                return 1;
1537        if (k1->type < k2->type)
1538                return -1;
1539        if (k1->offset > k2->offset)
1540                return 1;
1541        if (k1->offset < k2->offset)
1542                return -1;
1543        return 0;
1544}
1545
1546/*
1547 * this is used by the defrag code to go through all the
1548 * leaves pointed to by a node and reallocate them so that
1549 * disk order is close to key order
1550 */
1551int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1552                       struct btrfs_root *root, struct extent_buffer *parent,
1553                       int start_slot, u64 *last_ret,
1554                       struct btrfs_key *progress)
1555{
1556        struct btrfs_fs_info *fs_info = root->fs_info;
1557        struct extent_buffer *cur;
1558        u64 blocknr;
1559        u64 gen;
1560        u64 search_start = *last_ret;
1561        u64 last_block = 0;
1562        u64 other;
1563        u32 parent_nritems;
1564        int end_slot;
1565        int i;
1566        int err = 0;
1567        int parent_level;
1568        int uptodate;
1569        u32 blocksize;
1570        int progress_passed = 0;
1571        struct btrfs_disk_key disk_key;
1572
1573        parent_level = btrfs_header_level(parent);
1574
1575        WARN_ON(trans->transaction != fs_info->running_transaction);
1576        WARN_ON(trans->transid != fs_info->generation);
1577
1578        parent_nritems = btrfs_header_nritems(parent);
1579        blocksize = fs_info->nodesize;
1580        end_slot = parent_nritems - 1;
1581
1582        if (parent_nritems <= 1)
1583                return 0;
1584
1585        btrfs_set_lock_blocking(parent);
1586
1587        for (i = start_slot; i <= end_slot; i++) {
1588                struct btrfs_key first_key;
1589                int close = 1;
1590
1591                btrfs_node_key(parent, &disk_key, i);
1592                if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1593                        continue;
1594
1595                progress_passed = 1;
1596                blocknr = btrfs_node_blockptr(parent, i);
1597                gen = btrfs_node_ptr_generation(parent, i);
1598                btrfs_node_key_to_cpu(parent, &first_key, i);
1599                if (last_block == 0)
1600                        last_block = blocknr;
1601
1602                if (i > 0) {
1603                        other = btrfs_node_blockptr(parent, i - 1);
1604                        close = close_blocks(blocknr, other, blocksize);
1605                }
1606                if (!close && i < end_slot) {
1607                        other = btrfs_node_blockptr(parent, i + 1);
1608                        close = close_blocks(blocknr, other, blocksize);
1609                }
1610                if (close) {
1611                        last_block = blocknr;
1612                        continue;
1613                }
1614
1615                cur = find_extent_buffer(fs_info, blocknr);
1616                if (cur)
1617                        uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1618                else
1619                        uptodate = 0;
1620                if (!cur || !uptodate) {
1621                        if (!cur) {
1622                                cur = read_tree_block(fs_info, blocknr, gen,
1623                                                      parent_level - 1,
1624                                                      &first_key);
1625                                if (IS_ERR(cur)) {
1626                                        return PTR_ERR(cur);
1627                                } else if (!extent_buffer_uptodate(cur)) {
1628                                        free_extent_buffer(cur);
1629                                        return -EIO;
1630                                }
1631                        } else if (!uptodate) {
1632                                err = btrfs_read_buffer(cur, gen,
1633                                                parent_level - 1,&first_key);
1634                                if (err) {
1635                                        free_extent_buffer(cur);
1636                                        return err;
1637                                }
1638                        }
1639                }
1640                if (search_start == 0)
1641                        search_start = last_block;
1642
1643                btrfs_tree_lock(cur);
1644                btrfs_set_lock_blocking(cur);
1645                err = __btrfs_cow_block(trans, root, cur, parent, i,
1646                                        &cur, search_start,
1647                                        min(16 * blocksize,
1648                                            (end_slot - i) * blocksize));
1649                if (err) {
1650                        btrfs_tree_unlock(cur);
1651                        free_extent_buffer(cur);
1652                        break;
1653                }
1654                search_start = cur->start;
1655                last_block = cur->start;
1656                *last_ret = search_start;
1657                btrfs_tree_unlock(cur);
1658                free_extent_buffer(cur);
1659        }
1660        return err;
1661}
1662
1663/*
1664 * search for key in the extent_buffer.  The items start at offset p,
1665 * and they are item_size apart.  There are 'max' items in p.
1666 *
1667 * the slot in the array is returned via slot, and it points to
1668 * the place where you would insert key if it is not found in
1669 * the array.
1670 *
1671 * slot may point to max if the key is bigger than all of the keys
1672 */
static noinline int generic_bin_search(struct extent_buffer *eb,
                                       unsigned long p, int item_size,
                                       const struct btrfs_key *key,
                                       int max, int *slot)
{
        int low = 0;
        int high = max;
        int mid;
        int ret;
        struct btrfs_disk_key *tmp = NULL;
        struct btrfs_disk_key unaligned;
        unsigned long offset;
        char *kaddr = NULL;
        unsigned long map_start = 0;
        unsigned long map_len = 0;
        int err;

        /* a negative 'max' means the caller read a corrupted item count */
        if (low > high) {
                btrfs_err(eb->fs_info,
                 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
                          __func__, low, high, eb->start,
                          btrfs_header_owner(eb), btrfs_header_level(eb));
                return -EINVAL;
        }

        while (low < high) {
                mid = (low + high) / 2;
                offset = p + mid * item_size;

                /*
                 * (kaddr, map_start, map_len) cache the last mapped chunk of
                 * the buffer; only remap when the key we want falls outside
                 * the cached window.
                 */
                if (!kaddr || offset < map_start ||
                    (offset + sizeof(struct btrfs_disk_key)) >
                    map_start + map_len) {

                        err = map_private_extent_buffer(eb, offset,
                                                sizeof(struct btrfs_disk_key),
                                                &kaddr, &map_start, &map_len);

                        if (!err) {
                                tmp = (struct btrfs_disk_key *)(kaddr + offset -
                                                        map_start);
                        } else if (err == 1) {
                                /*
                                 * NOTE(review): err == 1 appears to mean the
                                 * key is not contiguous in one mapping, so
                                 * copy it into a local buffer instead.
                                 */
                                read_extent_buffer(eb, &unaligned,
                                                   offset, sizeof(unaligned));
                                tmp = &unaligned;
                        } else {
                                return err;
                        }

                } else {
                        /* still inside the cached mapping, just offset */
                        tmp = (struct btrfs_disk_key *)(kaddr + offset -
                                                        map_start);
                }
                ret = comp_keys(tmp, key);

                if (ret < 0)
                        low = mid + 1;
                else if (ret > 0)
                        high = mid;
                else {
                        /* exact match */
                        *slot = mid;
                        return 0;
                }
        }
        /* not found; 'low' is the insertion point (may equal max) */
        *slot = low;
        return 1;
}
1739
1740/*
1741 * simple bin_search frontend that does the right thing for
1742 * leaves vs nodes
1743 */
1744int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1745                     int level, int *slot)
1746{
1747        if (level == 0)
1748                return generic_bin_search(eb,
1749                                          offsetof(struct btrfs_leaf, items),
1750                                          sizeof(struct btrfs_item),
1751                                          key, btrfs_header_nritems(eb),
1752                                          slot);
1753        else
1754                return generic_bin_search(eb,
1755                                          offsetof(struct btrfs_node, ptrs),
1756                                          sizeof(struct btrfs_key_ptr),
1757                                          key, btrfs_header_nritems(eb),
1758                                          slot);
1759}
1760
1761static void root_add_used(struct btrfs_root *root, u32 size)
1762{
1763        spin_lock(&root->accounting_lock);
1764        btrfs_set_root_used(&root->root_item,
1765                            btrfs_root_used(&root->root_item) + size);
1766        spin_unlock(&root->accounting_lock);
1767}
1768
1769static void root_sub_used(struct btrfs_root *root, u32 size)
1770{
1771        spin_lock(&root->accounting_lock);
1772        btrfs_set_root_used(&root->root_item,
1773                            btrfs_root_used(&root->root_item) - size);
1774        spin_unlock(&root->accounting_lock);
1775}
1776
/* given a node and slot number, this reads the block it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 */
1780static noinline struct extent_buffer *
1781read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
1782               int slot)
1783{
1784        int level = btrfs_header_level(parent);
1785        struct extent_buffer *eb;
1786        struct btrfs_key first_key;
1787
1788        if (slot < 0 || slot >= btrfs_header_nritems(parent))
1789                return ERR_PTR(-ENOENT);
1790
1791        BUG_ON(level == 0);
1792
1793        btrfs_node_key_to_cpu(parent, &first_key, slot);
1794        eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
1795                             btrfs_node_ptr_generation(parent, slot),
1796                             level - 1, &first_key);
1797        if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1798                free_extent_buffer(eb);
1799                eb = ERR_PTR(-EIO);
1800        }
1801
1802        return eb;
1803}
1804
1805/*
1806 * node level balancing, used to make sure nodes are in proper order for
1807 * item deletion.  We balance from the top down, so we have to make sure
1808 * that a deletion won't leave an node completely empty later on.
1809 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         struct btrfs_path *path, int level)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *right = NULL;
        struct extent_buffer *mid;
        struct extent_buffer *left = NULL;
        struct extent_buffer *parent = NULL;
        int ret = 0;
        int wret;
        int pslot;
        int orig_slot = path->slots[level];
        u64 orig_ptr;

        ASSERT(level > 0);

        mid = path->nodes[level];

        /* caller must hold a write lock on mid and have COWed it already */
        WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
                path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
        WARN_ON(btrfs_header_generation(mid) != trans->transid);

        /* remember the block the path pointed at, re-checked at the end */
        orig_ptr = btrfs_node_blockptr(mid, orig_slot);

        if (level < BTRFS_MAX_LEVEL - 1) {
                parent = path->nodes[level + 1];
                pslot = path->slots[level + 1];
        }

        /*
         * deal with the case where there is only one pointer in the root
         * by promoting the node below to a root
         */
        if (!parent) {
                struct extent_buffer *child;

                if (btrfs_header_nritems(mid) != 1)
                        return 0;

                /* promote the child to a root */
                child = read_node_slot(fs_info, mid, 0);
                if (IS_ERR(child)) {
                        ret = PTR_ERR(child);
                        btrfs_handle_fs_error(fs_info, ret, NULL);
                        goto enospc;
                }

                btrfs_tree_lock(child);
                btrfs_set_lock_blocking(child);
                ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
                if (ret) {
                        btrfs_tree_unlock(child);
                        free_extent_buffer(child);
                        goto enospc;
                }

                /* record the root swap for tree mod log readers */
                ret = tree_mod_log_insert_root(root->node, child, 1);
                BUG_ON(ret < 0);
                rcu_assign_pointer(root->node, child);

                add_root_to_dirty_list(root);
                btrfs_tree_unlock(child);

                path->locks[level] = 0;
                path->nodes[level] = NULL;
                clean_tree_block(fs_info, mid);
                btrfs_tree_unlock(mid);
                /* once for the path */
                free_extent_buffer(mid);

                /*
                 * NOTE(review): mid->len is read after the path's reference
                 * was dropped; the old root-pointer reference (released just
                 * below via free_extent_buffer_stale) appears to keep mid
                 * alive here — confirm.
                 */
                root_sub_used(root, mid->len);
                btrfs_free_tree_block(trans, root, mid, 0, 1);
                /* once for the root ptr */
                free_extent_buffer_stale(mid);
                return 0;
        }
        /* mid is more than a quarter full: nothing to rebalance */
        if (btrfs_header_nritems(mid) >
            BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
                return 0;

        /* read errors for siblings are treated as "sibling absent" */
        left = read_node_slot(fs_info, parent, pslot - 1);
        if (IS_ERR(left))
                left = NULL;

        if (left) {
                btrfs_tree_lock(left);
                btrfs_set_lock_blocking(left);
                wret = btrfs_cow_block(trans, root, left,
                                       parent, pslot - 1, &left);
                if (wret) {
                        ret = wret;
                        goto enospc;
                }
        }

        right = read_node_slot(fs_info, parent, pslot + 1);
        if (IS_ERR(right))
                right = NULL;

        if (right) {
                btrfs_tree_lock(right);
                btrfs_set_lock_blocking(right);
                wret = btrfs_cow_block(trans, root, right,
                                       parent, pslot + 1, &right);
                if (wret) {
                        ret = wret;
                        goto enospc;
                }
        }

        /* first, try to make some room in the middle buffer */
        if (left) {
                /* orig_slot is now relative to left in case mid empties */
                orig_slot += btrfs_header_nritems(left);
                wret = push_node_left(trans, fs_info, left, mid, 1);
                if (wret < 0)
                        ret = wret;
        }

        /*
         * then try to empty the right most buffer into the middle
         */
        if (right) {
                wret = push_node_left(trans, fs_info, mid, right, 1);
                if (wret < 0 && wret != -ENOSPC)
                        ret = wret;
                if (btrfs_header_nritems(right) == 0) {
                        /* right drained completely: unlink and free it */
                        clean_tree_block(fs_info, right);
                        btrfs_tree_unlock(right);
                        del_ptr(root, path, level + 1, pslot + 1);
                        root_sub_used(root, right->len);
                        btrfs_free_tree_block(trans, root, right, 0, 1);
                        free_extent_buffer_stale(right);
                        right = NULL;
                } else {
                        /* right kept items: refresh its key in the parent */
                        struct btrfs_disk_key right_key;
                        btrfs_node_key(right, &right_key, 0);
                        ret = tree_mod_log_insert_key(parent, pslot + 1,
                                        MOD_LOG_KEY_REPLACE, GFP_NOFS);
                        BUG_ON(ret < 0);
                        btrfs_set_node_key(parent, &right_key, pslot + 1);
                        btrfs_mark_buffer_dirty(parent);
                }
        }
        if (btrfs_header_nritems(mid) == 1) {
                /*
                 * we're not allowed to leave a node with one item in the
                 * tree during a delete.  A deletion from lower in the tree
                 * could try to delete the only pointer in this node.
                 * So, pull some keys from the left.
                 * There has to be a left pointer at this point because
                 * otherwise we would have pulled some pointers from the
                 * right
                 */
                if (!left) {
                        ret = -EROFS;
                        btrfs_handle_fs_error(fs_info, ret, NULL);
                        goto enospc;
                }
                /* wret == 1 means left could not give anything back */
                wret = balance_node_right(trans, fs_info, mid, left);
                if (wret < 0) {
                        ret = wret;
                        goto enospc;
                }
                if (wret == 1) {
                        wret = push_node_left(trans, fs_info, left, mid, 1);
                        if (wret < 0)
                                ret = wret;
                }
                BUG_ON(wret == 1);
        }
        if (btrfs_header_nritems(mid) == 0) {
                /* mid is now empty: unlink it from the parent and free it */
                clean_tree_block(fs_info, mid);
                btrfs_tree_unlock(mid);
                del_ptr(root, path, level + 1, pslot);
                root_sub_used(root, mid->len);
                btrfs_free_tree_block(trans, root, mid, 0, 1);
                free_extent_buffer_stale(mid);
                mid = NULL;
        } else {
                /* update the parent key to reflect our changes */
                struct btrfs_disk_key mid_key;
                btrfs_node_key(mid, &mid_key, 0);
                ret = tree_mod_log_insert_key(parent, pslot,
                                MOD_LOG_KEY_REPLACE, GFP_NOFS);
                BUG_ON(ret < 0);
                btrfs_set_node_key(parent, &mid_key, pslot);
                btrfs_mark_buffer_dirty(parent);
        }

        /* update the path */
        if (left) {
                if (btrfs_header_nritems(left) > orig_slot) {
                        /* the slot we wanted migrated into left */
                        extent_buffer_get(left);
                        /* left was locked after cow */
                        path->nodes[level] = left;
                        path->slots[level + 1] -= 1;
                        path->slots[level] = orig_slot;
                        if (mid) {
                                btrfs_tree_unlock(mid);
                                free_extent_buffer(mid);
                        }
                } else {
                        /* slot stayed in mid; undo the left-relative offset */
                        orig_slot -= btrfs_header_nritems(left);
                        path->slots[level] = orig_slot;
                }
        }
        /* double check we haven't messed things up */
        if (orig_ptr !=
            btrfs_node_blockptr(path->nodes[level], path->slots[level]))
                BUG();
enospc:
        if (right) {
                btrfs_tree_unlock(right);
                free_extent_buffer(right);
        }
        if (left) {
                if (path->nodes[level] != left)
                        btrfs_tree_unlock(left);
                free_extent_buffer(left);
        }
        return ret;
}
2033
2034/* Node balancing for insertion.  Here we only split or push nodes around
2035 * when they are completely full.  This is also done top down, so we
2036 * have to be pessimistic.
2037 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path, int level)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *right = NULL;
        struct extent_buffer *mid;
        struct extent_buffer *left = NULL;
        struct extent_buffer *parent = NULL;
        int ret = 0;
        int wret;
        int pslot;
        int orig_slot = path->slots[level];

        /* leaves are handled by split_leaf, not here */
        if (level == 0)
                return 1;

        mid = path->nodes[level];
        WARN_ON(btrfs_header_generation(mid) != trans->transid);

        if (level < BTRFS_MAX_LEVEL - 1) {
                parent = path->nodes[level + 1];
                pslot = path->slots[level + 1];
        }

        /* at the root there is no sibling to push into */
        if (!parent)
                return 1;

        /* a failed sibling read just means "no left sibling" */
        left = read_node_slot(fs_info, parent, pslot - 1);
        if (IS_ERR(left))
                left = NULL;

        /* first, try to make some room in the middle buffer */
        if (left) {
                u32 left_nr;

                btrfs_tree_lock(left);
                btrfs_set_lock_blocking(left);

                left_nr = btrfs_header_nritems(left);
                if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
                        /* left is nearly full, pushing is pointless */
                        wret = 1;
                } else {
                        ret = btrfs_cow_block(trans, root, left, parent,
                                              pslot - 1, &left);
                        if (ret)
                                wret = 1;
                        else {
                                wret = push_node_left(trans, fs_info,
                                                      left, mid, 0);
                        }
                }
                if (wret < 0)
                        ret = wret;
                if (wret == 0) {
                        /* items moved: fix the parent key and the path */
                        struct btrfs_disk_key disk_key;
                        orig_slot += left_nr;
                        btrfs_node_key(mid, &disk_key, 0);
                        ret = tree_mod_log_insert_key(parent, pslot,
                                        MOD_LOG_KEY_REPLACE, GFP_NOFS);
                        BUG_ON(ret < 0);
                        btrfs_set_node_key(parent, &disk_key, pslot);
                        btrfs_mark_buffer_dirty(parent);
                        if (btrfs_header_nritems(left) > orig_slot) {
                                /* our slot migrated into left */
                                path->nodes[level] = left;
                                path->slots[level + 1] -= 1;
                                path->slots[level] = orig_slot;
                                btrfs_tree_unlock(mid);
                                free_extent_buffer(mid);
                        } else {
                                /* slot stayed in mid, adjust for the move */
                                orig_slot -=
                                        btrfs_header_nritems(left);
                                path->slots[level] = orig_slot;
                                btrfs_tree_unlock(left);
                                free_extent_buffer(left);
                        }
                        return 0;
                }
                btrfs_tree_unlock(left);
                free_extent_buffer(left);
        }
        right = read_node_slot(fs_info, parent, pslot + 1);
        if (IS_ERR(right))
                right = NULL;

        /*
         * then try to empty the right most buffer into the middle
         */
        if (right) {
                u32 right_nr;

                btrfs_tree_lock(right);
                btrfs_set_lock_blocking(right);

                right_nr = btrfs_header_nritems(right);
                if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
                        /* right is nearly full, pushing is pointless */
                        wret = 1;
                } else {
                        ret = btrfs_cow_block(trans, root, right,
                                              parent, pslot + 1,
                                              &right);
                        if (ret)
                                wret = 1;
                        else {
                                wret = balance_node_right(trans, fs_info,
                                                          right, mid);
                        }
                }
                if (wret < 0)
                        ret = wret;
                if (wret == 0) {
                        /* items moved: fix the parent key and the path */
                        struct btrfs_disk_key disk_key;

                        btrfs_node_key(right, &disk_key, 0);
                        ret = tree_mod_log_insert_key(parent, pslot + 1,
                                        MOD_LOG_KEY_REPLACE, GFP_NOFS);
                        BUG_ON(ret < 0);
                        btrfs_set_node_key(parent, &disk_key, pslot + 1);
                        btrfs_mark_buffer_dirty(parent);

                        if (btrfs_header_nritems(mid) <= orig_slot) {
                                /* our slot migrated into right */
                                path->nodes[level] = right;
                                path->slots[level + 1] += 1;
                                path->slots[level] = orig_slot -
                                        btrfs_header_nritems(mid);
                                btrfs_tree_unlock(mid);
                                free_extent_buffer(mid);
                        } else {
                                btrfs_tree_unlock(right);
                                free_extent_buffer(right);
                        }
                        return 0;
                }
                btrfs_tree_unlock(right);
                free_extent_buffer(right);
        }
        /* nothing was pushed; the caller has to split */
        return 1;
}
2176
2177/*
2178 * readahead one full node of leaves, finding things that are close
2179 * to the block in 'slot', and triggering ra on them.
2180 */
2181static void reada_for_search(struct btrfs_fs_info *fs_info,
2182                             struct btrfs_path *path,
2183                             int level, int slot, u64 objectid)
2184{
2185        struct extent_buffer *node;
2186        struct btrfs_disk_key disk_key;
2187        u32 nritems;
2188        u64 search;
2189        u64 target;
2190        u64 nread = 0;
2191        struct extent_buffer *eb;
2192        u32 nr;
2193        u32 blocksize;
2194        u32 nscan = 0;
2195
2196        if (level != 1)
2197                return;
2198
2199        if (!path->nodes[level])
2200                return;
2201
2202        node = path->nodes[level];
2203
2204        search = btrfs_node_blockptr(node, slot);
2205        blocksize = fs_info->nodesize;
2206        eb = find_extent_buffer(fs_info, search);
2207        if (eb) {
2208                free_extent_buffer(eb);
2209                return;
2210        }
2211
2212        target = search;
2213
2214        nritems = btrfs_header_nritems(node);
2215        nr = slot;
2216
2217        while (1) {
2218                if (path->reada == READA_BACK) {
2219                        if (nr == 0)
2220                                break;
2221                        nr--;
2222                } else if (path->reada == READA_FORWARD) {
2223                        nr++;
2224                        if (nr >= nritems)
2225                                break;
2226                }
2227                if (path->reada == READA_BACK && objectid) {
2228                        btrfs_node_key(node, &disk_key, nr);
2229                        if (btrfs_disk_key_objectid(&disk_key) != objectid)
2230                                break;
2231                }
2232                search = btrfs_node_blockptr(node, nr);
2233                if ((search <= target && target - search <= 65536) ||
2234                    (search > target && search - target <= 65536)) {
2235                        readahead_tree_block(fs_info, search);
2236                        nread += blocksize;
2237                }
2238                nscan++;
2239                if ((nread > 65536 || nscan > 32))
2240                        break;
2241        }
2242}
2243
2244static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2245                                       struct btrfs_path *path, int level)
2246{
2247        int slot;
2248        int nritems;
2249        struct extent_buffer *parent;
2250        struct extent_buffer *eb;
2251        u64 gen;
2252        u64 block1 = 0;
2253        u64 block2 = 0;
2254
2255        parent = path->nodes[level + 1];
2256        if (!parent)
2257                return;
2258
2259        nritems = btrfs_header_nritems(parent);
2260        slot = path->slots[level + 1];
2261
2262        if (slot > 0) {
2263                block1 = btrfs_node_blockptr(parent, slot - 1);
2264                gen = btrfs_node_ptr_generation(parent, slot - 1);
2265                eb = find_extent_buffer(fs_info, block1);
2266                /*
2267                 * if we get -eagain from btrfs_buffer_uptodate, we
2268                 * don't want to return eagain here.  That will loop
2269                 * forever
2270                 */
2271                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2272                        block1 = 0;
2273                free_extent_buffer(eb);
2274        }
2275        if (slot + 1 < nritems) {
2276                block2 = btrfs_node_blockptr(parent, slot + 1);
2277                gen = btrfs_node_ptr_generation(parent, slot + 1);
2278                eb = find_extent_buffer(fs_info, block2);
2279                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2280                        block2 = 0;
2281                free_extent_buffer(eb);
2282        }
2283
2284        if (block1)
2285                readahead_tree_block(fs_info, block1);
2286        if (block2)
2287                readahead_tree_block(fs_info, block2);
2288}
2289
2290
2291/*
2292 * when we walk down the tree, it is usually safe to unlock the higher layers
2293 * in the tree.  The exceptions are when our path goes through slot 0, because
2294 * operations on the tree might require changing key pointers higher up in the
2295 * tree.
2296 *
2297 * callers might also have set path->keep_locks, which tells this code to keep
2298 * the lock if the path points to the last slot in the block.  This is part of
2299 * walking through the tree, and selecting the next slot in the higher block.
2300 *
2301 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2302 * if lowest_unlock is 1, level 0 won't be unlocked
2303 */
static noinline void unlock_up(struct btrfs_path *path, int level,
                               int lowest_unlock, int min_write_lock_level,
                               int *write_lock_level)
{
        int i;
        int skip_level = level;
        int no_skips = 0;
        struct extent_buffer *t;

        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
                if (!path->nodes[i])
                        break;
                if (!path->locks[i])
                        break;
                /* slot 0: key changes may propagate upward, keep the lock */
                if (!no_skips && path->slots[i] == 0) {
                        skip_level = i + 1;
                        continue;
                }
                /* keep_locks: hold the lock when on a block's last slot */
                if (!no_skips && path->keep_locks) {
                        u32 nritems;
                        t = path->nodes[i];
                        nritems = btrfs_header_nritems(t);
                        if (nritems < 1 || path->slots[i] >= nritems - 1) {
                                skip_level = i + 1;
                                continue;
                        }
                }
                /* past the first unlockable level, stop raising skip_level */
                if (skip_level < i && i >= lowest_unlock)
                        no_skips = 1;

                t = path->nodes[i];
                if (i >= lowest_unlock && i > skip_level) {
                        btrfs_tree_unlock_rw(t, path->locks[i]);
                        path->locks[i] = 0;
                        /* lower the write-lock watermark as locks drop */
                        if (write_lock_level &&
                            i > min_write_lock_level &&
                            i <= *write_lock_level) {
                                *write_lock_level = i - 1;
                        }
                }
        }
}
2346
2347/*
2348 * This releases any locks held in the path starting at level and
2349 * going all the way up to the root.
2350 *
2351 * btrfs_search_slot will keep the lock held on higher nodes in a few
2352 * corner cases, such as COW of the block at slot zero in the node.  This
2353 * ignores those rules, and it should only be called when there are no
2354 * more updates to be done higher up in the tree.
2355 */
2356noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2357{
2358        int i;
2359
2360        if (path->keep_locks)
2361                return;
2362
2363        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2364                if (!path->nodes[i])
2365                        continue;
2366                if (!path->locks[i])
2367                        continue;
2368                btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2369                path->locks[i] = 0;
2370        }
2371}
2372
2373/*
2374 * helper function for btrfs_search_slot.  The goal is to find a block
2375 * in cache without setting the path to blocking.  If we find the block
2376 * we return zero and the path is unchanged.
2377 *
2378 * If we can't find the block, we set the path blocking and do some
2379 * reada.  -EAGAIN is returned and the search must be repeated.
2380 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 blocknr;
	u64 gen;
	struct extent_buffer *b = *eb_ret;
	struct extent_buffer *tmp;
	struct btrfs_key first_key;
	int ret;
	int parent_level;

	/*
	 * Pull the child's location, expected generation and first key out
	 * of the parent node while we still hold its lock.
	 */
	blocknr = btrfs_node_blockptr(b, slot);
	gen = btrfs_node_ptr_generation(b, slot);
	parent_level = btrfs_header_level(b);
	btrfs_node_key_to_cpu(b, &first_key, slot);

	/* Fast path: the block may already be cached in memory. */
	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			*eb_ret = tmp;
			return 0;
		}

		/* the pages were up to date, but we failed
		 * the generation number check.  Do a full
		 * read for the generation number that is correct.
		 * We must do this without dropping locks so
		 * we can trust our generation number
		 */
		btrfs_set_path_blocking(p);

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
		if (!ret) {
			*eb_ret = tmp;
			return 0;
		}
		/* Read failed: drop our ref and give up on this path. */
		free_extent_buffer(tmp);
		btrfs_release_path(p);
		return -EIO;
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);
	btrfs_set_path_blocking(p);

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	/* Default: caller must retry the search from the top. */
	ret = -EAGAIN;
	tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
			      &first_key);
	if (!IS_ERR(tmp)) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!extent_buffer_uptodate(tmp))
			ret = -EIO;
		free_extent_buffer(tmp);
	} else {
		ret = PTR_ERR(tmp);
	}

	/* We dropped locks above, so the whole path must be rebuilt. */
	btrfs_release_path(p);
	return ret;
}
2460
2461/*
2462 * helper function for btrfs_search_slot.  This does all of the checks
2463 * for node-level blocks and does any balancing required based on
2464 * the ins_len.
2465 *
2466 * If no extra work was required, zero is returned.  If we had to
2467 * drop the path, -EAGAIN is returned and btrfs_search_slot must
2468 * start over
2469 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	/*
	 * Insert (or split) path: if the node is within 3 pointers of
	 * being full, split it now so the descent below cannot overflow it.
	 */
	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
		int sret;

		/*
		 * Splitting touches the parent, so we need write locks up
		 * to level + 1; if we don't have them, restart the search
		 * with the raised write_lock_level.
		 */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(fs_info, p, level);
		sret = split_node(trans, root, p, level);

		BUG_ON(sret > 0);
		if (sret) {
			ret = sret;
			goto done;
		}
		/* split_node may have swapped in a new node at this level. */
		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
		int sret;

		/*
		 * Delete path: the node is less than half full, so try to
		 * rebalance it with its neighbours.  Like splitting, this
		 * needs write locks up to the parent level.
		 */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(fs_info, p, level);
		sret = balance_level(trans, root, p, level);

		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
		/* balance_level can free the node entirely; retry if so. */
		if (!b) {
			btrfs_release_path(p);
			goto again;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return 0;

again:
	ret = -EAGAIN;
done:
	return ret;
}
2531
/*
 * Debug-only check (CONFIG_BTRFS_ASSERT) that @key matches the first key
 * stored in @b, used when a previous level's exact match lets us skip the
 * binary search.  Compiles to a no-op otherwise.
 */
static void key_search_validate(struct extent_buffer *b,
				const struct btrfs_key *key,
				int level)
{
#ifdef CONFIG_BTRFS_ASSERT
	struct btrfs_disk_key disk_key;
	unsigned long key_off;

	btrfs_cpu_key_to_disk(&disk_key, key);

	/* Leaves and internal nodes store their first key at different offsets. */
	if (level == 0)
		key_off = offsetof(struct btrfs_leaf, items[0].key);
	else
		key_off = offsetof(struct btrfs_node, ptrs[0].key);

	ASSERT(!memcmp_extent_buffer(b, &disk_key, key_off, sizeof(disk_key)));
#endif
}
2551
/*
 * Find @key's slot in @b.  If the previous level compared equal
 * (*prev_cmp == 0), the key is known to be the first one in this block,
 * so skip the binary search and return slot 0.
 */
static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
		      int level, int *prev_cmp, int *slot)
{
	if (*prev_cmp == 0) {
		key_search_validate(b, key, level);
		*slot = 0;
		return 0;
	}

	*prev_cmp = btrfs_bin_search(b, key, level, slot);
	return *prev_cmp;
}
2565
2566int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2567                u64 iobjectid, u64 ioff, u8 key_type,
2568                struct btrfs_key *found_key)
2569{
2570        int ret;
2571        struct btrfs_key key;
2572        struct extent_buffer *eb;
2573
2574        ASSERT(path);
2575        ASSERT(found_key);
2576
2577        key.type = key_type;
2578        key.objectid = iobjectid;
2579        key.offset = ioff;
2580
2581        ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2582        if (ret < 0)
2583                return ret;
2584
2585        eb = path->nodes[0];
2586        if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2587                ret = btrfs_next_leaf(fs_root, path);
2588                if (ret)
2589                        return ret;
2590                eb = path->nodes[0];
2591        }
2592
2593        btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2594        if (found_key->type != key.type ||
2595                        found_key->objectid != key.objectid)
2596                return 1;
2597
2598        return 0;
2599}
2600
/*
 * Grab and lock the root node of @root to start a search.
 *
 * Sets p->nodes[] (and p->locks[] unless skip_locking) at the root's
 * level and returns the referenced root buffer, or ERR_PTR(-ENOMEM).
 * Read locks are preferred; a write lock is taken only when
 * @write_lock_level requires it at the root's level.
 */
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int root_lock;
	int level = 0;

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	if (p->search_commit_root) {
		/*
		 * The commit roots are read only so we always do read locks,
		 * and we always must hold the commit_root_sem when doing
		 * searches on them, the only exception is send where we don't
		 * want to block transaction commits for a long time, so
		 * we need to clone the commit root in order to avoid races
		 * with transaction commits that create a snapshot of one of
		 * the roots used by a send operation.
		 */
		if (p->need_commit_sem) {
			down_read(&fs_info->commit_root_sem);
			b = btrfs_clone_extent_buffer(root->commit_root);
			up_read(&fs_info->commit_root_sem);
			if (!b)
				return ERR_PTR(-ENOMEM);

		} else {
			b = root->commit_root;
			extent_buffer_get(b);
		}
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		/* Caller guarantees exclusion; just take a reference. */
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		b = btrfs_read_lock_root_node(root);
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}
2684
2685
2686/*
2687 * btrfs_search_slot - look for a key in a tree and perform necessary
2688 * modifications to preserve tree invariants.
2689 *
2690 * @trans:      Handle of transaction, used when modifying the tree
2691 * @p:          Holds all btree nodes along the search path
2692 * @root:       The root node of the tree
2693 * @key:        The key we are looking for
2694 * @ins_len:    Indicates purpose of search, for inserts it is 1, for
2695 *              deletions it's -1. 0 for plain searches
2696 * @cow:        boolean should CoW operations be performed. Must always be 1
2697 *              when modifying the tree.
2698 *
2699 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2700 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2701 *
2702 * If @key is found, 0 is returned and you can find the item in the leaf level
2703 * of the path (level 0)
2704 *
2705 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2706 * points to the slot where it should be inserted
2707 *
2708 * If an error is encountered while searching the tree a negative error number
2709 * is returned
2710 */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *key, struct btrfs_path *p,
		      int ins_len, int cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	/* Lowest level we may unlock as we walk down (1 keeps the leaf locked). */
	int lowest_unlock = 1;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	BUG_ON(!cow && ins_len);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers  Make sure we keep write
		 * for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	/* Plain read-only searches never need any write locks. */
	if (!cow)
		write_lock_level = -1;

	/* keep_locks / lowest_level searches must write lock everything. */
	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

again:
	/* prev_cmp = -1 forces a real binary search at the first level. */
	prev_cmp = -1;
	b = btrfs_search_slot_get_root(root, p, write_lock_level);
	if (IS_ERR(b)) {
		ret = PTR_ERR(b);
		goto done;
	}

	while (b) {
		level = btrfs_header_level(b);

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		if (cow) {
			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b)) {
				trans->dirty = true;
				goto cow_done;
			}

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			btrfs_set_path_blocking(p);
			/* The topmost possible level has no parent to pass. */
			if (last_level)
				err = btrfs_cow_block(trans, root, b, NULL, 0,
						      &b);
			else
				err = btrfs_cow_block(trans, root, b,
						      p->nodes[level + 1],
						      p->slots[level + 1], &b);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;
		/*
		 * Leave path with blocking locks to avoid massive
		 * lock context switch, this is made on purpose.
		 */

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		ret = key_search(b, key, level, &prev_cmp, &slot);
		if (ret < 0)
			goto done;

		if (level != 0) {
			int dec = 0;
			/*
			 * No exact match: the pointer to follow is the one
			 * just before the insertion position.
			 */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			err = setup_nodes_for_search(trans, root, p, b, level,
					     ins_len, &write_lock_level);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}
			/* Splitting/balancing may have moved us to a new node. */
			b = p->nodes[level];
			slot = p->slots[level];

			/*
			 * slot 0 is special, if we change the key
			 * we have to update the parent pointer
			 * which means we must have a write lock
			 * on the parent
			 */
			if (slot == 0 && ins_len &&
			    write_lock_level < level + 1) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			unlock_up(p, level, lowest_unlock,
				  min_write_lock_level, &write_lock_level);

			if (level == lowest_level) {
				/* Undo the slot decrement for the caller. */
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(root, p, &b, level,
						    slot, key);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			if (!p->skip_locking) {
				level = btrfs_header_level(b);
				if (level <= write_lock_level) {
					/*
					 * Try the non-blocking lock first and
					 * only fall back to a blocking lock
					 * (with the path set blocking) on
					 * contention.
					 */
					err = btrfs_try_tree_write_lock(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_lock(b);
					}
					p->locks[level] = BTRFS_WRITE_LOCK;
				} else {
					err = btrfs_tree_read_lock_atomic(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_read_lock(b);
					}
					p->locks[level] = BTRFS_READ_LOCK;
				}
				p->nodes[level] = b;
			}
		} else {
			/* Leaf level: split it if an insert would not fit. */
			p->slots[level] = slot;
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(fs_info, b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, NULL);
			goto done;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);
	return ret;
}
2948
2949/*
2950 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2951 * current state of the tree together with the operations recorded in the tree
2952 * modification log to search for the key in a previous version of this tree, as
2953 * denoted by the time_seq parameter.
2954 *
2955 * Naturally, there is no support for insert, delete or cow operations.
2956 *
2957 * The resulting path and return value will be set up as if we called
2958 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2959 */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;
	int prev_cmp = -1;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	/* Commit-root searches already see a stable tree; time_seq is moot. */
	if (p->search_commit_root) {
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	/* Root of the tree as it looked at @time_seq. */
	b = get_old_root(root, time_seq);
	if (!b) {
		ret = -EIO;
		goto done;
	}
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		level = btrfs_header_level(b);
		p->nodes[level] = b;

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		/*
		 * Since we can unwind ebs we want to do a real search every
		 * time.
		 */
		prev_cmp = -1;
		ret = key_search(b, key, level, &prev_cmp, &slot);

		if (level != 0) {
			int dec = 0;
			/* No exact match: follow the preceding pointer. */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);

			if (level == lowest_level) {
				/* Undo the decrement for the caller. */
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(root, p, &b, level,
						    slot, key);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			level = btrfs_header_level(b);
			err = btrfs_tree_read_lock_atomic(b);
			if (!err) {
				btrfs_set_path_blocking(p);
				btrfs_tree_read_lock(b);
			}
			/* Replay the mod log to roll b back to @time_seq. */
			b = tree_mod_log_rewind(fs_info, p, b, time_seq);
			if (!b) {
				ret = -ENOMEM;
				goto done;
			}
			p->locks[level] = BTRFS_READ_LOCK;
			p->nodes[level] = b;
		} else {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}
	}
	ret = 1;
done:
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}
3061
3062/*
3063 * helper to use instead of search slot if no exact match is needed but
3064 * instead the next or previous item should be returned.
3065 * When find_higher is true, the next higher item is returned, the next lower
3066 * otherwise.
3067 * When return_any and find_higher are both true, and no higher item is found,
3068 * return the next lower instead.
3069 * When return_any is true and find_higher is false, and no lower item is found,
3070 * return the next higher instead.
3071 * It returns 0 if any item is found, 1 if none is found (tree empty), and
3072 * < 0 on error
3073 */
3074int btrfs_search_slot_for_read(struct btrfs_root *root,
3075                               const struct btrfs_key *key,
3076                               struct btrfs_path *p, int find_higher,
3077                               int return_any)
3078{
3079        int ret;
3080        struct extent_buffer *leaf;
3081
3082again:
3083        ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3084        if (ret <= 0)
3085                return ret;
3086        /*
3087         * a return value of 1 means the path is at the position where the
3088         * item should be inserted. Normally this is the next bigger item,
3089         * but in case the previous item is the last in a leaf, path points
3090         * to the first free slot in the previous leaf, i.e. at an invalid
3091         * item.
3092         */
3093        leaf = p->nodes[0];
3094
3095        if (find_higher) {
3096                if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3097                        ret = btrfs_next_leaf(root, p);
3098                        if (ret <= 0)
3099                                return ret;
3100                        if (!return_any)
3101                                return 1;
3102                        /*
3103                         * no higher item found, return the next
3104                         * lower instead
3105                         */
3106                        return_any = 0;
3107                        find_higher = 0;
3108                        btrfs_release_path(p);
3109                        goto again;
3110                }
3111        } else {
3112                if (p->slots[0] == 0) {
3113                        ret = btrfs_prev_leaf(root, p);
3114                        if (ret < 0)
3115                                return ret;
3116                        if (!ret) {
3117                                leaf = p->nodes[0];
3118                                if (p->slots[0] == btrfs_header_nritems(leaf))
3119                                        p->slots[0]--;
3120                                return 0;
3121                        }
3122                        if (!return_any)
3123                                return 1;
3124                        /*
3125                         * no lower item found, return the next
3126                         * higher instead
3127                         */
3128                        return_any = 0;
3129                        find_higher = 1;
3130                        btrfs_release_path(p);
3131                        goto again;
3132                } else {
3133                        --p->slots[0];
3134                }
3135        }
3136        return 0;
3137}
3138
3139/*
3140 * adjust the pointers going up the tree, starting at level
3141 * making sure the right key of each node is points to 'key'.
3142 * This is used after shifting pointers to the left, so it stops
3143 * fixing up pointers when a given leaf/node is not in slot 0 of the
3144 * higher levels
3145 *
3146 */
3147static void fixup_low_keys(struct btrfs_path *path,
3148                           struct btrfs_disk_key *key, int level)
3149{
3150        int i;
3151        struct extent_buffer *t;
3152        int ret;
3153
3154        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3155                int tslot = path->slots[i];
3156
3157                if (!path->nodes[i])
3158                        break;
3159                t = path->nodes[i];
3160                ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3161                                GFP_ATOMIC);
3162                BUG_ON(ret < 0);
3163                btrfs_set_node_key(t, key, tslot);
3164                btrfs_mark_buffer_dirty(path->nodes[i]);
3165                if (tslot != 0)
3166                        break;
3167        }
3168}
3169
3170/*
3171 * update item key.
3172 *
3173 * This function isn't completely safe. It's the caller's responsibility
3174 * that the new key won't break the order
3175 */
3176void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3177                             struct btrfs_path *path,
3178                             const struct btrfs_key *new_key)
3179{
3180        struct btrfs_disk_key disk_key;
3181        struct extent_buffer *eb;
3182        int slot;
3183
3184        eb = path->nodes[0];
3185        slot = path->slots[0];
3186        if (slot > 0) {
3187                btrfs_item_key(eb, &disk_key, slot - 1);
3188                BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3189        }
3190        if (slot < btrfs_header_nritems(eb) - 1) {
3191                btrfs_item_key(eb, &disk_key, slot + 1);
3192                BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3193        }
3194
3195        btrfs_cpu_key_to_disk(&disk_key, new_key);
3196        btrfs_set_item_key(eb, &disk_key, slot);
3197        btrfs_mark_buffer_dirty(eb);
3198        if (slot == 0)
3199                fixup_low_keys(path, &disk_key, 1);
3200}
3201
/*
 * try to push data from one node into the next node left in the
 * tree.
 *
 * @empty non-zero means we're trying to empty @src completely (used
 * when deleting nodes); otherwise at least 8 pointers are left behind.
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
 */
static int push_node_left(struct btrfs_trans_handle *trans,
                          struct btrfs_fs_info *fs_info,
                          struct extent_buffer *dst,
                          struct extent_buffer *src, int empty)
{
        int push_items = 0;
        int src_nritems;
        int dst_nritems;
        int ret = 0;

        src_nritems = btrfs_header_nritems(src);
        dst_nritems = btrfs_header_nritems(dst);
        /* free pointer slots available in the destination node */
        push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
        /* both buffers must already be COWed in this transaction */
        WARN_ON(btrfs_header_generation(src) != trans->transid);
        WARN_ON(btrfs_header_generation(dst) != trans->transid);

        /* not worth pushing from an already nearly-empty node */
        if (!empty && src_nritems <= 8)
                return 1;

        if (push_items <= 0)
                return 1;

        if (empty) {
                push_items = min(src_nritems, push_items);
                if (push_items < src_nritems) {
                        /* leave at least 8 pointers in the node if
                         * we aren't going to empty it
                         */
                        if (src_nritems - push_items < 8) {
                                if (push_items <= 8)
                                        return 1;
                                push_items -= 8;
                        }
                }
        } else
                push_items = min(src_nritems - 8, push_items);

        /* must log the pointer moves before the buffers are modified */
        ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0,
                                   push_items);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                return ret;
        }
        /* append the first push_items pointers of src to the end of dst */
        copy_extent_buffer(dst, src,
                           btrfs_node_key_ptr_offset(dst_nritems),
                           btrfs_node_key_ptr_offset(0),
                           push_items * sizeof(struct btrfs_key_ptr));

        if (push_items < src_nritems) {
                /*
                 * Don't call tree_mod_log_insert_move here, key removal was
                 * already fully logged by tree_mod_log_eb_copy above.
                 */
                memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
                                      btrfs_node_key_ptr_offset(push_items),
                                      (src_nritems - push_items) *
                                      sizeof(struct btrfs_key_ptr));
        }
        btrfs_set_header_nritems(src, src_nritems - push_items);
        btrfs_set_header_nritems(dst, dst_nritems + push_items);
        btrfs_mark_buffer_dirty(src);
        btrfs_mark_buffer_dirty(dst);

        return ret;
}
3274
/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will  only push up to 1/2 the contents of the left node over
 */
static int balance_node_right(struct btrfs_trans_handle *trans,
                              struct btrfs_fs_info *fs_info,
                              struct extent_buffer *dst,
                              struct extent_buffer *src)
{
        int push_items = 0;
        int max_push;
        int src_nritems;
        int dst_nritems;
        int ret = 0;

        /* both buffers must already be COWed in this transaction */
        WARN_ON(btrfs_header_generation(src) != trans->transid);
        WARN_ON(btrfs_header_generation(dst) != trans->transid);

        src_nritems = btrfs_header_nritems(src);
        dst_nritems = btrfs_header_nritems(dst);
        /* free pointer slots available in the destination node */
        push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
        if (push_items <= 0)
                return 1;

        /* source too small to be worth balancing */
        if (src_nritems < 4)
                return 1;

        max_push = src_nritems / 2 + 1;
        /* don't try to empty the node */
        if (max_push >= src_nritems)
                return 1;

        if (max_push < push_items)
                push_items = max_push;

        /* log the shift of dst's existing pointers before moving them */
        ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
        BUG_ON(ret < 0);
        /* make room at the front of dst for the incoming pointers */
        memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
                                      btrfs_node_key_ptr_offset(0),
                                      (dst_nritems) *
                                      sizeof(struct btrfs_key_ptr));

        /* log the removal of the tail pointers from src */
        ret = tree_mod_log_eb_copy(fs_info, dst, src, 0,
                                   src_nritems - push_items, push_items);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                return ret;
        }
        /* copy the tail of src into the freshly-made room at dst's head */
        copy_extent_buffer(dst, src,
                           btrfs_node_key_ptr_offset(0),
                           btrfs_node_key_ptr_offset(src_nritems - push_items),
                           push_items * sizeof(struct btrfs_key_ptr));

        btrfs_set_header_nritems(src, src_nritems - push_items);
        btrfs_set_header_nritems(dst, dst_nritems + push_items);

        btrfs_mark_buffer_dirty(src);
        btrfs_mark_buffer_dirty(dst);

        return ret;
}
3341
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
 *
 * The path is updated so path->nodes[level] is the new (write-locked)
 * root and path->slots[level] is 0.
 *
 * returns zero on success or < 0 on failure.
 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct btrfs_path *path, int level)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 lower_gen;
        struct extent_buffer *lower;
        struct extent_buffer *c;
        struct extent_buffer *old;
        struct btrfs_disk_key lower_key;
        int ret;

        /* caller must be splitting the current root node */
        BUG_ON(path->nodes[level]);
        BUG_ON(path->nodes[level-1] != root->node);

        /* the new root's single key is the smallest key of the old root */
        lower = path->nodes[level-1];
        if (level == 1)
                btrfs_item_key(lower, &lower_key, 0);
        else
                btrfs_node_key(lower, &lower_key, 0);

        c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
                                         root->node->start, 0);
        if (IS_ERR(c))
                return PTR_ERR(c);

        root_add_used(root, fs_info->nodesize);

        btrfs_set_header_nritems(c, 1);
        btrfs_set_node_key(c, &lower_key, 0);
        btrfs_set_node_blockptr(c, 0, lower->start);
        lower_gen = btrfs_header_generation(lower);
        /* the old root should have been COWed in this transaction */
        WARN_ON(lower_gen != trans->transid);

        btrfs_set_node_ptr_generation(c, 0, lower_gen);

        btrfs_mark_buffer_dirty(c);

        /* swap in the new root; log it before publishing via RCU */
        old = root->node;
        ret = tree_mod_log_insert_root(root->node, c, 0);
        BUG_ON(ret < 0);
        rcu_assign_pointer(root->node, c);

        /* the super has an extra ref to root->node */
        free_extent_buffer(old);

        add_root_to_dirty_list(root);
        /* extra ref for the path's reference to the new root */
        extent_buffer_get(c);
        path->nodes[level] = c;
        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
        path->slots[level] = 0;
        return 0;
}
3402
/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
 *
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
 */
static void insert_ptr(struct btrfs_trans_handle *trans,
                       struct btrfs_fs_info *fs_info, struct btrfs_path *path,
                       struct btrfs_disk_key *key, u64 bytenr,
                       int slot, int level)
{
        struct extent_buffer *lower;
        int nritems;
        int ret;

        BUG_ON(!path->nodes[level]);
        btrfs_assert_tree_locked(path->nodes[level]);
        lower = path->nodes[level];
        nritems = btrfs_header_nritems(lower);
        BUG_ON(slot > nritems);
        /* caller guarantees there is room for one more pointer */
        BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
        if (slot != nritems) {
                /* only internal nodes (level > 0) are tree-mod-logged */
                if (level) {
                        ret = tree_mod_log_insert_move(lower, slot + 1, slot,
                                        nritems - slot);
                        BUG_ON(ret < 0);
                }
                /* shift existing pointers right to open up @slot */
                memmove_extent_buffer(lower,
                              btrfs_node_key_ptr_offset(slot + 1),
                              btrfs_node_key_ptr_offset(slot),
                              (nritems - slot) * sizeof(struct btrfs_key_ptr));
        }
        if (level) {
                ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
                                GFP_NOFS);
                BUG_ON(ret < 0);
        }
        btrfs_set_node_key(lower, key, slot);
        btrfs_set_node_blockptr(lower, slot, bytenr);
        WARN_ON(trans->transid == 0);
        btrfs_set_node_ptr_generation(lower, slot, trans->transid);
        btrfs_set_header_nritems(lower, nritems + 1);
        btrfs_mark_buffer_dirty(lower);
}
3448
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
 *
 * returns 0 on success and < 0 on failure
 */
static noinline int split_node(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct btrfs_path *path, int level)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *c;
        struct extent_buffer *split;
        struct btrfs_disk_key disk_key;
        int mid;
        int ret;
        u32 c_nritems;

        c = path->nodes[level];
        /* node must have been COWed in this transaction */
        WARN_ON(btrfs_header_generation(c) != trans->transid);
        if (c == root->node) {
                /*
                 * trying to split the root, lets make a new one
                 *
                 * tree mod log: We don't log_removal old root in
                 * insert_new_root, because that root buffer will be kept as a
                 * normal node. We are going to log removal of half of the
                 * elements below with tree_mod_log_eb_copy. We're holding a
                 * tree lock on the buffer, which is why we cannot race with
                 * other tree_mod_log users.
                 */
                ret = insert_new_root(trans, root, path, level + 1);
                if (ret)
                        return ret;
        } else {
                /* try to avoid the split by balancing with siblings */
                ret = push_nodes_for_insert(trans, root, path, level);
                c = path->nodes[level];
                if (!ret && btrfs_header_nritems(c) <
                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
                        return 0;
                if (ret < 0)
                        return ret;
        }

        /* move the upper half of c into a new sibling node */
        c_nritems = btrfs_header_nritems(c);
        mid = (c_nritems + 1) / 2;
        btrfs_node_key(c, &disk_key, mid);

        split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
                                             c->start, 0);
        if (IS_ERR(split))
                return PTR_ERR(split);

        root_add_used(root, fs_info->nodesize);
        ASSERT(btrfs_header_level(c) == level);

        /* log the removal of the upper half before copying it out */
        ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                return ret;
        }
        copy_extent_buffer(split, c,
                           btrfs_node_key_ptr_offset(0),
                           btrfs_node_key_ptr_offset(mid),
                           (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
        btrfs_set_header_nritems(split, c_nritems - mid);
        btrfs_set_header_nritems(c, mid);
        ret = 0;

        btrfs_mark_buffer_dirty(c);
        btrfs_mark_buffer_dirty(split);

        /* link the new node into the parent, right after c */
        insert_ptr(trans, fs_info, path, &disk_key, split->start,
                   path->slots[level + 1] + 1, level + 1);

        /* fix up the path to reference whichever half holds our slot */
        if (path->slots[level] >= mid) {
                path->slots[level] -= mid;
                btrfs_tree_unlock(c);
                free_extent_buffer(c);
                path->nodes[level] = split;
                path->slots[level + 1] += 1;
        } else {
                btrfs_tree_unlock(split);
                free_extent_buffer(split);
        }
        return ret;
}
3539
3540/*
3541 * how many bytes are required to store the items in a leaf.  start
3542 * and nr indicate which items in the leaf to check.  This totals up the
3543 * space used both by the item structs and the item data
3544 */
3545static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3546{
3547        struct btrfs_item *start_item;
3548        struct btrfs_item *end_item;
3549        struct btrfs_map_token token;
3550        int data_len;
3551        int nritems = btrfs_header_nritems(l);
3552        int end = min(nritems, start + nr) - 1;
3553
3554        if (!nr)
3555                return 0;
3556        btrfs_init_map_token(&token);
3557        start_item = btrfs_item_nr(start);
3558        end_item = btrfs_item_nr(end);
3559        data_len = btrfs_token_item_offset(l, start_item, &token) +
3560                btrfs_token_item_size(l, start_item, &token);
3561        data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3562        data_len += sizeof(struct btrfs_item) * nr;
3563        WARN_ON(data_len < 0);
3564        return data_len;
3565}
3566
3567/*
3568 * The space between the end of the leaf items and
3569 * the start of the leaf data.  IOW, how much room
3570 * the leaf has left for both items and data
3571 */
3572noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
3573                                   struct extent_buffer *leaf)
3574{
3575        int nritems = btrfs_header_nritems(leaf);
3576        int ret;
3577
3578        ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3579        if (ret < 0) {
3580                btrfs_crit(fs_info,
3581                           "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3582                           ret,
3583                           (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3584                           leaf_space_used(leaf, 0, nritems), nritems);
3585        }
3586        return ret;
3587}
3588
/*
 * min slot controls the lowest index we're willing to push to the
 * right.  We'll push up to and including min_slot, but no lower
 *
 * @right must be locked and already COWed by the caller; on success it
 * may become path->nodes[0].  Returns 0 if items were pushed, 1 if
 * nothing fit (right is unlocked/released in that case).
 */
static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
                                      struct btrfs_path *path,
                                      int data_size, int empty,
                                      struct extent_buffer *right,
                                      int free_space, u32 left_nritems,
                                      u32 min_slot)
{
        struct extent_buffer *left = path->nodes[0];
        struct extent_buffer *upper = path->nodes[1];
        struct btrfs_map_token token;
        struct btrfs_disk_key disk_key;
        int slot;
        u32 i;
        int push_space = 0;
        int push_items = 0;
        struct btrfs_item *item;
        u32 nr;
        u32 right_nritems;
        u32 data_end;
        u32 this_item_size;

        btrfs_init_map_token(&token);

        /* when not emptying, always keep at least item 0 (or min_slot) */
        if (empty)
                nr = 0;
        else
                nr = max_t(u32, 1, min_slot);

        /* reserve room for the pending insert if it lands past the end */
        if (path->slots[0] >= left_nritems)
                push_space += data_size;

        slot = path->slots[1];
        /* walk backwards from the last item, counting what fits in right */
        i = left_nritems - 1;
        while (i >= nr) {
                item = btrfs_item_nr(i);

                if (!empty && push_items > 0) {
                        /* never push the slot the caller is about to use */
                        if (path->slots[0] > i)
                                break;
                        if (path->slots[0] == i) {
                                int space = btrfs_leaf_free_space(fs_info, left);
                                if (space + push_space * 2 > free_space)
                                        break;
                        }
                }

                if (path->slots[0] == i)
                        push_space += data_size;

                this_item_size = btrfs_item_size(left, item);
                if (this_item_size + sizeof(*item) + push_space > free_space)
                        break;

                push_items++;
                push_space += this_item_size + sizeof(*item);
                if (i == 0)
                        break;
                i--;
        }

        if (push_items == 0)
                goto out_unlock;

        WARN_ON(!empty && push_items == left_nritems);

        /* push left to right */
        right_nritems = btrfs_header_nritems(right);

        /* total data bytes being transferred */
        push_space = btrfs_item_end_nr(left, left_nritems - push_items);
        push_space -= leaf_data_end(fs_info, left);

        /* make room in the right data area */
        data_end = leaf_data_end(fs_info, right);
        memmove_extent_buffer(right,
                              BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
                              BTRFS_LEAF_DATA_OFFSET + data_end,
                              BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

        /* copy from the left data area */
        copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
                     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
                     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
                     push_space);

        /* shift right's item headers over to make room at the front */
        memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
                              btrfs_item_nr_offset(0),
                              right_nritems * sizeof(struct btrfs_item));

        /* copy the items from left to right */
        copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
                   btrfs_item_nr_offset(left_nritems - push_items),
                   push_items * sizeof(struct btrfs_item));

        /* update the item pointers */
        right_nritems += push_items;
        btrfs_set_header_nritems(right, right_nritems);
        /* recompute every data offset in right from the top down */
        push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
        for (i = 0; i < right_nritems; i++) {
                item = btrfs_item_nr(i);
                push_space -= btrfs_token_item_size(right, item, &token);
                btrfs_set_token_item_offset(right, item, push_space, &token);
        }

        left_nritems -= push_items;
        btrfs_set_header_nritems(left, left_nritems);

        if (left_nritems)
                btrfs_mark_buffer_dirty(left);
        else
                clean_tree_block(fs_info, left);

        btrfs_mark_buffer_dirty(right);

        /* right's new first key must be reflected in the parent node */
        btrfs_item_key(right, &disk_key, 0);
        btrfs_set_node_key(upper, &disk_key, slot + 1);
        btrfs_mark_buffer_dirty(upper);

        /* then fixup the leaf pointer in the path */
        if (path->slots[0] >= left_nritems) {
                path->slots[0] -= left_nritems;
                if (btrfs_header_nritems(path->nodes[0]) == 0)
                        clean_tree_block(fs_info, path->nodes[0]);
                btrfs_tree_unlock(path->nodes[0]);
                free_extent_buffer(path->nodes[0]);
                path->nodes[0] = right;
                path->slots[1] += 1;
        } else {
                btrfs_tree_unlock(right);
                free_extent_buffer(right);
        }
        return 0;

out_unlock:
        btrfs_tree_unlock(right);
        free_extent_buffer(right);
        return 1;
}
3730
/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
 *
 * this will push starting from min_slot to the end of the leaf.  It won't
 * push any slot lower than min_slot
 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
                           *root, struct btrfs_path *path,
                           int min_data_size, int data_size,
                           int empty, u32 min_slot)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *left = path->nodes[0];
        struct extent_buffer *right;
        struct extent_buffer *upper;
        int slot;
        int free_space;
        u32 left_nritems;
        int ret;

        /* no parent means we're at the root leaf, nothing to push to */
        if (!path->nodes[1])
                return 1;

        slot = path->slots[1];
        upper = path->nodes[1];
        /* no right sibling under the same parent */
        if (slot >= btrfs_header_nritems(upper) - 1)
                return 1;

        btrfs_assert_tree_locked(path->nodes[1]);

        right = read_node_slot(fs_info, upper, slot + 1);
        /*
         * slot + 1 is not valid or we fail to read the right node,
         * no big deal, just return.
         */
        if (IS_ERR(right))
                return 1;

        btrfs_tree_lock(right);
        btrfs_set_lock_blocking(right);

        free_space = btrfs_leaf_free_space(fs_info, right);
        if (free_space < data_size)
                goto out_unlock;

        /* cow and double check */
        ret = btrfs_cow_block(trans, root, right, upper,
                              slot + 1, &right);
        if (ret)
                goto out_unlock;

        /* COW can change free space, so re-check after it */
        free_space = btrfs_leaf_free_space(fs_info, right);
        if (free_space < data_size)
                goto out_unlock;

        left_nritems = btrfs_header_nritems(left);
        if (left_nritems == 0)
                goto out_unlock;

        if (path->slots[0] == left_nritems && !empty) {
                /* Key greater than all keys in the leaf, right neighbor has
                 * enough room for it and we're not emptying our leaf to delete
                 * it, therefore use right neighbor to insert the new item and
                 * no need to touch/dirty our left leaf. */
                btrfs_tree_unlock(left);
                free_extent_buffer(left);
                path->nodes[0] = right;
                path->slots[0] = 0;
                path->slots[1]++;
                return 0;
        }

        return __push_leaf_right(fs_info, path, min_data_size, empty,
                                right, free_space, left_nritems, min_slot);
out_unlock:
        btrfs_tree_unlock(right);
        free_extent_buffer(right);
        return 1;
}
3814
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
 * items
 *
 * @left must be locked and already COWed by the caller; it is unlocked
 * and released here unless it becomes path->nodes[0].
 */
static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
                                     struct btrfs_path *path, int data_size,
                                     int empty, struct extent_buffer *left,
                                     int free_space, u32 right_nritems,
                                     u32 max_slot)
{
        struct btrfs_disk_key disk_key;
        struct extent_buffer *right = path->nodes[0];
        int i;
        int push_space = 0;
        int push_items = 0;
        struct btrfs_item *item;
        u32 old_left_nritems;
        u32 nr;
        int ret = 0;
        u32 this_item_size;
        u32 old_left_item_size;
        struct btrfs_map_token token;

        btrfs_init_map_token(&token);

        /* when not emptying, always keep right's last item in place */
        if (empty)
                nr = min(right_nritems, max_slot);
        else
                nr = min(right_nritems - 1, max_slot);

        /* walk forward from item 0, counting what fits into left */
        for (i = 0; i < nr; i++) {
                item = btrfs_item_nr(i);

                if (!empty && push_items > 0) {
                        /* never push past the slot the caller is using */
                        if (path->slots[0] < i)
                                break;
                        if (path->slots[0] == i) {
                                int space = btrfs_leaf_free_space(fs_info, right);
                                if (space + push_space * 2 > free_space)
                                        break;
                        }
                }

                if (path->slots[0] == i)
                        push_space += data_size;

                this_item_size = btrfs_item_size(right, item);
                if (this_item_size + sizeof(*item) + push_space > free_space)
                        break;

                push_items++;
                push_space += this_item_size + sizeof(*item);
        }

        if (push_items == 0) {
                ret = 1;
                goto out;
        }
        WARN_ON(!empty && push_items == btrfs_header_nritems(right));

        /* push data from right to left */
        copy_extent_buffer(left, right,
                           btrfs_item_nr_offset(btrfs_header_nritems(left)),
                           btrfs_item_nr_offset(0),
                           push_items * sizeof(struct btrfs_item));

        /* total data bytes being transferred */
        push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
                     btrfs_item_offset_nr(right, push_items - 1);

        copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
                     leaf_data_end(fs_info, left) - push_space,
                     BTRFS_LEAF_DATA_OFFSET +
                     btrfs_item_offset_nr(right, push_items - 1),
                     push_space);
        old_left_nritems = btrfs_header_nritems(left);
        BUG_ON(old_left_nritems <= 0);

        /* rebase the copied items' data offsets into left's data area */
        old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
        for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
                u32 ioff;

                item = btrfs_item_nr(i);

                ioff = btrfs_token_item_offset(left, item, &token);
                btrfs_set_token_item_offset(left, item,
                      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
                      &token);
        }
        btrfs_set_header_nritems(left, old_left_nritems + push_items);

        /* fixup right node */
        if (push_items > right_nritems)
                WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
                       right_nritems);

        if (push_items < right_nritems) {
                /* compact right's remaining data against the end */
                push_space = btrfs_item_offset_nr(right, push_items - 1) -
                                                  leaf_data_end(fs_info, right);
                memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
                                      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
                                      BTRFS_LEAF_DATA_OFFSET +
                                      leaf_data_end(fs_info, right), push_space);

                /* and shift its item headers down to slot 0 */
                memmove_extent_buffer(right, btrfs_item_nr_offset(0),
                              btrfs_item_nr_offset(push_items),
                             (btrfs_header_nritems(right) - push_items) *
                             sizeof(struct btrfs_item));
        }
        right_nritems -= push_items;
        btrfs_set_header_nritems(right, right_nritems);
        /* recompute every data offset in right from the top down */
        push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
        for (i = 0; i < right_nritems; i++) {
                item = btrfs_item_nr(i);

                push_space = push_space - btrfs_token_item_size(right,
                                                                item, &token);
                btrfs_set_token_item_offset(right, item, push_space, &token);
        }

        btrfs_mark_buffer_dirty(left);
        if (right_nritems)
                btrfs_mark_buffer_dirty(right);
        else
                clean_tree_block(fs_info, right);

        /* right's new smallest key must propagate up through the parents */
        btrfs_item_key(right, &disk_key, 0);
        fixup_low_keys(path, &disk_key, 1);

        /* then fixup the leaf pointer in the path */
        if (path->slots[0] < push_items) {
                path->slots[0] += old_left_nritems;
                btrfs_tree_unlock(path->nodes[0]);
                free_extent_buffer(path->nodes[0]);
                path->nodes[0] = left;
                path->slots[1] -= 1;
        } else {
                btrfs_tree_unlock(left);
                free_extent_buffer(left);
                path->slots[0] -= push_items;
        }
        BUG_ON(path->slots[0] < 0);
        return ret;
out:
        btrfs_tree_unlock(left);
        free_extent_buffer(left);
        return ret;
}
3966
3967/*
3968 * push some data in the path leaf to the left, trying to free up at
3969 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3970 *
3971 * max_slot can put a limit on how far into the leaf we'll push items.  The
3972 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3973 * items
3974 */
3975static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3976                          *root, struct btrfs_path *path, int min_data_size,
3977                          int data_size, int empty, u32 max_slot)
3978{
3979        struct btrfs_fs_info *fs_info = root->fs_info;
3980        struct extent_buffer *right = path->nodes[0];
3981        struct extent_buffer *left;
3982        int slot;
3983        int free_space;
3984        u32 right_nritems;
3985        int ret = 0;
3986
3987        slot = path->slots[1];
3988        if (slot == 0)
3989                return 1;
3990        if (!path->nodes[1])
3991                return 1;
3992
3993        right_nritems = btrfs_header_nritems(right);
3994        if (right_nritems == 0)
3995                return 1;
3996
3997        btrfs_assert_tree_locked(path->nodes[1]);
3998
3999        left = read_node_slot(fs_info, path->nodes[1], slot - 1);
4000        /*
4001         * slot - 1 is not valid or we fail to read the left node,
4002         * no big deal, just return.
4003         */
4004        if (IS_ERR(left))
4005                return 1;
4006
4007        btrfs_tree_lock(left);
4008        btrfs_set_lock_blocking(left);
4009
4010        free_space = btrfs_leaf_free_space(fs_info, left);
4011        if (free_space < data_size) {
4012                ret = 1;
4013                goto out;
4014        }
4015
4016        /* cow and double check */
4017        ret = btrfs_cow_block(trans, root, left,
4018                              path->nodes[1], slot - 1, &left);
4019        if (ret) {
4020                /* we hit -ENOSPC, but it isn't fatal here */
4021                if (ret == -ENOSPC)
4022                        ret = 1;
4023                goto out;
4024        }
4025
4026        free_space = btrfs_leaf_free_space(fs_info, left);
4027        if (free_space < data_size) {
4028                ret = 1;
4029                goto out;
4030        }
4031
4032        return __push_leaf_left(fs_info, path, min_data_size,
4033                               empty, left, free_space, right_nritems,
4034                               max_slot);
4035out:
4036        btrfs_tree_unlock(left);
4037        free_extent_buffer(left);
4038        return ret;
4039}
4040
4041/*
4042 * split the path's leaf in two, making sure there is at least data_size
4043 * available for the resulting leaf level of the path.
4044 */
/*
 * Move the items from slot 'mid' to the end of leaf 'l' into the freshly
 * allocated (already locked) leaf 'right', then point the path at whichever
 * leaf now contains the target slot.  Consumes one reference: either the
 * old leaf's or 'right''s, depending on which side the path ends up on.
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_fs_info *fs_info,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	/* 'right' receives the last (nritems - mid) items of 'l'. */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l);

	/* Copy the item headers into the front of the new leaf ... */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* ... and the item data, packed against the end of the new leaf. */
	copy_extent_buffer(right, l,
		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(fs_info, l), data_copy_size);

	/*
	 * The data moved towards the end of the leaf by rt_data_off bytes;
	 * rewrite the copied item offsets to match their new location.
	 */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);

	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(i);
		u32 ioff;

		ioff = btrfs_token_item_offset(right, item, &token);
		btrfs_set_token_item_offset(right, item,
					    ioff + rt_data_off, &token);
	}

	/* Truncate the old leaf and link 'right' into the parent node. */
	btrfs_set_header_nritems(l, mid);
	btrfs_item_key(right, &disk_key, 0);
	insert_ptr(trans, fs_info, path, &disk_key, right->start,
		   path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/*
	 * If the target slot moved into the new leaf, repoint the path at
	 * 'right' and drop the old leaf; otherwise drop 'right', which we
	 * no longer need here.
	 */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}
4106
4107/*
4108 * double splits happen when we need to insert a big item in the middle
4109 * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4110 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4111 *          A                 B                 C
4112 *
4113 * We avoid this by trying to push the items on either side of our target
4114 * into the adjacent leaves.  If all goes well we can avoid the double split
4115 * completely.
4116 */
4117static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4118                                          struct btrfs_root *root,
4119                                          struct btrfs_path *path,
4120                                          int data_size)
4121{
4122        struct btrfs_fs_info *fs_info = root->fs_info;
4123        int ret;
4124        int progress = 0;
4125        int slot;
4126        u32 nritems;
4127        int space_needed = data_size;
4128
4129        slot = path->slots[0];
4130        if (slot < btrfs_header_nritems(path->nodes[0]))
4131                space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4132
4133        /*
4134         * try to push all the items after our slot into the
4135         * right leaf
4136         */
4137        ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4138        if (ret < 0)
4139                return ret;
4140
4141        if (ret == 0)
4142                progress++;
4143
4144        nritems = btrfs_header_nritems(path->nodes[0]);
4145        /*
4146         * our goal is to get our slot at the start or end of a leaf.  If
4147         * we've done so we're done
4148         */
4149        if (path->slots[0] == 0 || path->slots[0] == nritems)
4150                return 0;
4151
4152        if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4153                return 0;
4154
4155        /* try to push all the items before our slot into the next leaf */
4156        slot = path->slots[0];
4157        space_needed = data_size;
4158        if (slot > 0)
4159                space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4160        ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4161        if (ret < 0)
4162                return ret;
4163
4164        if (ret == 0)
4165                progress++;
4166
4167        if (progress)
4168                return 0;
4169        return 1;
4170}
4171
4172/*
4173 * split the path's leaf in two, making sure there is at least data_size
4174 * available for the resulting leaf level of the path.
4175 *
4176 * returns 0 if all went well and < 0 on failure.
4177 */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       const struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int wret;
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	/* An extended item can never exceed one leaf, even standing alone. */
	if (extend && data_size + btrfs_item_size_nr(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size && path->nodes[1]) {
		int space_needed = data_size;

		if (slot < btrfs_header_nritems(l))
			space_needed -= btrfs_leaf_free_space(fs_info, l);

		wret = push_leaf_right(trans, root, path, space_needed,
				       space_needed, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			space_needed = data_size;
			if (slot > 0)
				space_needed -= btrfs_leaf_free_space(fs_info,
								      l);
			wret = push_leaf_left(trans, root, path, space_needed,
					      space_needed, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(fs_info, l) >= data_size)
			return 0;
	}

	/* Splitting the root leaf: grow the tree by one level first. */
	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	/*
	 * split == 1: split at 'mid', copying the tail into a new leaf.
	 * split == 0: don't copy anything; hand back an empty new leaf for
	 *             the insertion (the target slot sits at a leaf edge).
	 * split == 2: after one split neither half would have room, so a
	 *             second pass ("double split") is required.
	 */
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	if (mid <= slot) {
		/* Target lands in the right half. */
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (slot >= nritems) {
				split = 0;
			} else {
				/* Split at the slot itself instead. */
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		/* Target lands in the left half. */
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (!extend && data_size && slot == 0) {
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	}

	/* Key for the new leaf: insertion key if it stays empty. */
	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
					     l->start, 0);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, fs_info->nodesize);

	if (split == 0) {
		/* Link the empty new leaf after or before the current one. */
		if (mid <= slot) {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, fs_info, path, &disk_key,
				   right->start, path->slots[1] + 1, 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, fs_info, path, &disk_key,
				   right->start, path->slots[1], 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			if (path->slots[1] == 0)
				fixup_low_keys(path, &disk_key, 1);
		}
		/*
		 * We create a new leaf 'right' for the required ins_len and
		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
		 * the content of ins_len to 'right'.
		 */
		return ret;
	}

	copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems);

	if (split == 2) {
		/* One double split at most; a third round would be a bug. */
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return 0;

push_for_double:
	/* Best effort: push neighbours to avoid a double split entirely. */
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
		return 0;
	goto again;
}
4338
/*
 * Make sure the leaf holding the item at path->slots[0] has at least
 * ins_len bytes free, dropping the path, re-searching and splitting the
 * leaf if needed.  Returns 0 on success with the path repositioned on the
 * item, -EAGAIN if the item changed (or the leaf gained room by itself)
 * while the path was dropped, or another negative errno on failure.
 */
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path, int ins_len)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	u64 extent_len = 0;
	u32 item_size;
	int ret;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	/* Only file extent and csum items are ever split in place. */
	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
	       key.type != BTRFS_EXTENT_CSUM_KEY);

	if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len)
		return 0;

	/*
	 * Remember the item's size (and, for file extents, its byte count)
	 * so we can detect whether it changed while the path was dropped.
	 */
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
	}
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->search_for_split = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	path->search_for_split = 0;
	/* A positive return means the exact key is gone. */
	if (ret > 0)
		ret = -EAGAIN;
	if (ret < 0)
		goto err;

	ret = -EAGAIN;
	leaf = path->nodes[0];
	/* if our item isn't there, return now */
	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
		goto err;

	/* the leaf has changed, it now has room.  return now */
	if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	btrfs_set_path_blocking(path);
	/* extend == 1: the item itself must still fit beside ins_len. */
	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}
4406
/*
 * Split the item at path->slots[0] in two at byte 'split_offset', giving
 * 'new_key' to the second half.  The leaf must already have room for one
 * extra struct btrfs_item (setup_leaf_for_split() guarantees this).
 * Returns 0 on success or -ENOMEM.
 */
static noinline int split_item(struct btrfs_fs_info *fs_info,
			       struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* Stash a copy of the item data so it can be rewritten in halves. */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	/* Open a header slot right after the original item. */
	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(slot);

	/*
	 * Both halves reuse the original data area: the new item keeps the
	 * tail (from split_offset on), the old item keeps the head.
	 */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0);
	kfree(buf);
	return 0;
}
4476
4477/*
4478 * This function splits a single item into two items,
4479 * giving 'new_key' to the new item and splitting the
4480 * old one at split_offset (from the start of the item).
4481 *
4482 * The path may be released by this operation.  After
4483 * the split, the path is pointing to the old item.  The
4484 * new item is going to be in the same node as the old one.
4485 *
 * Note, the item being split must be small enough to live alone on
4487 * a tree block with room for one extra struct btrfs_item
4488 *
4489 * This allows us to split the item in place, keeping a lock on the
4490 * leaf the entire time.
4491 */
4492int btrfs_split_item(struct btrfs_trans_handle *trans,
4493                     struct btrfs_root *root,
4494                     struct btrfs_path *path,
4495                     const struct btrfs_key *new_key,
4496                     unsigned long split_offset)
4497{
4498        int ret;
4499        ret = setup_leaf_for_split(trans, root, path,
4500                                   sizeof(struct btrfs_item));
4501        if (ret)
4502                return ret;
4503
4504        ret = split_item(root->fs_info, path, new_key, split_offset);
4505        return ret;
4506}
4507
/*
 * This function duplicates an item, giving 'new_key' to the new item.
 * It guarantees both items live in the same tree leaf and the new item
 * is contiguous with the original item.
 *
 * This allows us to split a file extent in place, keeping a lock on the
 * leaf the entire time.
 */
4516int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4517                         struct btrfs_root *root,
4518                         struct btrfs_path *path,
4519                         const struct btrfs_key *new_key)
4520{
4521        struct extent_buffer *leaf;
4522        int ret;
4523        u32 item_size;
4524
4525        leaf = path->nodes[0];
4526        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4527        ret = setup_leaf_for_split(trans, root, path,
4528                                   item_size + sizeof(struct btrfs_item));
4529        if (ret)
4530                return ret;
4531
4532        path->slots[0]++;
4533        setup_items_for_insert(root, path, new_key, &item_size,
4534                               item_size, item_size +
4535                               sizeof(struct btrfs_item), 1);
4536        leaf = path->nodes[0];
4537        memcpy_extent_buffer(leaf,
4538                             btrfs_item_ptr_offset(leaf, path->slots[0]),
4539                             btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4540                             item_size);
4541        return 0;
4542}
4543
4544/*
4545 * make the item pointed to by the path smaller.  new_size indicates
4546 * how small to make it, and from_end tells us if we just chop bytes
4547 * off the end of the item or if we shift the item to chop bytes off
4548 * the front.
4549 */
void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
			 struct btrfs_path *path, u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(fs_info, leaf);

	old_data_start = btrfs_item_offset_nr(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff + size_diff, &token);
	}

	/* shift the data */
	if (from_end) {
		/*
		 * Chopping the tail: slide the data of this and all later
		 * items towards the end of the leaf by size_diff bytes.
		 */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			/*
			 * Inline file extents keep their fixed-size header
			 * at the front; move it to the item's new start so
			 * only inline data bytes are truncated away.
			 */
			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		/* Chopping the front: only later items' data needs to move. */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start - data_end);

		/* The item now starts size_diff bytes later; bump its key. */
		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(path, &disk_key, 1);
	}

	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4645
4646/*
4647 * make the item pointed to by the path bigger, data_size is the added size.
4648 */
void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		       u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(fs_info, leaf);

	/* The caller must have made sure there is enough free space. */
	if (btrfs_leaf_free_space(fs_info, leaf) < data_size) {
		btrfs_print_leaf(leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_end_nr(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "slot %d too large, nritems %d",
			   slot, nritems);
		BUG_ON(1);
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff - data_size, &token);
	}

	/*
	 * Shift the data of this and all later items towards the front of
	 * the leaf by data_size bytes, growing the item at its tail.
	 */
	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
		      data_end, old_data - data_end);

	data_end = old_data;
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4713
4714/*
4715 * this is a helper for btrfs_insert_empty_items, the main goal here is
4716 * to save stack depth by doing the bulk of the work in a function
4717 * that doesn't call btrfs_search_slot
4718 */
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
			    const struct btrfs_key *cpu_key, u32 *data_size,
			    u32 total_data, u32 total_size, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_item *item;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;
	struct btrfs_map_token token;

	/* Inserting at slot 0 changes the leaf's lowest key; fix parents. */
	if (path->slots[0] == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
		fixup_low_keys(path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(fs_info, leaf);

	/* The caller must already have made enough room. */
	if (btrfs_leaf_free_space(fs_info, leaf) < total_size) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "not enough freespace need %u have %d",
			   total_size, btrfs_leaf_free_space(fs_info, leaf));
		BUG();
	}

	/* Inserting in the middle: open a gap for headers and data. */
	if (slot != nritems) {
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);

		if (old_data < data_end) {
			btrfs_print_leaf(leaf);
			btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
				   slot, old_data, data_end);
			BUG_ON(1);
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(leaf, item, &token);
			btrfs_set_token_item_offset(leaf, item,
						    ioff - total_data, &token);
		}
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data - data_end);
		data_end = old_data;
	}

	/* setup the item for the new data */
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(slot + i);
		btrfs_set_token_item_offset(leaf, item,
					    data_end - data_size[i], &token);
		data_end -= data_size[i];
		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
	}

	btrfs_set_header_nritems(leaf, nritems + nr);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4806
4807/*
4808 * Given a key and some data, insert items into the tree.
4809 * This does all the path init required, making room in the tree if needed.
4810 */
4811int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4812                            struct btrfs_root *root,
4813                            struct btrfs_path *path,
4814                            const struct btrfs_key *cpu_key, u32 *data_size,
4815                            int nr)
4816{
4817        int ret = 0;
4818        int slot;
4819        int i;
4820        u32 total_size = 0;
4821        u32 total_data = 0;
4822
4823        for (i = 0; i < nr; i++)
4824                total_data += data_size[i];
4825
4826        total_size = total_data + (nr * sizeof(struct btrfs_item));
4827        ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4828        if (ret == 0)
4829                return -EEXIST;
4830        if (ret < 0)
4831                return ret;
4832
4833        slot = path->slots[0];
4834        BUG_ON(slot < 0);
4835
4836        setup_items_for_insert(root, path, cpu_key, data_size,
4837                               total_data, total_size, nr);
4838        return 0;
4839}
4840
4841/*
4842 * Given a key and some data, insert an item into the tree.
4843 * This does all the path init required, making room in the tree if needed.
4844 */
4845int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4846                      const struct btrfs_key *cpu_key, void *data,
4847                      u32 data_size)
4848{
4849        int ret = 0;
4850        struct btrfs_path *path;
4851        struct extent_buffer *leaf;
4852        unsigned long ptr;
4853
4854        path = btrfs_alloc_path();
4855        if (!path)
4856                return -ENOMEM;
4857        ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4858        if (!ret) {
4859                leaf = path->nodes[0];
4860                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4861                write_extent_buffer(leaf, data, ptr, data_size);
4862                btrfs_mark_buffer_dirty(leaf);
4863        }
4864        btrfs_free_path(path);
4865        return ret;
4866}
4867
/*
 * delete the pointer from a given node.
 *
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
 */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		/*
		 * Record the move in the tree mod log before shifting the
		 * key pointers (only interior nodes are logged).
		 */
		if (level) {
			ret = tree_mod_log_insert_move(parent, slot, slot + 1,
					nritems - slot - 1);
			BUG_ON(ret < 0);
		}
		/* Close the gap left by the removed key pointer. */
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		/* Removing the last slot: no move, just log the removal. */
		ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
				GFP_NOFS);
		BUG_ON(ret < 0);
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		struct btrfs_disk_key disk_key;

		/*
		 * The node's lowest key changed, propagate the new first
		 * key to the parent levels.
		 */
		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(parent);
}
4913
/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing.  path->nodes[1] must be locked.
 */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *leaf)
{
	/* The leaf must already be COWed in the current transaction. */
	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	del_ptr(root, path, 1, path->slots[1]);

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	/* Take an extra ref so the buffer stays valid across the free below. */
	extent_buffer_get(leaf);
	btrfs_free_tree_block(trans, root, leaf, 0, 1);
	free_extent_buffer_stale(leaf);
}
/*
 * delete the item at the leaf level in path.  If that empties
 * the leaf, remove it from the tree
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 last_off;
	u32 dsize = 0;
	int ret = 0;
	int wret;
	int i;
	u32 nritems;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	/* Total data bytes occupied by the items being deleted. */
	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

	nritems = btrfs_header_nritems(leaf);

	/* Items follow the deleted range: compact data and headers. */
	if (slot + nr != nritems) {
		int data_end = leaf_data_end(fs_info, leaf);

		/* Slide the remaining item data up over the freed space. */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + dsize,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      last_off - data_end);

		/* Re-point the moved items at their new data offsets. */
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(leaf, item, &token);
			btrfs_set_token_item_offset(leaf, item,
						    ioff + dsize, &token);
		}

		/* Close the gap in the item header array. */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
			      btrfs_item_nr_offset(slot + nr),
			      sizeof(struct btrfs_item) *
			      (nritems - slot - nr));
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			/* An empty root leaf stays; just reset its level. */
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_set_path_blocking(path);
			clean_tree_block(fs_info, leaf);
			btrfs_del_leaf(trans, root, path, leaf);
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);
		if (slot == 0) {
			struct btrfs_disk_key disk_key;

			/* First item went away, fix the keys upwards. */
			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(path, &disk_key, 1);
		}

		/* delete the leaf if it is mostly empty */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
			slot = path->slots[1];
			extent_buffer_get(leaf);

			btrfs_set_path_blocking(path);
			wret = push_leaf_left(trans, root, path, 1, 1,
					      1, (u32)-1);
			/* -ENOSPC just means nothing could be pushed. */
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				wret = push_leaf_right(trans, root, path, 1,
						       1, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				/* Everything was pushed out, drop the leaf. */
				path->slots[1] = slot;
				btrfs_del_leaf(trans, root, path, leaf);
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(leaf);
		}
	}
	return ret;
}
5059
5060/*
5061 * search the tree again to find a leaf with lesser keys
5062 * returns 0 if it found something or 1 if there are no lesser leaves.
5063 * returns < 0 on io errors.
5064 *
5065 * This may release the path, and so you may lose any locks held at the
5066 * time you call it.
5067 */
5068int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5069{
5070        struct btrfs_key key;
5071        struct btrfs_disk_key found_key;
5072        int ret;
5073
5074        btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5075
5076        if (key.offset > 0) {
5077                key.offset--;
5078        } else if (key.type > 0) {
5079                key.type--;
5080                key.offset = (u64)-1;
5081        } else if (key.objectid > 0) {
5082                key.objectid--;
5083                key.type = (u8)-1;
5084                key.offset = (u64)-1;
5085        } else {
5086                return 1;
5087        }
5088
5089        btrfs_release_path(path);
5090        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5091        if (ret < 0)
5092                return ret;
5093        btrfs_item_key(path->nodes[0], &found_key, 0);
5094        ret = comp_keys(&found_key, &key);
5095        /*
5096         * We might have had an item with the previous key in the tree right
5097         * before we released our path. And after we released our path, that
5098         * item might have been pushed to the first slot (0) of the leaf we
5099         * were holding due to a tree balance. Alternatively, an item with the
5100         * previous key can exist as the only element of a leaf (big fat item).
5101         * Therefore account for these 2 cases, so that our callers (like
5102         * btrfs_previous_item) don't miss an existing item with a key matching
5103         * the previous key we computed above.
5104         */
5105        if (ret <= 0)
5106                return 0;
5107        return 1;
5108}
5109
/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that are have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	int keep_locks = path->keep_locks;

	/* Keep the whole path locked during the walk; restored at 'out'. */
	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	/* The root itself is too old, there is nothing new to return. */
	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, min_key, level, &slot);

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		/* No exact match: step back to the slot covering min_key. */
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameters.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			btrfs_set_path_blocking(path);
			sret = btrfs_find_next_key(root, path, min_key, level,
						  min_trans);
			if (sret == 0) {
				/* Found a new starting key, restart the walk. */
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		btrfs_set_path_blocking(path);
		cur = read_node_slot(fs_info, cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		btrfs_tree_read_lock(cur);

		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		/* Only keep locks from lowest_level down for the caller. */
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		btrfs_set_path_blocking(path);
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}
5231
5232static int tree_move_down(struct btrfs_fs_info *fs_info,
5233                           struct btrfs_path *path,
5234                           int *level)
5235{
5236        struct extent_buffer *eb;
5237
5238        BUG_ON(*level == 0);
5239        eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]);
5240        if (IS_ERR(eb))
5241                return PTR_ERR(eb);
5242
5243        path->nodes[*level - 1] = eb;
5244        path->slots[*level - 1] = 0;
5245        (*level)--;
5246        return 0;
5247}
5248
5249static int tree_move_next_or_upnext(struct btrfs_path *path,
5250                                    int *level, int root_level)
5251{
5252        int ret = 0;
5253        int nritems;
5254        nritems = btrfs_header_nritems(path->nodes[*level]);
5255
5256        path->slots[*level]++;
5257
5258        while (path->slots[*level] >= nritems) {
5259                if (*level == root_level)
5260                        return -1;
5261
5262                /* move upnext */
5263                path->slots[*level] = 0;
5264                free_extent_buffer(path->nodes[*level]);
5265                path->nodes[*level] = NULL;
5266                (*level)++;
5267                path->slots[*level]++;
5268
5269                nritems = btrfs_header_nritems(path->nodes[*level]);
5270                ret = 1;
5271        }
5272        return ret;
5273}
5274
5275/*
5276 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5277 * or down.
5278 */
5279static int tree_advance(struct btrfs_fs_info *fs_info,
5280                        struct btrfs_path *path,
5281                        int *level, int root_level,
5282                        int allow_down,
5283                        struct btrfs_key *key)
5284{
5285        int ret;
5286
5287        if (*level == 0 || !allow_down) {
5288                ret = tree_move_next_or_upnext(path, level, root_level);
5289        } else {
5290                ret = tree_move_down(fs_info, path, level);
5291        }
5292        if (ret >= 0) {
5293                if (*level == 0)
5294                        btrfs_item_key_to_cpu(path->nodes[*level], key,
5295                                        path->slots[*level]);
5296                else
5297                        btrfs_node_key_to_cpu(path->nodes[*level], key,
5298                                        path->slots[*level]);
5299        }
5300        return ret;
5301}
5302
5303static int tree_compare_item(struct btrfs_path *left_path,
5304                             struct btrfs_path *right_path,
5305                             char *tmp_buf)
5306{
5307        int cmp;
5308        int len1, len2;
5309        unsigned long off1, off2;
5310
5311        len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5312        len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5313        if (len1 != len2)
5314                return 1;
5315
5316        off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5317        off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5318                                right_path->slots[0]);
5319
5320        read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5321
5322        cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5323        if (cmp)
5324                return 1;
5325        return 0;
5326}
5327
/* Values for the advance_left/advance_right state in btrfs_compare_trees(). */
#define ADVANCE 1
#define ADVANCE_ONLY_NEXT -1
5330
5331/*
5332 * This function compares two trees and calls the provided callback for
5333 * every changed/new/deleted item it finds.
5334 * If shared tree blocks are encountered, whole subtrees are skipped, making
5335 * the compare pretty fast on snapshotted subvolumes.
5336 *
5337 * This currently works on commit roots only. As commit roots are read only,
5338 * we don't do any locking. The commit roots are protected with transactions.
5339 * Transactions are ended and rejoined when a commit is tried in between.
5340 *
5341 * This function checks for modifications done to the trees while comparing.
5342 * If it detects a change, it aborts immediately.
5343 */
5344int btrfs_compare_trees(struct btrfs_root *left_root,
5345                        struct btrfs_root *right_root,
5346                        btrfs_changed_cb_t changed_cb, void *ctx)
5347{
5348        struct btrfs_fs_info *fs_info = left_root->fs_info;
5349        int ret;
5350        int cmp;
5351        struct btrfs_path *left_path = NULL;
5352        struct btrfs_path *right_path = NULL;
5353        struct btrfs_key left_key;
5354        struct btrfs_key right_key;
5355        char *tmp_buf = NULL;
5356        int left_root_level;
5357        int right_root_level;
5358        int left_level;
5359        int right_level;
5360        int left_end_reached;
5361        int right_end_reached;
5362        int advance_left;
5363        int advance_right;
5364        u64 left_blockptr;
5365        u64 right_blockptr;
5366        u64 left_gen;
5367        u64 right_gen;
5368
5369        left_path = btrfs_alloc_path();
5370        if (!left_path) {
5371                ret = -ENOMEM;
5372                goto out;
5373        }
5374        right_path = btrfs_alloc_path();
5375        if (!right_path) {
5376                ret = -ENOMEM;
5377                goto out;
5378        }
5379
5380        tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
5381        if (!tmp_buf) {
5382                ret = -ENOMEM;
5383                goto out;
5384        }
5385
5386        left_path->search_commit_root = 1;
5387        left_path->skip_locking = 1;
5388        right_path->search_commit_root = 1;
5389        right_path->skip_locking = 1;
5390
5391        /*
5392         * Strategy: Go to the first items of both trees. Then do
5393         *
5394         * If both trees are at level 0
5395         *   Compare keys of current items
5396         *     If left < right treat left item as new, advance left tree
5397         *       and repeat
5398         *     If left > right treat right item as deleted, advance right tree
5399         *       and repeat
5400         *     If left == right do deep compare of items, treat as changed if
5401         *       needed, advance both trees and repeat
5402         * If both trees are at the same level but not at level 0
5403         *   Compare keys of current nodes/leafs
5404         *     If left < right advance left tree and repeat
5405         *     If left > right advance right tree and repeat
5406         *     If left == right compare blockptrs of the next nodes/leafs
5407         *       If they match advance both trees but stay at the same level
5408         *         and repeat
5409         *       If they don't match advance both trees while allowing to go
5410         *         deeper and repeat
5411         * If tree levels are different
5412         *   Advance the tree that needs it and repeat
5413         *
5414         * Advancing a tree means:
5415         *   If we are at level 0, try to go to the next slot. If that's not
5416         *   possible, go one level up and repeat. Stop when we found a level
5417         *   where we could go to the next slot. We may at this point be on a
5418         *   node or a leaf.
5419         *
5420         *   If we are not at level 0 and not on shared tree blocks, go one
5421         *   level deeper.
5422         *
5423         *   If we are not at level 0 and on shared tree blocks, go one slot to
5424         *   the right if possible or go up and right.
5425         */
5426
5427        down_read(&fs_info->commit_root_sem);
5428        left_level = btrfs_header_level(left_root->commit_root);
5429        left_root_level = left_level;
5430        left_path->nodes[left_level] =
5431                        btrfs_clone_extent_buffer(left_root->commit_root);
5432        if (!left_path->nodes[left_level]) {
5433                up_read(&fs_info->commit_root_sem);
5434                ret = -ENOMEM;
5435                goto out;
5436        }
5437
5438        right_level = btrfs_header_level(right_root->commit_root);
5439        right_root_level = right_level;
5440        right_path->nodes[right_level] =
5441                        btrfs_clone_extent_buffer(right_root->commit_root);
5442        if (!right_path->nodes[right_level]) {
5443                up_read(&fs_info->commit_root_sem);
5444                ret = -ENOMEM;
5445                goto out;
5446        }
5447        up_read(&fs_info->commit_root_sem);
5448
5449        if (left_level == 0)
5450                btrfs_item_key_to_cpu(left_path->nodes[left_level],
5451                                &left_key, left_path->slots[left_level]);
5452        else
5453                btrfs_node_key_to_cpu(left_path->nodes[left_level],
5454                                &left_key, left_path->slots[left_level]);
5455        if (right_level == 0)
5456                btrfs_item_key_to_cpu(right_path->nodes[right_level],
5457                                &right_key, right_path->slots[right_level]);
5458        else
5459                btrfs_node_key_to_cpu(right_path->nodes[right_level],
5460                                &right_key, right_path->slots[right_level]);
5461
5462        left_end_reached = right_end_reached = 0;
5463        advance_left = advance_right = 0;
5464
5465        while (1) {
5466                if (advance_left && !left_end_reached) {
5467                        ret = tree_advance(fs_info, left_path, &left_level,
5468                                        left_root_level,
5469                                        advance_left != ADVANCE_ONLY_NEXT,
5470                                        &left_key);
5471                        if (ret == -1)
5472                                left_end_reached = ADVANCE;
5473                        else if (ret < 0)
5474                                goto out;
5475                        advance_left = 0;
5476                }
5477                if (advance_right && !right_end_reached) {
5478                        ret = tree_advance(fs_info, right_path, &right_level,
5479                                        right_root_level,
5480                                        advance_right != ADVANCE_ONLY_NEXT,
5481                                        &right_key);
5482                        if (ret == -1)
5483                                right_end_reached = ADVANCE;
5484                        else if (ret < 0)
5485                                goto out;
5486                        advance_right = 0;
5487                }
5488
5489                if (left_end_reached && right_end_reached) {
5490                        ret = 0;
5491                        goto out;
5492                } else if (left_end_reached) {
5493                        if (right_level == 0) {
5494                                ret = changed_cb(left_path, right_path,
5495                                                &right_key,
5496                                                BTRFS_COMPARE_TREE_DELETED,
5497                                                ctx);
5498                                if (ret < 0)
5499                                        goto out;
5500                        }
5501                        advance_right = ADVANCE;
5502                        continue;
5503                } else if (right_end_reached) {
5504                        if (left_level == 0) {
5505                                ret = changed_cb(left_path, right_path,
5506                                                &left_key,
5507                                                BTRFS_COMPARE_TREE_NEW,
5508                                                ctx);
5509                                if (ret < 0)
5510                                        goto out;
5511                        }
5512                        advance_left = ADVANCE;
5513                        continue;
5514                }
5515
5516                if (left_level == 0 && right_level == 0) {
5517                        cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5518                        if (cmp < 0) {
5519                                ret = changed_cb(left_path, right_path,
5520                                                &left_key,
5521                                                BTRFS_COMPARE_TREE_NEW,
5522                                                ctx);
5523                                if (ret < 0)
5524                                        goto out;
5525                                advance_left = ADVANCE;
5526                        } else if (cmp > 0) {
5527                                ret = changed_cb(left_path, right_path,
5528                                                &right_key,
5529                                                BTRFS_COMPARE_TREE_DELETED,
5530                                                ctx);
5531                                if (ret < 0)
5532                                        goto out;
5533                                advance_right = ADVANCE;
5534                        } else {
5535                                enum btrfs_compare_tree_result result;
5536
5537                                WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5538                                ret = tree_compare_item(left_path, right_path,
5539                                                        tmp_buf);
5540                                if (ret)
5541                                        result = BTRFS_COMPARE_TREE_CHANGED;
5542                                else
5543                                        result = BTRFS_COMPARE_TREE_SAME;
5544                                ret = changed_cb(left_path, right_path,
5545                                                 &left_key, result, ctx);
5546                                if (ret < 0)
5547                                        goto out;
5548                                advance_left = ADVANCE;
5549                                advance_right = ADVANCE;
5550                        }
5551                } else if (left_level == right_level) {
5552                        cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5553                        if (cmp < 0) {
5554                                advance_left = ADVANCE;
5555                        } else if (cmp > 0) {
5556                                advance_right = ADVANCE;
5557                        } else {
5558                                left_blockptr = btrfs_node_blockptr(
5559                                                left_path->nodes[left_level],
5560                                                left_path->slots[left_level]);
5561                                right_blockptr = btrfs_node_blockptr(
5562                                                right_path->nodes[right_level],
5563                                                right_path->slots[right_level]);
5564                                left_gen = btrfs_node_ptr_generation(
5565                                                left_path->nodes[left_level],
5566                                                left_path->slots[left_level]);
5567                                right_gen = btrfs_node_ptr_generation(
5568                                                right_path->nodes[right_level],
5569                                                right_path->slots[right_level]);
5570                                if (left_blockptr == right_blockptr &&
5571                                    left_gen == right_gen) {
5572                                        /*
5573                                         * As we're on a shared block, don't
5574                                         * allow to go deeper.
5575                                         */
5576                                        advance_left = ADVANCE_ONLY_NEXT;
5577                                        advance_right = ADVANCE_ONLY_NEXT;
5578                                } else {
5579                                        advance_left = ADVANCE;
5580                                        advance_right = ADVANCE;
5581                                }
5582                        }
5583                } else if (left_level < right_level) {
5584                        advance_right = ADVANCE;
5585                } else {
5586                        advance_left = ADVANCE;
5587                }
5588        }
5589
5590out:
5591        btrfs_free_path(left_path);
5592        btrfs_free_path(right_path);
5593        kvfree(tmp_buf);
5594        return ret;
5595}
5596
5597/*
5598 * this is similar to btrfs_next_leaf, but does not try to preserve
5599 * and fixup the path.  It looks for and returns the next key in the
5600 * tree based on the current path and the min_trans parameters.
5601 *
5602 * 0 is returned if another key is found, < 0 if there are any errors
5603 * and 1 is returned if there are no higher keys in the tree
5604 *
5605 * path->keep_locks should be set to 1 on the search made before
5606 * calling this function.
5607 */
5608int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5609                        struct btrfs_key *key, int level, u64 min_trans)
5610{
5611        int slot;
5612        struct extent_buffer *c;
5613
5614        WARN_ON(!path->keep_locks);
5615        while (level < BTRFS_MAX_LEVEL) {
5616                if (!path->nodes[level])
5617                        return 1;
5618
5619                slot = path->slots[level] + 1;
5620                c = path->nodes[level];
5621next:
5622                if (slot >= btrfs_header_nritems(c)) {
5623                        int ret;
5624                        int orig_lowest;
5625                        struct btrfs_key cur_key;
5626                        if (level + 1 >= BTRFS_MAX_LEVEL ||
5627                            !path->nodes[level + 1])
5628                                return 1;
5629
5630                        if (path->locks[level + 1]) {
5631                                level++;
5632                                continue;
5633                        }
5634
5635                        slot = btrfs_header_nritems(c) - 1;
5636                        if (level == 0)
5637                                btrfs_item_key_to_cpu(c, &cur_key, slot);
5638                        else
5639                                btrfs_node_key_to_cpu(c, &cur_key, slot);
5640
5641                        orig_lowest = path->lowest_level;
5642                        btrfs_release_path(path);
5643                        path->lowest_level = level;
5644                        ret = btrfs_search_slot(NULL, root, &cur_key, path,
5645                                                0, 0);
5646                        path->lowest_level = orig_lowest;
5647                        if (ret < 0)
5648                                return ret;
5649
5650                        c = path->nodes[level];
5651                        slot = path->slots[level];
5652                        if (ret == 0)
5653                                slot++;
5654                        goto next;
5655                }
5656
5657                if (level == 0)
5658                        btrfs_item_key_to_cpu(c, key, slot);
5659                else {
5660                        u64 gen = btrfs_node_ptr_generation(c, slot);
5661
5662                        if (gen < min_trans) {
5663                                slot++;
5664                                goto next;
5665                        }
5666                        btrfs_node_key_to_cpu(c, key, slot);
5667                }
5668                return 0;
5669        }
5670        return 1;
5671}
5672
5673/*
5674 * search the tree again to find a leaf with greater keys
5675 * returns 0 if it found something or 1 if there are no greater leaves.
5676 * returns < 0 on io errors.
5677 */
5678int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5679{
5680        return btrfs_next_old_leaf(root, path, 0);
5681}
5682
/*
 * Step the path to the leaf following the one it currently points at.
 *
 * Like btrfs_next_leaf(), but when @time_seq is non zero the search is
 * done against the historical tree state at that sequence number
 * (via btrfs_search_old_slot), which is used by tree mod log readers.
 *
 * Returns 0 when the path was advanced to the next leaf, 1 when there
 * is no next leaf, and < 0 on error.
 */
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;
	int next_rw_lock = 0;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	/* remember the last key of the current leaf so we can re-search */
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	next_rw_lock = 0;
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->leave_spinning = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leafs
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later(esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check,
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	/* walk upwards to find the first ancestor with a next sibling */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			/* no sibling at this level, go one level up */
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		/* drop the lock/ref from a previous iteration of this loop */
		if (next) {
			btrfs_tree_unlock_rw(next, next_rw_lock);
			free_extent_buffer(next);
		}

		next = c;
		next_rw_lock = path->locks[level];
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		/* -EAGAIN: the block changed under us, restart the search */
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret) {
				/* try-lock failed: block and wait for it */
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
		break;
	}
	path->slots[level] = slot;
	/* now walk back down the left-most edge to the next leaf */
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock_rw(c, path->locks[level]);

		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = next_rw_lock;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
	}
	ret = 0;
done:
	/* release locks above level 0; the caller only needs the leaf */
	unlock_up(path, 0, 1, 0, NULL);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}
5851
5852/*
5853 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5854 * searching until it gets past min_objectid or finds an item of 'type'
5855 *
5856 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5857 */
5858int btrfs_previous_item(struct btrfs_root *root,
5859                        struct btrfs_path *path, u64 min_objectid,
5860                        int type)
5861{
5862        struct btrfs_key found_key;
5863        struct extent_buffer *leaf;
5864        u32 nritems;
5865        int ret;
5866
5867        while (1) {
5868                if (path->slots[0] == 0) {
5869                        btrfs_set_path_blocking(path);
5870                        ret = btrfs_prev_leaf(root, path);
5871                        if (ret != 0)
5872                                return ret;
5873                } else {
5874                        path->slots[0]--;
5875                }
5876                leaf = path->nodes[0];
5877                nritems = btrfs_header_nritems(leaf);
5878                if (nritems == 0)
5879                        return 1;
5880                if (path->slots[0] == nritems)
5881                        path->slots[0]--;
5882
5883                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5884                if (found_key.objectid < min_objectid)
5885                        break;
5886                if (found_key.type == type)
5887                        return 0;
5888                if (found_key.objectid == min_objectid &&
5889                    found_key.type < type)
5890                        break;
5891        }
5892        return 1;
5893}
5894
5895/*
5896 * search in extent tree to find a previous Metadata/Data extent item with
5897 * min objecitd.
5898 *
5899 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5900 */
5901int btrfs_previous_extent_item(struct btrfs_root *root,
5902                        struct btrfs_path *path, u64 min_objectid)
5903{
5904        struct btrfs_key found_key;
5905        struct extent_buffer *leaf;
5906        u32 nritems;
5907        int ret;
5908
5909        while (1) {
5910                if (path->slots[0] == 0) {
5911                        btrfs_set_path_blocking(path);
5912                        ret = btrfs_prev_leaf(root, path);
5913                        if (ret != 0)
5914                                return ret;
5915                } else {
5916                        path->slots[0]--;
5917                }
5918                leaf = path->nodes[0];
5919                nritems = btrfs_header_nritems(leaf);
5920                if (nritems == 0)
5921                        return 1;
5922                if (path->slots[0] == nritems)
5923                        path->slots[0]--;
5924
5925                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5926                if (found_key.objectid < min_objectid)
5927                        break;
5928                if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5929                    found_key.type == BTRFS_METADATA_ITEM_KEY)
5930                        return 0;
5931                if (found_key.objectid == min_objectid &&
5932                    found_key.type < BTRFS_EXTENT_ITEM_KEY)
5933                        break;
5934        }
5935        return 1;
5936}
5937