// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
                      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      const struct btrfs_key *ins_key, struct btrfs_path *path,
                      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
                          struct extent_buffer *dst,
                          struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
                              struct extent_buffer *dst_buf,
                              struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
                    int level, int slot);

static const struct btrfs_csums {
        u16             size;
        const char      name[10];
        const char      driver[12];
} btrfs_csums[] = {
        [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
        [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
        [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
        [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
                                     .driver = "blake2b-256" },
};

int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
        u16 t = btrfs_super_csum_type(s);

        /* csum type is validated at mount time */
        return btrfs_csums[t].size;
}

const char *btrfs_super_csum_name(u16 csum_type)
{
        /* csum type is validated at mount time */
        return btrfs_csums[csum_type].name;
}

/*
 * Return driver name if defined, otherwise the name that's also a valid driver
 * name
 */
const char *btrfs_super_csum_driver(u16 csum_type)
{
        /* csum type is validated at mount time */
        return btrfs_csums[csum_type].driver[0] ?
                btrfs_csums[csum_type].driver :
                btrfs_csums[csum_type].name;
}

size_t __attribute_const__ btrfs_get_num_csums(void)
{
        return ARRAY_SIZE(btrfs_csums);
}
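
/*
 * Example: mapping the superblock csum type to a crypto driver at mount
 * time.  A minimal sketch of how a consumer (e.g. disk-io.c) can pair
 * btrfs_super_csum_driver() with the kernel crypto API; this assumes
 * csum_type was already validated, and error handling is trimmed:
 *
 *        struct crypto_shash *shash;
 *
 *        shash = crypto_alloc_shash(btrfs_super_csum_driver(csum_type), 0, 0);
 *        if (IS_ERR(shash))
 *                return PTR_ERR(shash);
 *        fs_info->csum_shash = shash;
 */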

struct btrfs_path *btrfs_alloc_path(void)
{
        return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
        if (!p)
                return;
        btrfs_release_path(p);
        kmem_cache_free(btrfs_path_cachep, p);
}
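
/*
 * Example: the usual lifecycle of a path around a tree search.  A minimal
 * sketch (the key values are made up, error handling is trimmed):
 *
 *        struct btrfs_path *path;
 *        struct btrfs_key key = { .objectid = ino,
 *                                 .type = BTRFS_INODE_ITEM_KEY, .offset = 0 };
 *
 *        path = btrfs_alloc_path();
 *        if (!path)
 *                return -ENOMEM;
 *        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *        ... read the item from path->nodes[0] at path->slots[0] ...
 *        btrfs_free_path(path);
 *
 * The final btrfs_free_path() also drops the locks and extent buffer
 * references held by the path, via btrfs_release_path() below.
 */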

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that have no locks or extent buffers
 * held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
        int i;

        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
                p->slots[i] = 0;
                if (!p->nodes[i])
                        continue;
                if (p->locks[i]) {
                        btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
                        p->locks[i] = 0;
                }
                free_extent_buffer(p->nodes[i]);
                p->nodes[i] = NULL;
        }
}

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        while (1) {
                rcu_read_lock();
                eb = rcu_dereference(root->node);

                /*
                 * RCU really hurts here, we could free up the root node because
                 * it was COWed but we may not get the new root node yet so do
                 * the inc_not_zero dance and if it doesn't work then
                 * synchronize_rcu and try again.
                 */
                if (atomic_inc_not_zero(&eb->refs)) {
                        rcu_read_unlock();
                        break;
                }
                rcu_read_unlock();
                synchronize_rcu();
        }
        return eb;
}
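
/*
 * Example: a caller that only needs a stable reference, not a lock, as
 * btrfs_old_root_level() later in this file does.  Sketch:
 *
 *        struct extent_buffer *eb = btrfs_root_node(root);
 *
 *        level = btrfs_header_level(eb);
 *        free_extent_buffer(eb);
 *
 * Note that by the time btrfs_header_level() runs, eb may no longer be
 * the root of the tree; the reference only guarantees it stays valid.
 */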

/*
 * Cowonly roots (not-shareable trees, everything not subvolume or reloc
 * roots) just get put onto a simple dirty list.  The transaction code walks
 * this list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;

        if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
            !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
                return;

        spin_lock(&fs_info->trans_lock);
        if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
                /* Want the extent tree to be the last on the list */
                if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
                        list_move_tail(&root->dirty_list,
                                       &fs_info->dirty_cowonly_roots);
                else
                        list_move(&root->dirty_list,
                                  &fs_info->dirty_cowonly_roots);
        }
        spin_unlock(&fs_info->trans_lock);
}

/*
 * Used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative error
 * code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
                      struct extent_buffer *buf,
                      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *cow;
        int ret = 0;
        int level;
        struct btrfs_disk_key disk_key;

        WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
                trans->transid != fs_info->running_transaction->transid);
        WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
                trans->transid != root->last_trans);

        level = btrfs_header_level(buf);
        if (level == 0)
                btrfs_item_key(buf, &disk_key, 0);
        else
                btrfs_node_key(buf, &disk_key, 0);

        cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
                        &disk_key, level, buf->start, 0);
        if (IS_ERR(cow))
                return PTR_ERR(cow);

        copy_extent_buffer_full(cow, buf);
        btrfs_set_header_bytenr(cow, cow->start);
        btrfs_set_header_generation(cow, trans->transid);
        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
                                     BTRFS_HEADER_FLAG_RELOC);
        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
        else
                btrfs_set_header_owner(cow, new_root_objectid);

        write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

        WARN_ON(btrfs_header_generation(buf) > trans->transid);
        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
                ret = btrfs_inc_ref(trans, root, cow, 1);
        else
                ret = btrfs_inc_ref(trans, root, cow, 0);

        if (ret)
                return ret;

        btrfs_mark_buffer_dirty(cow);
        *cow_ret = cow;
        return 0;
}

enum mod_log_op {
        MOD_LOG_KEY_REPLACE,
        MOD_LOG_KEY_ADD,
        MOD_LOG_KEY_REMOVE,
        MOD_LOG_KEY_REMOVE_WHILE_FREEING,
        MOD_LOG_KEY_REMOVE_WHILE_MOVING,
        MOD_LOG_MOVE_KEYS,
        MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_root {
        u64 logical;
        u8 level;
};

struct tree_mod_elem {
        struct rb_node node;
        u64 logical;
        u64 seq;
        enum mod_log_op op;

        /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
        int slot;

        /* this is used for MOD_LOG_KEY_* and MOD_LOG_ROOT_REPLACE */
        u64 generation;

        /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
        struct btrfs_disk_key key;
        u64 blockptr;

        /* this is used for op == MOD_LOG_MOVE_KEYS */
        struct {
                int dst_slot;
                int nr_items;
        } move;

        /* this is used for op == MOD_LOG_ROOT_REPLACE */
        struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
        return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
                           struct seq_list *elem)
{
        write_lock(&fs_info->tree_mod_log_lock);
        if (!elem->seq) {
                elem->seq = btrfs_inc_tree_mod_seq(fs_info);
                list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
        }
        write_unlock(&fs_info->tree_mod_log_lock);

        return elem->seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
                            struct seq_list *elem)
{
        struct rb_root *tm_root;
        struct rb_node *node;
        struct rb_node *next;
        struct tree_mod_elem *tm;
        u64 min_seq = (u64)-1;
        u64 seq_putting = elem->seq;

        if (!seq_putting)
                return;

        write_lock(&fs_info->tree_mod_log_lock);
        list_del(&elem->list);
        elem->seq = 0;

        if (!list_empty(&fs_info->tree_mod_seq_list)) {
                struct seq_list *first;

                first = list_first_entry(&fs_info->tree_mod_seq_list,
                                         struct seq_list, list);
                if (seq_putting > first->seq) {
                        /*
                         * Blocker with lower sequence number exists, we
                         * cannot remove anything from the log.
                         */
                        write_unlock(&fs_info->tree_mod_log_lock);
                        return;
                }
                min_seq = first->seq;
        }

        /*
         * anything that's lower than the lowest existing (read: blocked)
         * sequence number can be removed from the tree.
         */
        tm_root = &fs_info->tree_mod_log;
        for (node = rb_first(tm_root); node; node = next) {
                next = rb_next(node);
                tm = rb_entry(node, struct tree_mod_elem, node);
                if (tm->seq >= min_seq)
                        continue;
                rb_erase(node, tm_root);
                kfree(tm);
        }
        write_unlock(&fs_info->tree_mod_log_lock);
}
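
/*
 * Example: a reader pins the tree mod log for the duration of a walk over
 * past tree state.  A minimal sketch of the get/put pairing (mirroring how
 * backref resolution uses it; SEQ_LIST_INIT comes from ctree.h):
 *
 *        struct seq_list elem = SEQ_LIST_INIT(elem);
 *
 *        btrfs_get_tree_mod_seq(fs_info, &elem);
 *        ... use elem.seq as time_seq to inspect past tree state ...
 *        btrfs_put_tree_mod_seq(fs_info, &elem);
 *
 * While the element is on the list, btrfs_put_tree_mod_seq() from other
 * readers will not prune log entries the walk may still need.
 */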

/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
        struct rb_root *tm_root;
        struct rb_node **new;
        struct rb_node *parent = NULL;
        struct tree_mod_elem *cur;

        lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

        tm->seq = btrfs_inc_tree_mod_seq(fs_info);

        tm_root = &fs_info->tree_mod_log;
        new = &tm_root->rb_node;
        while (*new) {
                cur = rb_entry(*new, struct tree_mod_elem, node);
                parent = *new;
                if (cur->logical < tm->logical)
                        new = &((*new)->rb_left);
                else if (cur->logical > tm->logical)
                        new = &((*new)->rb_right);
                else if (cur->seq < tm->seq)
                        new = &((*new)->rb_left);
                else if (cur->seq > tm->seq)
                        new = &((*new)->rb_right);
                else
                        return -EEXIST;
        }

        rb_link_node(&tm->node, parent, new);
        rb_insert_color(&tm->node, tm_root);
        return 0;
}
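
/*
 * Example of the resulting order: the comparisons above are inverted
 * relative to the usual rb-tree idiom, so for entries with
 * (logical, seq) = (4096, 7), (4096, 9) and (8192, 2), rb_first() lands
 * on (8192, 2) and an in-order walk proceeds in descending key order.
 * Either way, all operations on one block are adjacent and ordered by
 * seq, which is what the search helpers below rely on.
 */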

/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
                                    struct extent_buffer *eb)
{
        smp_mb();
        if (list_empty(&fs_info->tree_mod_seq_list))
                return 1;
        if (eb && btrfs_header_level(eb) == 0)
                return 1;

        write_lock(&fs_info->tree_mod_log_lock);
        if (list_empty(&fs_info->tree_mod_seq_list)) {
                write_unlock(&fs_info->tree_mod_log_lock);
                return 1;
        }

        return 0;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
                                    struct extent_buffer *eb)
{
        smp_mb();
        if (list_empty(&fs_info->tree_mod_seq_list))
                return 0;
        if (eb && btrfs_header_level(eb) == 0)
                return 0;

        return 1;
}
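
/*
 * Example: how the two checks above pair up in the insertion paths below.
 * Sketch of the canonical pattern, as used by tree_mod_log_insert_key():
 *
 *        if (!tree_mod_need_log(fs_info, eb))
 *                return 0;                        lockless fast path
 *        tm = alloc_tree_mod_elem(eb, slot, op, flags);   allocate unlocked
 *        if (tree_mod_dont_log(fs_info, eb)) {    recheck, may take the lock
 *                kfree(tm);
 *                return 0;
 *        }
 *        ret = __tree_mod_log_insert(fs_info, tm);        lock held here
 *        write_unlock(&fs_info->tree_mod_log_lock);
 */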

static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
                    enum mod_log_op op, gfp_t flags)
{
        struct tree_mod_elem *tm;

        tm = kzalloc(sizeof(*tm), flags);
        if (!tm)
                return NULL;

        tm->logical = eb->start;
        if (op != MOD_LOG_KEY_ADD) {
                btrfs_node_key(eb, &tm->key, slot);
                tm->blockptr = btrfs_node_blockptr(eb, slot);
        }
        tm->op = op;
        tm->slot = slot;
        tm->generation = btrfs_node_ptr_generation(eb, slot);
        RB_CLEAR_NODE(&tm->node);

        return tm;
}

static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
                enum mod_log_op op, gfp_t flags)
{
        struct tree_mod_elem *tm;
        int ret;

        if (!tree_mod_need_log(eb->fs_info, eb))
                return 0;

        tm = alloc_tree_mod_elem(eb, slot, op, flags);
        if (!tm)
                return -ENOMEM;

        if (tree_mod_dont_log(eb->fs_info, eb)) {
                kfree(tm);
                return 0;
        }

        ret = __tree_mod_log_insert(eb->fs_info, tm);
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        if (ret)
                kfree(tm);

        return ret;
}

static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
                int dst_slot, int src_slot, int nr_items)
{
        struct tree_mod_elem *tm = NULL;
        struct tree_mod_elem **tm_list = NULL;
        int ret = 0;
        int i;
        int locked = 0;

        if (!tree_mod_need_log(eb->fs_info, eb))
                return 0;

        tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
        if (!tm_list)
                return -ENOMEM;

        tm = kzalloc(sizeof(*tm), GFP_NOFS);
        if (!tm) {
                ret = -ENOMEM;
                goto free_tms;
        }

        tm->logical = eb->start;
        tm->slot = src_slot;
        tm->move.dst_slot = dst_slot;
        tm->move.nr_items = nr_items;
        tm->op = MOD_LOG_MOVE_KEYS;

        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
                tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
                    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
                if (!tm_list[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
        }

        if (tree_mod_dont_log(eb->fs_info, eb))
                goto free_tms;
        locked = 1;

        /*
         * When we overwrite something during the move, we log these removals.
         * This can only happen when we move towards the beginning of the
         * buffer, i.e. dst_slot < src_slot.
         */
        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
                ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
                if (ret)
                        goto free_tms;
        }

        ret = __tree_mod_log_insert(eb->fs_info, tm);
        if (ret)
                goto free_tms;
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        kfree(tm_list);

        return 0;
free_tms:
        for (i = 0; i < nr_items; i++) {
                if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
                        rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
                kfree(tm_list[i]);
        }
        if (locked)
                write_unlock(&eb->fs_info->tree_mod_log_lock);
        kfree(tm_list);
        kfree(tm);

        return ret;
}

static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
                       struct tree_mod_elem **tm_list,
                       int nritems)
{
        int i, j;
        int ret;

        for (i = nritems - 1; i >= 0; i--) {
                ret = __tree_mod_log_insert(fs_info, tm_list[i]);
                if (ret) {
                        for (j = nritems - 1; j > i; j--)
                                rb_erase(&tm_list[j]->node,
                                         &fs_info->tree_mod_log);
                        return ret;
                }
        }

        return 0;
}

static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
                         struct extent_buffer *new_root, int log_removal)
{
        struct btrfs_fs_info *fs_info = old_root->fs_info;
        struct tree_mod_elem *tm = NULL;
        struct tree_mod_elem **tm_list = NULL;
        int nritems = 0;
        int ret = 0;
        int i;

        if (!tree_mod_need_log(fs_info, NULL))
                return 0;

        if (log_removal && btrfs_header_level(old_root) > 0) {
                nritems = btrfs_header_nritems(old_root);
                tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
                                  GFP_NOFS);
                if (!tm_list) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
                for (i = 0; i < nritems; i++) {
                        tm_list[i] = alloc_tree_mod_elem(old_root, i,
                            MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
                        if (!tm_list[i]) {
                                ret = -ENOMEM;
                                goto free_tms;
                        }
                }
        }

        tm = kzalloc(sizeof(*tm), GFP_NOFS);
        if (!tm) {
                ret = -ENOMEM;
                goto free_tms;
        }

        tm->logical = new_root->start;
        tm->old_root.logical = old_root->start;
        tm->old_root.level = btrfs_header_level(old_root);
        tm->generation = btrfs_header_generation(old_root);
        tm->op = MOD_LOG_ROOT_REPLACE;

        if (tree_mod_dont_log(fs_info, NULL))
                goto free_tms;

        if (tm_list)
                ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
        if (!ret)
                ret = __tree_mod_log_insert(fs_info, tm);

        write_unlock(&fs_info->tree_mod_log_lock);
        if (ret)
                goto free_tms;
        kfree(tm_list);

        return ret;

free_tms:
        if (tm_list) {
                for (i = 0; i < nritems; i++)
                        kfree(tm_list[i]);
                kfree(tm_list);
        }
        kfree(tm);

        return ret;
}

static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
                      int smallest)
{
        struct rb_root *tm_root;
        struct rb_node *node;
        struct tree_mod_elem *cur = NULL;
        struct tree_mod_elem *found = NULL;

        read_lock(&fs_info->tree_mod_log_lock);
        tm_root = &fs_info->tree_mod_log;
        node = tm_root->rb_node;
        while (node) {
                cur = rb_entry(node, struct tree_mod_elem, node);
                if (cur->logical < start) {
                        node = node->rb_left;
                } else if (cur->logical > start) {
                        node = node->rb_right;
                } else if (cur->seq < min_seq) {
                        node = node->rb_left;
                } else if (!smallest) {
                        /* we want the node with the highest seq */
                        if (found)
                                BUG_ON(found->seq > cur->seq);
                        found = cur;
                        node = node->rb_left;
                } else if (cur->seq > min_seq) {
                        /* we want the node with the smallest seq */
                        if (found)
                                BUG_ON(found->seq < cur->seq);
                        found = cur;
                        node = node->rb_right;
                } else {
                        found = cur;
                        break;
                }
        }
        read_unlock(&fs_info->tree_mod_log_lock);

        return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
                           u64 min_seq)
{
        return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
        return __tree_mod_log_search(fs_info, start, min_seq, 0);
}

static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
                     struct extent_buffer *src, unsigned long dst_offset,
                     unsigned long src_offset, int nr_items)
{
        struct btrfs_fs_info *fs_info = dst->fs_info;
        int ret = 0;
        struct tree_mod_elem **tm_list = NULL;
        struct tree_mod_elem **tm_list_add, **tm_list_rem;
        int i;
        int locked = 0;

        if (!tree_mod_need_log(fs_info, NULL))
                return 0;

        if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
                return 0;

        tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
                          GFP_NOFS);
        if (!tm_list)
                return -ENOMEM;

        tm_list_add = tm_list;
        tm_list_rem = tm_list + nr_items;
        for (i = 0; i < nr_items; i++) {
                tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
                    MOD_LOG_KEY_REMOVE, GFP_NOFS);
                if (!tm_list_rem[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }

                tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
                    MOD_LOG_KEY_ADD, GFP_NOFS);
                if (!tm_list_add[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
        }

        if (tree_mod_dont_log(fs_info, NULL))
                goto free_tms;
        locked = 1;

        for (i = 0; i < nr_items; i++) {
                ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
                if (ret)
                        goto free_tms;
                ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
                if (ret)
                        goto free_tms;
        }

        write_unlock(&fs_info->tree_mod_log_lock);
        kfree(tm_list);

        return 0;

free_tms:
        for (i = 0; i < nr_items * 2; i++) {
                if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
                        rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
                kfree(tm_list[i]);
        }
        if (locked)
                write_unlock(&fs_info->tree_mod_log_lock);
        kfree(tm_list);

        return ret;
}

static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
        struct tree_mod_elem **tm_list = NULL;
        int nritems = 0;
        int i;
        int ret = 0;

        if (btrfs_header_level(eb) == 0)
                return 0;

        if (!tree_mod_need_log(eb->fs_info, NULL))
                return 0;

        nritems = btrfs_header_nritems(eb);
        tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
        if (!tm_list)
                return -ENOMEM;

        for (i = 0; i < nritems; i++) {
                tm_list[i] = alloc_tree_mod_elem(eb, i,
                    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
                if (!tm_list[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
        }

        if (tree_mod_dont_log(eb->fs_info, eb))
                goto free_tms;

        ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        if (ret)
                goto free_tms;
        kfree(tm_list);

        return 0;

free_tms:
        for (i = 0; i < nritems; i++)
                kfree(tm_list[i]);
        kfree(tm_list);

        return ret;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
                              struct extent_buffer *buf)
{
        /*
         * Tree blocks not in shareable trees and tree roots are never shared.
         * If a block was allocated after the last snapshot and the block was
         * not allocated by tree relocation, we know the block is not shared.
         */
        if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
            buf != root->node && buf != root->commit_root &&
            (btrfs_header_generation(buf) <=
             btrfs_root_last_snapshot(&root->root_item) ||
             btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
                return 1;

        return 0;
}
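
/*
 * Worked example for btrfs_block_can_be_shared(): in a shareable
 * (subvolume) tree with btrfs_root_last_snapshot() == 100, a non-root
 * block with header generation 90 can be shared, since it already existed
 * when the last snapshot was taken.  A block with generation 120 and no
 * RELOC flag cannot be shared: it was COWed after the snapshot, so only
 * this tree references it.
 */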

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root,
                                       struct extent_buffer *buf,
                                       struct extent_buffer *cow,
                                       int *last_ref)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 refs;
        u64 owner;
        u64 flags;
        u64 new_flags = 0;
        int ret;

        /*
         * Backrefs update rules:
         *
         * Always use full backrefs for extent pointers in tree blocks
         * allocated by tree relocation.
         *
         * If a shared tree block is no longer referenced by its owner
         * tree (btrfs_header_owner(buf) == root->root_key.objectid),
         * use full backrefs for extent pointers in the tree block.
         *
         * If a tree block is being relocated
         * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
         * use full backrefs for extent pointers in the tree block.
         * The reason for this is that some operations (such as dropping
         * a tree) are only allowed on blocks that use full backrefs.
         */

        if (btrfs_block_can_be_shared(root, buf)) {
                ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
                                               btrfs_header_level(buf), 1,
                                               &refs, &flags);
                if (ret)
                        return ret;
                if (refs == 0) {
                        ret = -EROFS;
                        btrfs_handle_fs_error(fs_info, ret, NULL);
                        return ret;
                }
        } else {
                refs = 1;
                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                        flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
                else
                        flags = 0;
        }

        owner = btrfs_header_owner(buf);
        BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
               !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

        if (refs > 1) {
                if ((owner == root->root_key.objectid ||
                     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
                    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
                        ret = btrfs_inc_ref(trans, root, buf, 1);
                        if (ret)
                                return ret;

                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID) {
                                ret = btrfs_dec_ref(trans, root, buf, 0);
                                if (ret)
                                        return ret;
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                                if (ret)
                                        return ret;
                        }
                        new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
                } else {
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                        else
                                ret = btrfs_inc_ref(trans, root, cow, 0);
                        if (ret)
                                return ret;
                }
                if (new_flags != 0) {
                        int level = btrfs_header_level(buf);

                        ret = btrfs_set_disk_extent_flags(trans, buf,
                                                          new_flags, level, 0);
                        if (ret)
                                return ret;
                }
        } else {
                if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                        else
                                ret = btrfs_inc_ref(trans, root, cow, 0);
                        if (ret)
                                return ret;
                        ret = btrfs_dec_ref(trans, root, buf, 1);
                        if (ret)
                                return ret;
                }
                btrfs_clean_tree_block(buf);
                *last_ref = 1;
        }
        return 0;
}

static struct extent_buffer *alloc_tree_block_no_bg_flush(
                                          struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          u64 parent_start,
                                          const struct btrfs_disk_key *disk_key,
                                          int level,
                                          u64 hint,
                                          u64 empty_size)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *ret;

        /*
         * If we are COWing a node/leaf from the extent, chunk, device or free
         * space trees, make sure that we do not finish block group creation of
         * pending block groups. We do this to avoid a deadlock.
         * COWing can result in allocation of a new chunk, and flushing pending
         * block groups (btrfs_create_pending_block_groups()) can be triggered
         * when finishing allocation of a new chunk. Creation of a pending block
         * group modifies the extent, chunk, device and free space trees,
         * therefore we could deadlock with ourselves since we are holding a
         * lock on an extent buffer that btrfs_create_pending_block_groups() may
         * try to COW later.
         * For similar reasons, we also need to delay flushing pending block
         * groups when splitting a leaf or node, from one of those trees, since
         * we are holding a write lock on it and its parent or when inserting a
         * new root node for one of those trees.
         */
        if (root == fs_info->extent_root ||
            root == fs_info->chunk_root ||
            root == fs_info->dev_root ||
            root == fs_info->free_space_root)
                trans->can_flush_pending_bgs = false;

        ret = btrfs_alloc_tree_block(trans, root, parent_start,
                                     root->root_key.objectid, disk_key, level,
                                     hint, empty_size);
        trans->can_flush_pending_bgs = true;

        return ret;
}

/*
 * Does the dirty work of COWing a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more COW.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct extent_buffer *buf,
                             struct extent_buffer *parent, int parent_slot,
                             struct extent_buffer **cow_ret,
                             u64 search_start, u64 empty_size)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_disk_key disk_key;
        struct extent_buffer *cow;
        int level, ret;
        int last_ref = 0;
        int unlock_orig = 0;
        u64 parent_start = 0;

        if (*cow_ret == buf)
                unlock_orig = 1;

        btrfs_assert_tree_locked(buf);

        WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
                trans->transid != fs_info->running_transaction->transid);
        WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
                trans->transid != root->last_trans);

        level = btrfs_header_level(buf);

        if (level == 0)
                btrfs_item_key(buf, &disk_key, 0);
        else
                btrfs_node_key(buf, &disk_key, 0);

        if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
                parent_start = parent->start;

        cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
                                           level, search_start, empty_size);
        if (IS_ERR(cow))
                return PTR_ERR(cow);

        /* cow is set to blocking by btrfs_init_new_buffer */

        copy_extent_buffer_full(cow, buf);
        btrfs_set_header_bytenr(cow, cow->start);
        btrfs_set_header_generation(cow, trans->transid);
        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
                                     BTRFS_HEADER_FLAG_RELOC);
        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
        else
                btrfs_set_header_owner(cow, root->root_key.objectid);

        write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

        ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
        if (ret) {
                btrfs_abort_transaction(trans, ret);
                return ret;
        }

        if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
                ret = btrfs_reloc_cow_block(trans, root, buf, cow);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        return ret;
                }
        }

        if (buf == root->node) {
                WARN_ON(parent && parent != buf);
                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                        parent_start = buf->start;

                atomic_inc(&cow->refs);
                ret = tree_mod_log_insert_root(root->node, cow, 1);
                BUG_ON(ret < 0);
                rcu_assign_pointer(root->node, cow);

                btrfs_free_tree_block(trans, root, buf, parent_start,
                                      last_ref);
                free_extent_buffer(buf);
                add_root_to_dirty_list(root);
        } else {
                WARN_ON(trans->transid != btrfs_header_generation(parent));
                tree_mod_log_insert_key(parent, parent_slot,
                                        MOD_LOG_KEY_REPLACE, GFP_NOFS);
                btrfs_set_node_blockptr(parent, parent_slot,
                                        cow->start);
                btrfs_set_node_ptr_generation(parent, parent_slot,
                                              trans->transid);
                btrfs_mark_buffer_dirty(parent);
                if (last_ref) {
                        ret = tree_mod_log_free_eb(buf);
                        if (ret) {
                                btrfs_abort_transaction(trans, ret);
                                return ret;
                        }
                }
                btrfs_free_tree_block(trans, root, buf, parent_start,
                                      last_ref);
        }
        if (unlock_orig)
                btrfs_tree_unlock(buf);
        free_extent_buffer_stale(buf);
        btrfs_mark_buffer_dirty(cow);
        *cow_ret = cow;
        return 0;
}

/*
 * returns the tree mod log element for the oldest predecessor of the given
 * root.  entries older than time_seq are ignored.
 */
static struct tree_mod_elem *__tree_mod_log_oldest_root(
                struct extent_buffer *eb_root, u64 time_seq)
{
        struct tree_mod_elem *tm;
        struct tree_mod_elem *found = NULL;
        u64 root_logical = eb_root->start;
        int looped = 0;

        if (!time_seq)
                return NULL;

        /*
         * the very last operation that's logged for a root is the
         * replacement operation (if it is replaced at all). this has
         * the logical address of the *new* root, making it the very
         * first operation that's logged for this root.
         */
        while (1) {
                tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
                                                time_seq);
                if (!looped && !tm)
                        return NULL;
                /*
                 * if there are no tree operations for the oldest root, we
                 * simply return it. this should only happen if that (old)
                 * root is at level 0.
                 */
                if (!tm)
                        break;

                /*
                 * if there's an operation that's not a root replacement, we
                 * found the oldest version of our root. normally, we'll find a
                 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
                 */
                if (tm->op != MOD_LOG_ROOT_REPLACE)
                        break;

                found = tm;
                root_logical = tm->old_root.logical;
                looped = 1;
        }

        /* if there's no old root to return, return what we found instead */
        if (!found)
                found = tm;

        return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
                      u64 time_seq, struct tree_mod_elem *first_tm)
{
        u32 n;
        struct rb_node *next;
        struct tree_mod_elem *tm = first_tm;
        unsigned long o_dst;
        unsigned long o_src;
        unsigned long p_size = sizeof(struct btrfs_key_ptr);

        n = btrfs_header_nritems(eb);
        read_lock(&fs_info->tree_mod_log_lock);
        while (tm && tm->seq >= time_seq) {
                /*
                 * all the operations are recorded with the operator used for
                 * the modification. as we're going backwards, we do the
                 * opposite of each operation here.
                 */
                switch (tm->op) {
                case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
                        BUG_ON(tm->slot < n);
                        fallthrough;
                case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
                case MOD_LOG_KEY_REMOVE:
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        n++;
                        break;
                case MOD_LOG_KEY_REPLACE:
                        BUG_ON(tm->slot >= n);
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        break;
                case MOD_LOG_KEY_ADD:
                        /* if a move operation is needed it's in the log */
                        n--;
                        break;
                case MOD_LOG_MOVE_KEYS:
                        o_dst = btrfs_node_key_ptr_offset(tm->slot);
                        o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
                        memmove_extent_buffer(eb, o_dst, o_src,
                                              tm->move.nr_items * p_size);
                        break;
                case MOD_LOG_ROOT_REPLACE:
                        /*
                         * this operation is special. for roots, this must be
                         * handled explicitly before rewinding.
                         * for non-roots, this operation may exist if the node
                         * was a root: root A -> child B; then A gets empty and
                         * B is promoted to the new root. in the mod log, we'll
                         * have a root-replace operation for B, a tree block
                         * that is not a root. we simply ignore that operation.
                         */
                        break;
                }
                next = rb_next(&tm->node);
                if (!next)
                        break;
                tm = rb_entry(next, struct tree_mod_elem, node);
                if (tm->logical != first_tm->logical)
                        break;
        }
        read_unlock(&fs_info->tree_mod_log_lock);
        btrfs_set_header_nritems(eb, n);
}

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
                    struct extent_buffer *eb, u64 time_seq)
{
        struct extent_buffer *eb_rewin;
        struct tree_mod_elem *tm;

        if (!time_seq)
                return eb;

        if (btrfs_header_level(eb) == 0)
                return eb;

        tm = tree_mod_log_search(fs_info, eb->start, time_seq);
        if (!tm)
                return eb;

        btrfs_set_path_blocking(path);
        btrfs_set_lock_blocking_read(eb);

        if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                BUG_ON(tm->slot != 0);
                eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
                if (!eb_rewin) {
                        btrfs_tree_read_unlock_blocking(eb);
                        free_extent_buffer(eb);
                        return NULL;
                }
                btrfs_set_header_bytenr(eb_rewin, eb->start);
                btrfs_set_header_backref_rev(eb_rewin,
                                             btrfs_header_backref_rev(eb));
                btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
                btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
        } else {
                eb_rewin = btrfs_clone_extent_buffer(eb);
                if (!eb_rewin) {
                        btrfs_tree_read_unlock_blocking(eb);
                        free_extent_buffer(eb);
                        return NULL;
                }
        }

        btrfs_tree_read_unlock_blocking(eb);
        free_extent_buffer(eb);

        btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
                                       eb_rewin, btrfs_header_level(eb_rewin));
        btrfs_tree_read_lock(eb_rewin);
        __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
        WARN_ON(btrfs_header_nritems(eb_rewin) >
                BTRFS_NODEPTRS_PER_BLOCK(fs_info));

        return eb_rewin;
}

/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->root_node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct tree_mod_elem *tm;
        struct extent_buffer *eb = NULL;
        struct extent_buffer *eb_root;
        u64 eb_root_owner = 0;
        struct extent_buffer *old;
        struct tree_mod_root *old_root = NULL;
        u64 old_generation = 0;
        u64 logical;
        int level;

        eb_root = btrfs_read_lock_root_node(root);
        tm = __tree_mod_log_oldest_root(eb_root, time_seq);
        if (!tm)
                return eb_root;

        if (tm->op == MOD_LOG_ROOT_REPLACE) {
                old_root = &tm->old_root;
                old_generation = tm->generation;
                logical = old_root->logical;
                level = old_root->level;
        } else {
                logical = eb_root->start;
                level = btrfs_header_level(eb_root);
        }

        tm = tree_mod_log_search(fs_info, logical, time_seq);
        if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
                old = read_tree_block(fs_info, logical, 0, level, NULL);
                if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
                        if (!IS_ERR(old))
                                free_extent_buffer(old);
                        btrfs_warn(fs_info,
                                   "failed to read tree block %llu from get_old_root",
                                   logical);
                } else {
                        eb = btrfs_clone_extent_buffer(old);
                        free_extent_buffer(old);
                }
        } else if (old_root) {
                eb_root_owner = btrfs_header_owner(eb_root);
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
                eb = alloc_dummy_extent_buffer(fs_info, logical);
        } else {
                btrfs_set_lock_blocking_read(eb_root);
                eb = btrfs_clone_extent_buffer(eb_root);
                btrfs_tree_read_unlock_blocking(eb_root);
                free_extent_buffer(eb_root);
        }

        if (!eb)
                return NULL;
        if (old_root) {
                btrfs_set_header_bytenr(eb, eb->start);
                btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
                btrfs_set_header_owner(eb, eb_root_owner);
                btrfs_set_header_level(eb, old_root->level);
                btrfs_set_header_generation(eb, old_generation);
        }
        btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
                                       btrfs_header_level(eb));
        btrfs_tree_read_lock(eb);
        if (tm)
                __tree_mod_log_rewind(fs_info, eb, time_seq, tm);
        else
                WARN_ON(btrfs_header_level(eb) != 0);
        WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

        return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
        struct tree_mod_elem *tm;
        int level;
        struct extent_buffer *eb_root = btrfs_root_node(root);

        tm = __tree_mod_log_oldest_root(eb_root, time_seq);
        if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
                level = tm->old_root.level;
        } else {
                level = btrfs_header_level(eb_root);
        }
        free_extent_buffer(eb_root);

        return level;
}
1410
1411static inline int should_cow_block(struct btrfs_trans_handle *trans,
1412                                   struct btrfs_root *root,
1413                                   struct extent_buffer *buf)
1414{
1415        if (btrfs_is_testing(root->fs_info))
1416                return 0;
1417
1418        /* Ensure we can see the FORCE_COW bit */
1419        smp_mb__before_atomic();
1420
1421        /*
1422         * We do not need to COW a block if:
1423         * 1) this block is not created or changed in this transaction;
1424         * 2) this block does not belong to the TREE_RELOC tree;
1425         * 3) the root is not forced to COW.
1426         *
1427         * What is forced COW:
1428         *    when we create a snapshot while committing the transaction,
1429         *    after we've finished copying the src root, we must COW the
1430         *    shared block to ensure metadata consistency.
1431         */
1432        if (btrfs_header_generation(buf) == trans->transid &&
1433            !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1434            !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1435              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1436            !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1437                return 0;
1438        return 1;
1439}
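
/*
 * Illustrative sketch, not part of btrfs: the decision above boils down to a
 * pure predicate over four facts about the block and its root.  A minimal
 * model (the function and parameter names are hypothetical):
 */
static inline int model_should_cow(bool created_in_this_trans, bool written,
				   bool shared_reloc_block, bool force_cow)
{
	/* COW can be skipped only when every skip condition holds at once. */
	if (created_in_this_trans && !written && !shared_reloc_block &&
	    !force_cow)
		return 0;
	return 1;
}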
1440
1441/*
1442 * COWs a single block; see __btrfs_cow_block() for the real work.
1443 * This version has extra checks so that a block isn't COWed more than
1444 * once per transaction, as long as it hasn't been written yet.
1445 */
1446noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1447                    struct btrfs_root *root, struct extent_buffer *buf,
1448                    struct extent_buffer *parent, int parent_slot,
1449                    struct extent_buffer **cow_ret)
1450{
1451        struct btrfs_fs_info *fs_info = root->fs_info;
1452        u64 search_start;
1453        int ret;
1454
1455        if (test_bit(BTRFS_ROOT_DELETING, &root->state))
1456                btrfs_err(fs_info,
1457                        "COW'ing blocks on a fs root that's being dropped");
1458
1459        if (trans->transaction != fs_info->running_transaction)
1460                WARN(1, KERN_CRIT "trans %llu running %llu\n",
1461                       trans->transid,
1462                       fs_info->running_transaction->transid);
1463
1464        if (trans->transid != fs_info->generation)
1465                WARN(1, KERN_CRIT "trans %llu running generation %llu\n",
1466                       trans->transid, fs_info->generation);
1467
1468        if (!should_cow_block(trans, root, buf)) {
1469                trans->dirty = true;
1470                *cow_ret = buf;
1471                return 0;
1472        }
1473
1474        search_start = buf->start & ~((u64)SZ_1G - 1);
1475
1476        if (parent)
1477                btrfs_set_lock_blocking_write(parent);
1478        btrfs_set_lock_blocking_write(buf);
1479
1480        /*
1481         * Before CoWing this block for later modification, check if it's
1482         * the subtree root and do the delayed subtree trace if needed.
1483         *
1484         * Also, we don't care about the error, as it's handled internally.
1485         */
1486        btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
1487        ret = __btrfs_cow_block(trans, root, buf, parent,
1488                                 parent_slot, cow_ret, search_start, 0);
1489
1490        trace_btrfs_cow_block(root, buf, *cow_ret);
1491
1492        return ret;
1493}
1494
1495/*
1496 * helper function for defrag to decide if two blocks pointed to by a
1497 * node are actually close by
1498 */
1499static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1500{
1501        if (blocknr < other && other - (blocknr + blocksize) < 32768)
1502                return 1;
1503        if (blocknr > other && blocknr - (other + blocksize) < 32768)
1504                return 1;
1505        return 0;
1506}
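
/*
 * Hypothetical self-check, for illustration only: with a 16KiB nodesize,
 * two blocks count as close when the gap between them is under 32KiB.
 */
static inline void close_blocks_example(void)
{
	u32 nodesize = 16384;	/* assumed nodesize for this sketch */

	/* 140000 - (100000 + 16384) = 23616 < 32768 -> close */
	WARN_ON(close_blocks(100000, 140000, nodesize) != 1);
	/* 200000 - (100000 + 16384) = 83616 >= 32768 -> not close */
	WARN_ON(close_blocks(100000, 200000, nodesize) != 0);
}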
1507
1508#ifdef __LITTLE_ENDIAN
1509
1510/*
1511 * Compare two keys; on little-endian the disk order is the same as CPU order and
1512 * we can avoid the conversion.
1513 */
1514static int comp_keys(const struct btrfs_disk_key *disk_key,
1515                     const struct btrfs_key *k2)
1516{
1517        const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
1518
1519        return btrfs_comp_cpu_keys(k1, k2);
1520}
1521
1522#else
1523
1524/*
1525 * compare two keys in a memcmp fashion
1526 */
1527static int comp_keys(const struct btrfs_disk_key *disk,
1528                     const struct btrfs_key *k2)
1529{
1530        struct btrfs_key k1;
1531
1532        btrfs_disk_key_to_cpu(&k1, disk);
1533
1534        return btrfs_comp_cpu_keys(&k1, k2);
1535}
1536#endif
1537
1538/*
1539 * same as comp_keys only with two btrfs_key's
1540 */
1541int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1542{
1543        if (k1->objectid > k2->objectid)
1544                return 1;
1545        if (k1->objectid < k2->objectid)
1546                return -1;
1547        if (k1->type > k2->type)
1548                return 1;
1549        if (k1->type < k2->type)
1550                return -1;
1551        if (k1->offset > k2->offset)
1552                return 1;
1553        if (k1->offset < k2->offset)
1554                return -1;
1555        return 0;
1556}
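
/*
 * Hypothetical self-check, for illustration only: keys order first by
 * objectid, then by type, then by offset, so a smaller type wins even
 * against a much larger offset.
 */
static inline void comp_cpu_keys_example(void)
{
	struct btrfs_key a = { .objectid = 256, .type = 1, .offset = 100 };
	struct btrfs_key b = { .objectid = 256, .type = 2, .offset = 0 };

	/* same objectid, a.type < b.type -> a sorts first */
	WARN_ON(btrfs_comp_cpu_keys(&a, &b) != -1);
	/* the comparison is antisymmetric */
	WARN_ON(btrfs_comp_cpu_keys(&b, &a) != 1);
}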
1557
1558/*
1559 * this is used by the defrag code to go through all the
1560 * leaves pointed to by a node and reallocate them so that
1561 * disk order is close to key order
1562 */
1563int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1564                       struct btrfs_root *root, struct extent_buffer *parent,
1565                       int start_slot, u64 *last_ret,
1566                       struct btrfs_key *progress)
1567{
1568        struct btrfs_fs_info *fs_info = root->fs_info;
1569        struct extent_buffer *cur;
1570        u64 blocknr;
1571        u64 gen;
1572        u64 search_start = *last_ret;
1573        u64 last_block = 0;
1574        u64 other;
1575        u32 parent_nritems;
1576        int end_slot;
1577        int i;
1578        int err = 0;
1579        int parent_level;
1580        int uptodate;
1581        u32 blocksize;
1582        int progress_passed = 0;
1583        struct btrfs_disk_key disk_key;
1584
1585        parent_level = btrfs_header_level(parent);
1586
1587        WARN_ON(trans->transaction != fs_info->running_transaction);
1588        WARN_ON(trans->transid != fs_info->generation);
1589
1590        parent_nritems = btrfs_header_nritems(parent);
1591        blocksize = fs_info->nodesize;
1592        end_slot = parent_nritems - 1;
1593
1594        if (parent_nritems <= 1)
1595                return 0;
1596
1597        btrfs_set_lock_blocking_write(parent);
1598
1599        for (i = start_slot; i <= end_slot; i++) {
1600                struct btrfs_key first_key;
1601                int close = 1;
1602
1603                btrfs_node_key(parent, &disk_key, i);
1604                if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1605                        continue;
1606
1607                progress_passed = 1;
1608                blocknr = btrfs_node_blockptr(parent, i);
1609                gen = btrfs_node_ptr_generation(parent, i);
1610                btrfs_node_key_to_cpu(parent, &first_key, i);
1611                if (last_block == 0)
1612                        last_block = blocknr;
1613
1614                if (i > 0) {
1615                        other = btrfs_node_blockptr(parent, i - 1);
1616                        close = close_blocks(blocknr, other, blocksize);
1617                }
1618                if (!close && i < end_slot) {
1619                        other = btrfs_node_blockptr(parent, i + 1);
1620                        close = close_blocks(blocknr, other, blocksize);
1621                }
1622                if (close) {
1623                        last_block = blocknr;
1624                        continue;
1625                }
1626
1627                cur = find_extent_buffer(fs_info, blocknr);
1628                if (cur)
1629                        uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1630                else
1631                        uptodate = 0;
1632                if (!cur || !uptodate) {
1633                        if (!cur) {
1634                                cur = read_tree_block(fs_info, blocknr, gen,
1635                                                      parent_level - 1,
1636                                                      &first_key);
1637                                if (IS_ERR(cur)) {
1638                                        return PTR_ERR(cur);
1639                                } else if (!extent_buffer_uptodate(cur)) {
1640                                        free_extent_buffer(cur);
1641                                        return -EIO;
1642                                }
1643                        } else if (!uptodate) {
1644                                err = btrfs_read_buffer(cur, gen,
1645                                                parent_level - 1, &first_key);
1646                                if (err) {
1647                                        free_extent_buffer(cur);
1648                                        return err;
1649                                }
1650                        }
1651                }
1652                if (search_start == 0)
1653                        search_start = last_block;
1654
1655                btrfs_tree_lock(cur);
1656                btrfs_set_lock_blocking_write(cur);
1657                err = __btrfs_cow_block(trans, root, cur, parent, i,
1658                                        &cur, search_start,
1659                                        min(16 * blocksize,
1660                                            (end_slot - i) * blocksize));
1661                if (err) {
1662                        btrfs_tree_unlock(cur);
1663                        free_extent_buffer(cur);
1664                        break;
1665                }
1666                search_start = cur->start;
1667                last_block = cur->start;
1668                *last_ret = search_start;
1669                btrfs_tree_unlock(cur);
1670                free_extent_buffer(cur);
1671        }
1672        return err;
1673}
1674
1675/*
1676 * Search for key in the extent_buffer.  The items start at offset p,
1677 * and they are item_size apart.  There are 'max' items in p.
1678 *
1679 * The slot in the array is returned via slot, and it points to
1680 * the place where you would insert key if it is not found in
1681 * the array.
1682 *
1683 * slot may point to max if the key is bigger than all of the keys.
1684 */
1685static noinline int generic_bin_search(struct extent_buffer *eb,
1686                                       unsigned long p, int item_size,
1687                                       const struct btrfs_key *key,
1688                                       int max, int *slot)
1689{
1690        int low = 0;
1691        int high = max;
1692        int ret;
1693        const int key_size = sizeof(struct btrfs_disk_key);
1694
1695        if (low > high) {
1696                btrfs_err(eb->fs_info,
1697                 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1698                          __func__, low, high, eb->start,
1699                          btrfs_header_owner(eb), btrfs_header_level(eb));
1700                return -EINVAL;
1701        }
1702
1703        while (low < high) {
1704                unsigned long oip;
1705                unsigned long offset;
1706                struct btrfs_disk_key *tmp;
1707                struct btrfs_disk_key unaligned;
1708                int mid;
1709
1710                mid = (low + high) / 2;
1711                offset = p + mid * item_size;
1712                oip = offset_in_page(offset);
1713
1714                if (oip + key_size <= PAGE_SIZE) {
1715                        const unsigned long idx = offset >> PAGE_SHIFT;
1716                        char *kaddr = page_address(eb->pages[idx]);
1717
1718                        tmp = (struct btrfs_disk_key *)(kaddr + oip);
1719                } else {
1720                        read_extent_buffer(eb, &unaligned, offset, key_size);
1721                        tmp = &unaligned;
1722                }
1723
1724                ret = comp_keys(tmp, key);
1725
1726                if (ret < 0)
1727                        low = mid + 1;
1728                else if (ret > 0)
1729                        high = mid;
1730                else {
1731                        *slot = mid;
1732                        return 0;
1733                }
1734        }
1735        *slot = low;
1736        return 1;
1737}
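
/*
 * Minimal model of the slot contract above, not part of btrfs: on a miss,
 * 1 is returned and *slot is the insertion point, which may equal max.
 */
static inline int model_bin_search(const u64 *keys, int max, u64 key,
				   int *slot)
{
	int low = 0;
	int high = max;

	while (low < high) {
		int mid = (low + high) / 2;

		if (keys[mid] < key)
			low = mid + 1;
		else if (keys[mid] > key)
			high = mid;
		else {
			*slot = mid;
			return 0;	/* exact match */
		}
	}
	*slot = low;	/* insertion point; may point past the last key */
	return 1;
}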
1738
1739/*
1740 * simple bin_search frontend that does the right thing for
1741 * leaves vs nodes
1742 */
1743int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1744                     int *slot)
1745{
1746        if (btrfs_header_level(eb) == 0)
1747                return generic_bin_search(eb,
1748                                          offsetof(struct btrfs_leaf, items),
1749                                          sizeof(struct btrfs_item),
1750                                          key, btrfs_header_nritems(eb),
1751                                          slot);
1752        else
1753                return generic_bin_search(eb,
1754                                          offsetof(struct btrfs_node, ptrs),
1755                                          sizeof(struct btrfs_key_ptr),
1756                                          key, btrfs_header_nritems(eb),
1757                                          slot);
1758}
1759
1760static void root_add_used(struct btrfs_root *root, u32 size)
1761{
1762        spin_lock(&root->accounting_lock);
1763        btrfs_set_root_used(&root->root_item,
1764                            btrfs_root_used(&root->root_item) + size);
1765        spin_unlock(&root->accounting_lock);
1766}
1767
1768static void root_sub_used(struct btrfs_root *root, u32 size)
1769{
1770        spin_lock(&root->accounting_lock);
1771        btrfs_set_root_used(&root->root_item,
1772                            btrfs_root_used(&root->root_item) - size);
1773        spin_unlock(&root->accounting_lock);
1774}
1775
1776/* Given a node and slot number, this reads the block it points to.  The
1777 * extent buffer is returned with a reference taken (but unlocked).
1778 */
1779struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
1780                                           int slot)
1781{
1782        int level = btrfs_header_level(parent);
1783        struct extent_buffer *eb;
1784        struct btrfs_key first_key;
1785
1786        if (slot < 0 || slot >= btrfs_header_nritems(parent))
1787                return ERR_PTR(-ENOENT);
1788
1789        BUG_ON(level == 0);
1790
1791        btrfs_node_key_to_cpu(parent, &first_key, slot);
1792        eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
1793                             btrfs_node_ptr_generation(parent, slot),
1794                             level - 1, &first_key);
1795        if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1796                free_extent_buffer(eb);
1797                eb = ERR_PTR(-EIO);
1798        }
1799
1800        return eb;
1801}
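
/*
 * Usage sketch with a hypothetical caller, for illustration only: the
 * returned buffer carries a reference but no lock, so the caller must take
 * any lock it needs and always drop the reference.
 */
static inline int read_child_example(struct extent_buffer *parent, int slot)
{
	struct extent_buffer *child;

	child = btrfs_read_node_slot(parent, slot);
	if (IS_ERR(child))
		return PTR_ERR(child);

	/* ... use 'child' here, locking it if required ... */

	free_extent_buffer(child);	/* drop the reference we were given */
	return 0;
}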
1802
1803/*
1804 * node level balancing, used to make sure nodes are in proper order for
1805 * item deletion.  We balance from the top down, so we have to make sure
1806 * that a deletion won't leave a node completely empty later on.
1807 */
1808static noinline int balance_level(struct btrfs_trans_handle *trans,
1809                         struct btrfs_root *root,
1810                         struct btrfs_path *path, int level)
1811{
1812        struct btrfs_fs_info *fs_info = root->fs_info;
1813        struct extent_buffer *right = NULL;
1814        struct extent_buffer *mid;
1815        struct extent_buffer *left = NULL;
1816        struct extent_buffer *parent = NULL;
1817        int ret = 0;
1818        int wret;
1819        int pslot;
1820        int orig_slot = path->slots[level];
1821        u64 orig_ptr;
1822
1823        ASSERT(level > 0);
1824
1825        mid = path->nodes[level];
1826
1827        WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1828                path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1829        WARN_ON(btrfs_header_generation(mid) != trans->transid);
1830
1831        orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1832
1833        if (level < BTRFS_MAX_LEVEL - 1) {
1834                parent = path->nodes[level + 1];
1835                pslot = path->slots[level + 1];
1836        }
1837
1838        /*
1839         * deal with the case where there is only one pointer in the root
1840         * by promoting the node below to a root
1841         */
1842        if (!parent) {
1843                struct extent_buffer *child;
1844
1845                if (btrfs_header_nritems(mid) != 1)
1846                        return 0;
1847
1848                /* promote the child to a root */
1849                child = btrfs_read_node_slot(mid, 0);
1850                if (IS_ERR(child)) {
1851                        ret = PTR_ERR(child);
1852                        btrfs_handle_fs_error(fs_info, ret, NULL);
1853                        goto enospc;
1854                }
1855
1856                btrfs_tree_lock(child);
1857                btrfs_set_lock_blocking_write(child);
1858                ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1859                if (ret) {
1860                        btrfs_tree_unlock(child);
1861                        free_extent_buffer(child);
1862                        goto enospc;
1863                }
1864
1865                ret = tree_mod_log_insert_root(root->node, child, 1);
1866                BUG_ON(ret < 0);
1867                rcu_assign_pointer(root->node, child);
1868
1869                add_root_to_dirty_list(root);
1870                btrfs_tree_unlock(child);
1871
1872                path->locks[level] = 0;
1873                path->nodes[level] = NULL;
1874                btrfs_clean_tree_block(mid);
1875                btrfs_tree_unlock(mid);
1876                /* once for the path */
1877                free_extent_buffer(mid);
1878
1879                root_sub_used(root, mid->len);
1880                btrfs_free_tree_block(trans, root, mid, 0, 1);
1881                /* once for the root ptr */
1882                free_extent_buffer_stale(mid);
1883                return 0;
1884        }
1885        if (btrfs_header_nritems(mid) >
1886            BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1887                return 0;
1888
1889        left = btrfs_read_node_slot(parent, pslot - 1);
1890        if (IS_ERR(left))
1891                left = NULL;
1892
1893        if (left) {
1894                btrfs_tree_lock(left);
1895                btrfs_set_lock_blocking_write(left);
1896                wret = btrfs_cow_block(trans, root, left,
1897                                       parent, pslot - 1, &left);
1898                if (wret) {
1899                        ret = wret;
1900                        goto enospc;
1901                }
1902        }
1903
1904        right = btrfs_read_node_slot(parent, pslot + 1);
1905        if (IS_ERR(right))
1906                right = NULL;
1907
1908        if (right) {
1909                btrfs_tree_lock(right);
1910                btrfs_set_lock_blocking_write(right);
1911                wret = btrfs_cow_block(trans, root, right,
1912                                       parent, pslot + 1, &right);
1913                if (wret) {
1914                        ret = wret;
1915                        goto enospc;
1916                }
1917        }
1918
1919        /* first, try to make some room in the middle buffer */
1920        if (left) {
1921                orig_slot += btrfs_header_nritems(left);
1922                wret = push_node_left(trans, left, mid, 1);
1923                if (wret < 0)
1924                        ret = wret;
1925        }
1926
1927        /*
1928         * then try to empty the rightmost buffer into the middle
1929         */
1930        if (right) {
1931                wret = push_node_left(trans, mid, right, 1);
1932                if (wret < 0 && wret != -ENOSPC)
1933                        ret = wret;
1934                if (btrfs_header_nritems(right) == 0) {
1935                        btrfs_clean_tree_block(right);
1936                        btrfs_tree_unlock(right);
1937                        del_ptr(root, path, level + 1, pslot + 1);
1938                        root_sub_used(root, right->len);
1939                        btrfs_free_tree_block(trans, root, right, 0, 1);
1940                        free_extent_buffer_stale(right);
1941                        right = NULL;
1942                } else {
1943                        struct btrfs_disk_key right_key;
1944                        btrfs_node_key(right, &right_key, 0);
1945                        ret = tree_mod_log_insert_key(parent, pslot + 1,
1946                                        MOD_LOG_KEY_REPLACE, GFP_NOFS);
1947                        BUG_ON(ret < 0);
1948                        btrfs_set_node_key(parent, &right_key, pslot + 1);
1949                        btrfs_mark_buffer_dirty(parent);
1950                }
1951        }
1952        if (btrfs_header_nritems(mid) == 1) {
1953                /*
1954                 * we're not allowed to leave a node with one item in the
1955                 * tree during a delete.  A deletion from lower in the tree
1956                 * could try to delete the only pointer in this node.
1957                 * So, pull some keys from the left.
1958                 * There has to be a left pointer at this point because
1959                 * otherwise we would have pulled some pointers from the
1960                 * right.
1961                 */
1962                if (!left) {
1963                        ret = -EROFS;
1964                        btrfs_handle_fs_error(fs_info, ret, NULL);
1965                        goto enospc;
1966                }
1967                wret = balance_node_right(trans, mid, left);
1968                if (wret < 0) {
1969                        ret = wret;
1970                        goto enospc;
1971                }
1972                if (wret == 1) {
1973                        wret = push_node_left(trans, left, mid, 1);
1974                        if (wret < 0)
1975                                ret = wret;
1976                }
1977                BUG_ON(wret == 1);
1978        }
1979        if (btrfs_header_nritems(mid) == 0) {
1980                btrfs_clean_tree_block(mid);
1981                btrfs_tree_unlock(mid);
1982                del_ptr(root, path, level + 1, pslot);
1983                root_sub_used(root, mid->len);
1984                btrfs_free_tree_block(trans, root, mid, 0, 1);
1985                free_extent_buffer_stale(mid);
1986                mid = NULL;
1987        } else {
1988                /* update the parent key to reflect our changes */
1989                struct btrfs_disk_key mid_key;
1990                btrfs_node_key(mid, &mid_key, 0);
1991                ret = tree_mod_log_insert_key(parent, pslot,
1992                                MOD_LOG_KEY_REPLACE, GFP_NOFS);
1993                BUG_ON(ret < 0);
1994                btrfs_set_node_key(parent, &mid_key, pslot);
1995                btrfs_mark_buffer_dirty(parent);
1996        }
1997
1998        /* update the path */
1999        if (left) {
2000                if (btrfs_header_nritems(left) > orig_slot) {
2001                        atomic_inc(&left->refs);
2002                        /* left was locked after cow */
2003                        path->nodes[level] = left;
2004                        path->slots[level + 1] -= 1;
2005                        path->slots[level] = orig_slot;
2006                        if (mid) {
2007                                btrfs_tree_unlock(mid);
2008                                free_extent_buffer(mid);
2009                        }
2010                } else {
2011                        orig_slot -= btrfs_header_nritems(left);
2012                        path->slots[level] = orig_slot;
2013                }
2014        }
2015        /* double check we haven't messed things up */
2016        if (orig_ptr !=
2017            btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2018                BUG();
2019enospc:
2020        if (right) {
2021                btrfs_tree_unlock(right);
2022                free_extent_buffer(right);
2023        }
2024        if (left) {
2025                if (path->nodes[level] != left)
2026                        btrfs_tree_unlock(left);
2027                free_extent_buffer(left);
2028        }
2029        return ret;
2030}
2031
2032/* Node balancing for insertion.  Here we only split or push nodes around
2033 * when they are completely full.  This is also done top down, so we
2034 * have to be pessimistic.
2035 */
2036static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2037                                          struct btrfs_root *root,
2038                                          struct btrfs_path *path, int level)
2039{
2040        struct btrfs_fs_info *fs_info = root->fs_info;
2041        struct extent_buffer *right = NULL;
2042        struct extent_buffer *mid;
2043        struct extent_buffer *left = NULL;
2044        struct extent_buffer *parent = NULL;
2045        int ret = 0;
2046        int wret;
2047        int pslot;
2048        int orig_slot = path->slots[level];
2049
2050        if (level == 0)
2051                return 1;
2052
2053        mid = path->nodes[level];
2054        WARN_ON(btrfs_header_generation(mid) != trans->transid);
2055
2056        if (level < BTRFS_MAX_LEVEL - 1) {
2057                parent = path->nodes[level + 1];
2058                pslot = path->slots[level + 1];
2059        }
2060
2061        if (!parent)
2062                return 1;
2063
2064        left = btrfs_read_node_slot(parent, pslot - 1);
2065        if (IS_ERR(left))
2066                left = NULL;
2067
2068        /* first, try to make some room in the middle buffer */
2069        if (left) {
2070                u32 left_nr;
2071
2072                btrfs_tree_lock(left);
2073                btrfs_set_lock_blocking_write(left);
2074
2075                left_nr = btrfs_header_nritems(left);
2076                if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2077                        wret = 1;
2078                } else {
2079                        ret = btrfs_cow_block(trans, root, left, parent,
2080                                              pslot - 1, &left);
2081                        if (ret)
2082                                wret = 1;
2083                        else {
2084                                wret = push_node_left(trans, left, mid, 0);
2085                        }
2086                }
2087                if (wret < 0)
2088                        ret = wret;
2089                if (wret == 0) {
2090                        struct btrfs_disk_key disk_key;
2091                        orig_slot += left_nr;
2092                        btrfs_node_key(mid, &disk_key, 0);
2093                        ret = tree_mod_log_insert_key(parent, pslot,
2094                                        MOD_LOG_KEY_REPLACE, GFP_NOFS);
2095                        BUG_ON(ret < 0);
2096                        btrfs_set_node_key(parent, &disk_key, pslot);
2097                        btrfs_mark_buffer_dirty(parent);
2098                        if (btrfs_header_nritems(left) > orig_slot) {
2099                                path->nodes[level] = left;
2100                                path->slots[level + 1] -= 1;
2101                                path->slots[level] = orig_slot;
2102                                btrfs_tree_unlock(mid);
2103                                free_extent_buffer(mid);
2104                        } else {
2105                                orig_slot -=
2106                                        btrfs_header_nritems(left);
2107                                path->slots[level] = orig_slot;
2108                                btrfs_tree_unlock(left);
2109                                free_extent_buffer(left);
2110                        }
2111                        return 0;
2112                }
2113                btrfs_tree_unlock(left);
2114                free_extent_buffer(left);
2115        }
2116        right = btrfs_read_node_slot(parent, pslot + 1);
2117        if (IS_ERR(right))
2118                right = NULL;
2119
2120        /*
2121         * then try to empty the rightmost buffer into the middle
2122         */
2123        if (right) {
2124                u32 right_nr;
2125
2126                btrfs_tree_lock(right);
2127                btrfs_set_lock_blocking_write(right);
2128
2129                right_nr = btrfs_header_nritems(right);
2130                if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2131                        wret = 1;
2132                } else {
2133                        ret = btrfs_cow_block(trans, root, right,
2134                                              parent, pslot + 1,
2135                                              &right);
2136                        if (ret)
2137                                wret = 1;
2138                        else {
2139                                wret = balance_node_right(trans, right, mid);
2140                        }
2141                }
2142                if (wret < 0)
2143                        ret = wret;
2144                if (wret == 0) {
2145                        struct btrfs_disk_key disk_key;
2146
2147                        btrfs_node_key(right, &disk_key, 0);
2148                        ret = tree_mod_log_insert_key(parent, pslot + 1,
2149                                        MOD_LOG_KEY_REPLACE, GFP_NOFS);
2150                        BUG_ON(ret < 0);
2151                        btrfs_set_node_key(parent, &disk_key, pslot + 1);
2152                        btrfs_mark_buffer_dirty(parent);
2153
2154                        if (btrfs_header_nritems(mid) <= orig_slot) {
2155                                path->nodes[level] = right;
2156                                path->slots[level + 1] += 1;
2157                                path->slots[level] = orig_slot -
2158                                        btrfs_header_nritems(mid);
2159                                btrfs_tree_unlock(mid);
2160                                free_extent_buffer(mid);
2161                        } else {
2162                                btrfs_tree_unlock(right);
2163                                free_extent_buffer(right);
2164                        }
2165                        return 0;
2166                }
2167                btrfs_tree_unlock(right);
2168                free_extent_buffer(right);
2169        }
2170        return 1;
2171}
2172
2173/*
2174 * Readahead one full node of leaves, finding things that are close
2175 * to the block in 'slot', and triggering readahead on them.
2176 */
2177static void reada_for_search(struct btrfs_fs_info *fs_info,
2178                             struct btrfs_path *path,
2179                             int level, int slot, u64 objectid)
2180{
2181        struct extent_buffer *node;
2182        struct btrfs_disk_key disk_key;
2183        u32 nritems;
2184        u64 search;
2185        u64 target;
2186        u64 nread = 0;
2187        struct extent_buffer *eb;
2188        u32 nr;
2189        u32 blocksize;
2190        u32 nscan = 0;
2191
2192        if (level != 1)
2193                return;
2194
2195        if (!path->nodes[level])
2196                return;
2197
2198        node = path->nodes[level];
2199
2200        search = btrfs_node_blockptr(node, slot);
2201        blocksize = fs_info->nodesize;
2202        eb = find_extent_buffer(fs_info, search);
2203        if (eb) {
2204                free_extent_buffer(eb);
2205                return;
2206        }
2207
2208        target = search;
2209
2210        nritems = btrfs_header_nritems(node);
2211        nr = slot;
2212
2213        while (1) {
2214                if (path->reada == READA_BACK) {
2215                        if (nr == 0)
2216                                break;
2217                        nr--;
2218                } else if (path->reada == READA_FORWARD) {
2219                        nr++;
2220                        if (nr >= nritems)
2221                                break;
2222                }
2223                if (path->reada == READA_BACK && objectid) {
2224                        btrfs_node_key(node, &disk_key, nr);
2225                        if (btrfs_disk_key_objectid(&disk_key) != objectid)
2226                                break;
2227                }
2228                search = btrfs_node_blockptr(node, nr);
2229                if ((search <= target && target - search <= 65536) ||
2230                    (search > target && search - target <= 65536)) {
2231                        readahead_tree_block(fs_info, search);
2232                        nread += blocksize;
2233                }
2234                nscan++;
2235                if (nread > 65536 || nscan > 32)
2236                        break;
2237        }
2238}
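
/*
 * Illustrative helper, not part of btrfs: the loop above only issues
 * readahead for blocks within 64KiB of the original target, scanning at
 * most 32 slots and reading at most 64KiB in total.
 */
static inline bool within_reada_window(u64 target, u64 search)
{
	return (search <= target && target - search <= 65536) ||
	       (search > target && search - target <= 65536);
}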
2239
2240static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2241                                       struct btrfs_path *path, int level)
2242{
2243        int slot;
2244        int nritems;
2245        struct extent_buffer *parent;
2246        struct extent_buffer *eb;
2247        u64 gen;
2248        u64 block1 = 0;
2249        u64 block2 = 0;
2250
2251        parent = path->nodes[level + 1];
2252        if (!parent)
2253                return;
2254
2255        nritems = btrfs_header_nritems(parent);
2256        slot = path->slots[level + 1];
2257
2258        if (slot > 0) {
2259                block1 = btrfs_node_blockptr(parent, slot - 1);
2260                gen = btrfs_node_ptr_generation(parent, slot - 1);
2261                eb = find_extent_buffer(fs_info, block1);
2262                /*
2263                 * if we get -EAGAIN from btrfs_buffer_uptodate, we
2264                 * don't want to return -EAGAIN here.  That will loop
2265                 * forever.
2266                 */
2267                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2268                        block1 = 0;
2269                free_extent_buffer(eb);
2270        }
2271        if (slot + 1 < nritems) {
2272                block2 = btrfs_node_blockptr(parent, slot + 1);
2273                gen = btrfs_node_ptr_generation(parent, slot + 1);
2274                eb = find_extent_buffer(fs_info, block2);
2275                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2276                        block2 = 0;
2277                free_extent_buffer(eb);
2278        }
2279
2280        if (block1)
2281                readahead_tree_block(fs_info, block1);
2282        if (block2)
2283                readahead_tree_block(fs_info, block2);
2284}
2285
2286
2287/*
2288 * when we walk down the tree, it is usually safe to unlock the higher layers
2289 * in the tree.  The exceptions are when our path goes through slot 0, because
2290 * operations on the tree might require changing key pointers higher up in the
2291 * tree.
2292 *
2293 * callers might also have set path->keep_locks, which tells this code to keep
2294 * the lock if the path points to the last slot in the block.  This is part of
2295 * walking through the tree, and selecting the next slot in the higher block.
2296 *
2297 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  So
2298 * if lowest_unlock is 1, level 0 won't be unlocked.
2299 */
2300static noinline void unlock_up(struct btrfs_path *path, int level,
2301                               int lowest_unlock, int min_write_lock_level,
2302                               int *write_lock_level)
2303{
2304        int i;
2305        int skip_level = level;
2306        int no_skips = 0;
2307        struct extent_buffer *t;
2308
2309        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2310                if (!path->nodes[i])
2311                        break;
2312                if (!path->locks[i])
2313                        break;
2314                if (!no_skips && path->slots[i] == 0) {
2315                        skip_level = i + 1;
2316                        continue;
2317                }
2318                if (!no_skips && path->keep_locks) {
2319                        u32 nritems;
2320                        t = path->nodes[i];
2321                        nritems = btrfs_header_nritems(t);
2322                        if (nritems < 1 || path->slots[i] >= nritems - 1) {
2323                                skip_level = i + 1;
2324                                continue;
2325                        }
2326                }
2327                if (skip_level < i && i >= lowest_unlock)
2328                        no_skips = 1;
2329
2330                t = path->nodes[i];
2331                if (i >= lowest_unlock && i > skip_level) {
2332                        btrfs_tree_unlock_rw(t, path->locks[i]);
2333                        path->locks[i] = 0;
2334                        if (write_lock_level &&
2335                            i > min_write_lock_level &&
2336                            i <= *write_lock_level) {
2337                                *write_lock_level = i - 1;
2338                        }
2339                }
2340        }
2341}
2342
2343/*
2344 * helper function for btrfs_search_slot.  The goal is to find a block
2345 * in cache without setting the path to blocking.  If we find the block
2346 * we return zero and the path is unchanged.
2347 *
2348 * If we can't find the block, we set the path blocking and do some
2349 * readahead.  -EAGAIN is returned and the search must be repeated.
2350 */
2351static int
2352read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2353                      struct extent_buffer **eb_ret, int level, int slot,
2354                      const struct btrfs_key *key)
2355{
2356        struct btrfs_fs_info *fs_info = root->fs_info;
2357        u64 blocknr;
2358        u64 gen;
2359        struct extent_buffer *tmp;
2360        struct btrfs_key first_key;
2361        int ret;
2362        int parent_level;
2363
2364        blocknr = btrfs_node_blockptr(*eb_ret, slot);
2365        gen = btrfs_node_ptr_generation(*eb_ret, slot);
2366        parent_level = btrfs_header_level(*eb_ret);
2367        btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
2368
2369        tmp = find_extent_buffer(fs_info, blocknr);
2370        if (tmp) {
2371                /* first we do an atomic uptodate check */
2372                if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2373                        /*
2374                         * Do extra check for first_key, eb can be stale due to
2375                         * being cached, read from scrub, or have multiple
2376                         * parents (shared tree blocks).
2377                         */
2378                        if (btrfs_verify_level_key(tmp,
2379                                        parent_level - 1, &first_key, gen)) {
2380                                free_extent_buffer(tmp);
2381                                return -EUCLEAN;
2382                        }
2383                        *eb_ret = tmp;
2384                        return 0;
2385                }
2386
2387                /* The pages were up to date, but we failed
2388                 * the generation number check.  Do a full
2389                 * read to get the correct generation.
2390                 * We must do this without dropping locks so
2391                 * we can trust our generation number.
2392                 */
2393                btrfs_set_path_blocking(p);
2394
2395                /* now we're allowed to do a blocking uptodate check */
2396                ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
2397                if (!ret) {
2398                        *eb_ret = tmp;
2399                        return 0;
2400                }
2401                free_extent_buffer(tmp);
2402                btrfs_release_path(p);
2403                return -EIO;
2404        }
2405
2406        /*
2407         * reduce lock contention at high levels
2408         * of the btree by dropping locks before
2409         * we read.  Don't release the lock on the current
2410         * level because we need to walk this node to figure
2411         * out which blocks to read.
2412         */
2413        btrfs_unlock_up_safe(p, level + 1);
2414        btrfs_set_path_blocking(p);
2415
2416        if (p->reada != READA_NONE)
2417                reada_for_search(fs_info, p, level, slot, key->objectid);
2418
2419        ret = -EAGAIN;
2420        tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
2421                              &first_key);
2422        if (!IS_ERR(tmp)) {
2423                /*
2424                 * If the read above didn't mark this buffer up to date,
2425                 * it will never end up being up to date.  Set ret to -EIO now
2426                 * and give up so that our caller doesn't loop forever
2427                 * on our EAGAINs.
2428                 */
2429                if (!extent_buffer_uptodate(tmp))
2430                        ret = -EIO;
2431                free_extent_buffer(tmp);
2432        } else {
2433                ret = PTR_ERR(tmp);
2434        }
2435
2436        btrfs_release_path(p);
2437        return ret;
2438}
2439
2440/*
2441 * helper function for btrfs_search_slot.  This does all of the checks
2442 * for node-level blocks and does any balancing required based on
2443 * the ins_len.
2444 *
2445 * If no extra work was required, zero is returned.  If we had to
2446 * drop the path, -EAGAIN is returned and btrfs_search_slot must
2447 * start over.
2448 */
2449static int
2450setup_nodes_for_search(struct btrfs_trans_handle *trans,
2451                       struct btrfs_root *root, struct btrfs_path *p,
2452                       struct extent_buffer *b, int level, int ins_len,
2453                       int *write_lock_level)
2454{
2455        struct btrfs_fs_info *fs_info = root->fs_info;
2456        int ret;
2457
2458        if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2459            BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2460                int sret;
2461
2462                if (*write_lock_level < level + 1) {
2463                        *write_lock_level = level + 1;
2464                        btrfs_release_path(p);
2465                        goto again;
2466                }
2467
2468                btrfs_set_path_blocking(p);
2469                reada_for_balance(fs_info, p, level);
2470                sret = split_node(trans, root, p, level);
2471
2472                BUG_ON(sret > 0);
2473                if (sret) {
2474                        ret = sret;
2475                        goto done;
2476                }
2477                b = p->nodes[level];
2478        } else if (ins_len < 0 && btrfs_header_nritems(b) <
2479                   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2480                int sret;
2481
2482                if (*write_lock_level < level + 1) {
2483                        *write_lock_level = level + 1;
2484                        btrfs_release_path(p);
2485                        goto again;
2486                }
2487
2488                btrfs_set_path_blocking(p);
2489                reada_for_balance(fs_info, p, level);
2490                sret = balance_level(trans, root, p, level);
2491
2492                if (sret) {
2493                        ret = sret;
2494                        goto done;
2495                }
2496                b = p->nodes[level];
2497                if (!b) {
2498                        btrfs_release_path(p);
2499                        goto again;
2500                }
2501                BUG_ON(btrfs_header_nritems(b) == 1);
2502        }
2503        return 0;
2504
2505again:
2506        ret = -EAGAIN;
2507done:
2508        return ret;
2509}
2510
2511int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2512                u64 iobjectid, u64 ioff, u8 key_type,
2513                struct btrfs_key *found_key)
2514{
2515        int ret;
2516        struct btrfs_key key;
2517        struct extent_buffer *eb;
2518
2519        ASSERT(path);
2520        ASSERT(found_key);
2521
2522        key.type = key_type;
2523        key.objectid = iobjectid;
2524        key.offset = ioff;
2525
2526        ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2527        if (ret < 0)
2528                return ret;
2529
2530        eb = path->nodes[0];
2531        if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2532                ret = btrfs_next_leaf(fs_root, path);
2533                if (ret)
2534                        return ret;
2535                eb = path->nodes[0];
2536        }
2537
2538        btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2539        if (found_key->type != key.type ||
2540                        found_key->objectid != key.objectid)
2541                return 1;
2542
2543        return 0;
2544}
2545
2546static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
2547                                                        struct btrfs_path *p,
2548                                                        int write_lock_level)
2549{
2550        struct btrfs_fs_info *fs_info = root->fs_info;
2551        struct extent_buffer *b;
2552        int root_lock;
2553        int level = 0;
2554
2555        /* We try very hard to do read locks on the root */
2556        root_lock = BTRFS_READ_LOCK;
2557
2558        if (p->search_commit_root) {
2559                /*
2560                 * The commit roots are read only, so we always do read locks,
2561                 * and we must always hold the commit_root_sem when doing
2562                 * searches on them.  The only exception is send, where we don't
2563                 * want to block transaction commits for a long time, so
2564                 * we need to clone the commit root in order to avoid races
2565                 * with transaction commits that create a snapshot of one of
2566                 * the roots used by a send operation.
2567                 */
2568                if (p->need_commit_sem) {
2569                        down_read(&fs_info->commit_root_sem);
2570                        b = btrfs_clone_extent_buffer(root->commit_root);
2571                        up_read(&fs_info->commit_root_sem);
2572                        if (!b)
2573                                return ERR_PTR(-ENOMEM);
2574
2575                } else {
2576                        b = root->commit_root;
2577                        atomic_inc(&b->refs);
2578                }
2579                level = btrfs_header_level(b);
2580                /*
2581                 * Ensure that all callers have set skip_locking when
2582                 * p->search_commit_root = 1.
2583                 */
2584                ASSERT(p->skip_locking == 1);
2585
2586                goto out;
2587        }
2588
2589        if (p->skip_locking) {
2590                b = btrfs_root_node(root);
2591                level = btrfs_header_level(b);
2592                goto out;
2593        }
2594
2595        /*
2596         * If the write lock level is set to the maximum, we can skip trying
2597         * to get the read lock.
2598         */
2599        if (write_lock_level < BTRFS_MAX_LEVEL) {
2600                /*
2601                 * We don't know the level of the root node until we actually
2602                 * have it read locked
2603                 */
2604                b = btrfs_read_lock_root_node(root);
2605                level = btrfs_header_level(b);
2606                if (level > write_lock_level)
2607                        goto out;
2608
2609                /* Whoops, must trade for write lock */
2610                btrfs_tree_read_unlock(b);
2611                free_extent_buffer(b);
2612        }
2613
2614        b = btrfs_lock_root_node(root);
2615        root_lock = BTRFS_WRITE_LOCK;
2616
2617        /* The level might have changed, check again */
2618        level = btrfs_header_level(b);
2619
2620out:
2621        p->nodes[level] = b;
2622        if (!p->skip_locking)
2623                p->locks[level] = root_lock;
2624        /*
2625         * Callers are responsible for dropping b's references.
2626         */
2627        return b;
2628}
2629
2630
2631/*
2632 * btrfs_search_slot - look for a key in a tree and perform necessary
2633 * modifications to preserve tree invariants.
2634 *
2635 * @trans:      Handle of transaction, used when modifying the tree
2636 * @p:          Holds all btree nodes along the search path
2637 * @root:       The root node of the tree
2638 * @key:        The key we are looking for
2639 * @ins_len:    Indicates purpose of search: for inserts it is 1, for
2640 *              deletions it's -1, 0 for plain searches
2641 * @cow:        whether CoW operations should be performed. Must always be 1
2642 *              when modifying the tree.
2643 *
2644 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2645 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2646 *
2647 * If @key is found, 0 is returned and you can find the item in the leaf level
2648 * of the path (level 0)
2649 *
2650 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2651 * points to the slot where it should be inserted
2652 *
2653 * If an error is encountered while searching the tree a negative error number
2654 * is returned
2655 */
2656int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2657                      const struct btrfs_key *key, struct btrfs_path *p,
2658                      int ins_len, int cow)
2659{
2660        struct extent_buffer *b;
2661        int slot;
2662        int ret;
2663        int err;
2664        int level;
2665        int lowest_unlock = 1;
2666        /* everything at write_lock_level or lower must be write locked */
2667        int write_lock_level = 0;
2668        u8 lowest_level = 0;
2669        int min_write_lock_level;
2670        int prev_cmp;
2671
2672        lowest_level = p->lowest_level;
2673        WARN_ON(lowest_level && ins_len > 0);
2674        WARN_ON(p->nodes[0] != NULL);
2675        BUG_ON(!cow && ins_len);
2676
2677        if (ins_len < 0) {
2678                lowest_unlock = 2;
2679
2680                /* when we are removing items, we might have to go up to level
2681                 * two as we update tree pointers.  Make sure we keep write
2682                 * locks for those levels as well.
2683                 */
2684                write_lock_level = 2;
2685        } else if (ins_len > 0) {
2686                /*
2687                 * for inserting items, make sure we have a write lock on
2688                 * level 1 so we can update keys
2689                 */
2690                write_lock_level = 1;
2691        }
2692
2693        if (!cow)
2694                write_lock_level = -1;
2695
2696        if (cow && (p->keep_locks || p->lowest_level))
2697                write_lock_level = BTRFS_MAX_LEVEL;
2698
2699        min_write_lock_level = write_lock_level;
2700
2701again:
2702        prev_cmp = -1;
2703        b = btrfs_search_slot_get_root(root, p, write_lock_level);
2704        if (IS_ERR(b)) {
2705                ret = PTR_ERR(b);
2706                goto done;
2707        }
2708
2709        while (b) {
2710                int dec = 0;
2711
2712                level = btrfs_header_level(b);
2713
2714                if (cow) {
2715                        bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2716
2717                        /*
2718                         * if we don't really need to cow this block
2719                         * then we don't want to set the path blocking,
2720                         * so we test it here
2721                         */
2722                        if (!should_cow_block(trans, root, b)) {
2723                                trans->dirty = true;
2724                                goto cow_done;
2725                        }
2726
2727                        /*
2728                         * must have write locks on this node and the
2729                         * parent
2730                         */
2731                        if (level > write_lock_level ||
2732                            (level + 1 > write_lock_level &&
2733                            level + 1 < BTRFS_MAX_LEVEL &&
2734                            p->nodes[level + 1])) {
2735                                write_lock_level = level + 1;
2736                                btrfs_release_path(p);
2737                                goto again;
2738                        }
2739
2740                        btrfs_set_path_blocking(p);
2741                        if (last_level)
2742                                err = btrfs_cow_block(trans, root, b, NULL, 0,
2743                                                      &b);
2744                        else
2745                                err = btrfs_cow_block(trans, root, b,
2746                                                      p->nodes[level + 1],
2747                                                      p->slots[level + 1], &b);
2748                        if (err) {
2749                                ret = err;
2750                                goto done;
2751                        }
2752                }
2753cow_done:
2754                p->nodes[level] = b;
2755                /*
2756                 * Leave the path with blocking locks to avoid massive
2757                 * lock context switches; this is done on purpose.
2758                 */
2759
2760                /*
2761                 * we have a lock on b and as long as we aren't changing
2762                 * the tree, there is no way for the items in b to change.
2763                 * It is safe to drop the lock on our parent before we
2764                 * go through the expensive btree search on b.
2765                 *
2766                 * If we're inserting or deleting (ins_len != 0), then we might
2767                 * be changing slot zero, which may require changing the parent.
2768                 * So, we can't drop the lock until after we know which slot
2769                 * we're operating on.
2770                 */
2771                if (!ins_len && !p->keep_locks) {
2772                        int u = level + 1;
2773
2774                        if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2775                                btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2776                                p->locks[u] = 0;
2777                        }
2778                }
2779
2780                /*
2781                 * If btrfs_bin_search returns an exact match (prev_cmp == 0)
2782                 * we can safely assume the target key will always be in slot 0
2783                 * on lower levels due to the invariants BTRFS' btree provides,
2784                 * namely that a btrfs_key_ptr entry always points to the
2785                 * lowest key in the child node; thus we can skip searching
2786                 * lower levels.
2787                 */
2788                if (prev_cmp == 0) {
2789                        slot = 0;
2790                        ret = 0;
2791                } else {
2792                        ret = btrfs_bin_search(b, key, &slot);
2793                        prev_cmp = ret;
2794                        if (ret < 0)
2795                                goto done;
2796                }
2797
2798                if (level == 0) {
2799                        p->slots[level] = slot;
2800                        if (ins_len > 0 &&
2801                            btrfs_leaf_free_space(b) < ins_len) {
2802                                if (write_lock_level < 1) {
2803                                        write_lock_level = 1;
2804                                        btrfs_release_path(p);
2805                                        goto again;
2806                                }
2807
2808                                btrfs_set_path_blocking(p);
2809                                err = split_leaf(trans, root, key,
2810                                                 p, ins_len, ret == 0);
2811
2812                                BUG_ON(err > 0);
2813                                if (err) {
2814                                        ret = err;
2815                                        goto done;
2816                                }
2817                        }
2818                        if (!p->search_for_split)
2819                                unlock_up(p, level, lowest_unlock,
2820                                          min_write_lock_level, NULL);
2821                        goto done;
2822                }
2823                if (ret && slot > 0) {
2824                        dec = 1;
2825                        slot--;
2826                }
2827                p->slots[level] = slot;
2828                err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2829                                             &write_lock_level);
2830                if (err == -EAGAIN)
2831                        goto again;
2832                if (err) {
2833                        ret = err;
2834                        goto done;
2835                }
2836                b = p->nodes[level];
2837                slot = p->slots[level];
2838
2839                /*
2840                 * Slot 0 is special, if we change the key we have to update
2841                 * the parent pointer which means we must have a write lock on
2842                 * the parent
2843                 */
2844                if (slot == 0 && ins_len && write_lock_level < level + 1) {
2845                        write_lock_level = level + 1;
2846                        btrfs_release_path(p);
2847                        goto again;
2848                }
2849
2850                unlock_up(p, level, lowest_unlock, min_write_lock_level,
2851                          &write_lock_level);
2852
2853                if (level == lowest_level) {
2854                        if (dec)
2855                                p->slots[level]++;
2856                        goto done;
2857                }
2858
2859                err = read_block_for_search(root, p, &b, level, slot, key);
2860                if (err == -EAGAIN)
2861                        goto again;
2862                if (err) {
2863                        ret = err;
2864                        goto done;
2865                }
2866
2867                if (!p->skip_locking) {
2868                        level = btrfs_header_level(b);
2869                        if (level <= write_lock_level) {
2870                                if (!btrfs_try_tree_write_lock(b)) {
2871                                        btrfs_set_path_blocking(p);
2872                                        btrfs_tree_lock(b);
2873                                }
2874                                p->locks[level] = BTRFS_WRITE_LOCK;
2875                        } else {
2876                                if (!btrfs_tree_read_lock_atomic(b)) {
2877                                        btrfs_set_path_blocking(p);
2878                                        btrfs_tree_read_lock(b);
2879                                }
2880                                p->locks[level] = BTRFS_READ_LOCK;
2881                        }
2882                        p->nodes[level] = b;
2883                }
2884        }
2885        ret = 1;
2886done:
2887        /*
2888         * we don't really know what the caller plans to do with the path
2889         * from here on, so for now just mark it as blocking
2890         */
2891        if (!p->leave_spinning)
2892                btrfs_set_path_blocking(p);
2893        if (ret < 0 && !p->skip_release_on_error)
2894                btrfs_release_path(p);
2895        return ret;
2896}
2897
2898/*
2899 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2900 * current state of the tree together with the operations recorded in the tree
2901 * modification log to search for the key in a previous version of this tree, as
2902 * denoted by the time_seq parameter.
2903 *
2904 * Naturally, there is no support for insert, delete or cow operations.
2905 *
2906 * The resulting path and return value will be set up as if we called
2907 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2908 */
2909int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2910                          struct btrfs_path *p, u64 time_seq)
2911{
2912        struct btrfs_fs_info *fs_info = root->fs_info;
2913        struct extent_buffer *b;
2914        int slot;
2915        int ret;
2916        int err;
2917        int level;
2918        int lowest_unlock = 1;
2919        u8 lowest_level = 0;
2920
2921        lowest_level = p->lowest_level;
2922        WARN_ON(p->nodes[0] != NULL);
2923
2924        if (p->search_commit_root) {
2925                BUG_ON(time_seq);
2926                return btrfs_search_slot(NULL, root, key, p, 0, 0);
2927        }
2928
2929again:
2930        b = get_old_root(root, time_seq);
2931        if (!b) {
2932                ret = -EIO;
2933                goto done;
2934        }
2935        level = btrfs_header_level(b);
2936        p->locks[level] = BTRFS_READ_LOCK;
2937
2938        while (b) {
2939                int dec = 0;
2940
2941                level = btrfs_header_level(b);
2942                p->nodes[level] = b;
2943
2944                /*
2945                 * we have a lock on b and as long as we aren't changing
2946                 * the tree, there is no way for the items in b to change.
2947                 * It is safe to drop the lock on our parent before we
2948                 * go through the expensive btree search on b.
2949                 */
2950                btrfs_unlock_up_safe(p, level + 1);
2951
2952                ret = btrfs_bin_search(b, key, &slot);
2953                if (ret < 0)
2954                        goto done;
2955
2956                if (level == 0) {
2957                        p->slots[level] = slot;
2958                        unlock_up(p, level, lowest_unlock, 0, NULL);
2959                        goto done;
2960                }
2961
2962                if (ret && slot > 0) {
2963                        dec = 1;
2964                        slot--;
2965                }
2966                p->slots[level] = slot;
2967                unlock_up(p, level, lowest_unlock, 0, NULL);
2968
2969                if (level == lowest_level) {
2970                        if (dec)
2971                                p->slots[level]++;
2972                        goto done;
2973                }
2974
2975                err = read_block_for_search(root, p, &b, level, slot, key);
2976                if (err == -EAGAIN)
2977                        goto again;
2978                if (err) {
2979                        ret = err;
2980                        goto done;
2981                }
2982
2983                level = btrfs_header_level(b);
2984                if (!btrfs_tree_read_lock_atomic(b)) {
2985                        btrfs_set_path_blocking(p);
2986                        btrfs_tree_read_lock(b);
2987                }
2988                b = tree_mod_log_rewind(fs_info, p, b, time_seq);
2989                if (!b) {
2990                        ret = -ENOMEM;
2991                        goto done;
2992                }
2993                p->locks[level] = BTRFS_READ_LOCK;
2994                p->nodes[level] = b;
2995        }
2996        ret = 1;
2997done:
2998        if (!p->leave_spinning)
2999                btrfs_set_path_blocking(p);
3000        if (ret < 0)
3001                btrfs_release_path(p);
3002
3003        return ret;
3004}
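/*
 * Illustrative sketch (added annotation, not part of the kernel source):
 * a read-only lookup against an old tree version might look roughly like
 * this, assuming 'time_seq' was obtained via btrfs_get_tree_mod_seq():
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	struct btrfs_key key = { .objectid = objectid,
 *				 .type = BTRFS_INODE_ITEM_KEY,
 *				 .offset = 0 };
 *	int ret;
 *
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_old_slot(root, &key, path, time_seq);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 *	btrfs_free_path(path);
 */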
3005
3006/*
3007 * helper to use instead of btrfs_search_slot if no exact match is needed
3008 * but instead the next or previous item should be returned.
3009 * When find_higher is true, the next higher item is returned, the next lower
3010 * otherwise.
3011 * When return_any and find_higher are both true, and no higher item is found,
3012 * return the next lower instead.
3013 * When return_any is true and find_higher is false, and no lower item is found,
3014 * return the next higher instead.
3015 * It returns 0 if any item is found, 1 if none is found (tree empty), and
3016 * < 0 on error
3017 */
3018int btrfs_search_slot_for_read(struct btrfs_root *root,
3019                               const struct btrfs_key *key,
3020                               struct btrfs_path *p, int find_higher,
3021                               int return_any)
3022{
3023        int ret;
3024        struct extent_buffer *leaf;
3025
3026again:
3027        ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3028        if (ret <= 0)
3029                return ret;
3030        /*
3031         * a return value of 1 means the path is at the position where the
3032         * item should be inserted. Normally this is the next bigger item,
3033         * but in case the previous item is the last in a leaf, path points
3034         * to the first free slot in the previous leaf, i.e. at an invalid
3035         * item.
3036         */
3037        leaf = p->nodes[0];
3038
3039        if (find_higher) {
3040                if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3041                        ret = btrfs_next_leaf(root, p);
3042                        if (ret <= 0)
3043                                return ret;
3044                        if (!return_any)
3045                                return 1;
3046                        /*
3047                         * no higher item found, return the next
3048                         * lower instead
3049                         */
3050                        return_any = 0;
3051                        find_higher = 0;
3052                        btrfs_release_path(p);
3053                        goto again;
3054                }
3055        } else {
3056                if (p->slots[0] == 0) {
3057                        ret = btrfs_prev_leaf(root, p);
3058                        if (ret < 0)
3059                                return ret;
3060                        if (!ret) {
3061                                leaf = p->nodes[0];
3062                                if (p->slots[0] == btrfs_header_nritems(leaf))
3063                                        p->slots[0]--;
3064                                return 0;
3065                        }
3066                        if (!return_any)
3067                                return 1;
3068                        /*
3069                         * no lower item found, return the next
3070                         * higher instead
3071                         */
3072                        return_any = 0;
3073                        find_higher = 1;
3074                        btrfs_release_path(p);
3075                        goto again;
3076                } else {
3077                        --p->slots[0];
3078                }
3079        }
3080        return 0;
3081}
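/*
 * Illustrative sketch (added annotation, not part of the kernel source):
 * to find the first item at or after 'key', falling back to the closest
 * lower item when nothing higher exists, a caller could pass
 * find_higher = 1 and return_any = 1:
 *
 *	ret = btrfs_search_slot_for_read(root, &key, path, 1, 1);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 *				      path->slots[0]);
 */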
3082
3083/*
3084 * adjust the pointers going up the tree, starting at level,
3085 * making sure the right key of each node points to 'key'.
3086 * This is used after shifting pointers to the left, so it stops
3087 * fixing up pointers when a given leaf/node is not in slot 0 of the
3088 * higher levels.
3089 */
3091static void fixup_low_keys(struct btrfs_path *path,
3092                           struct btrfs_disk_key *key, int level)
3093{
3094        int i;
3095        struct extent_buffer *t;
3096        int ret;
3097
3098        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3099                int tslot = path->slots[i];
3100
3101                if (!path->nodes[i])
3102                        break;
3103                t = path->nodes[i];
3104                ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3105                                GFP_ATOMIC);
3106                BUG_ON(ret < 0);
3107                btrfs_set_node_key(t, key, tslot);
3108                btrfs_mark_buffer_dirty(path->nodes[i]);
3109                if (tslot != 0)
3110                        break;
3111        }
3112}
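/*
 * Illustrative example (added annotation): if the leftmost leaf gets a new
 * smallest key, the change ripples up for as long as the affected node
 * itself sits in slot 0 of its parent:
 *
 *	before:  root [10 ...]             after:  root [5 ...]
 *	          |                                  |
 *	         node [10 ...]                      node [5 ...]
 *	          |                                  |
 *	         leaf [10 ...]   new key 5 -->      leaf [5 ...]
 */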
3113
3114/*
3115 * update item key.
3116 *
3117 * This function isn't completely safe. It's the caller's responsibility
3118 * to ensure that the new key won't break the key ordering.
3119 */
3120void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3121                             struct btrfs_path *path,
3122                             const struct btrfs_key *new_key)
3123{
3124        struct btrfs_disk_key disk_key;
3125        struct extent_buffer *eb;
3126        int slot;
3127
3128        eb = path->nodes[0];
3129        slot = path->slots[0];
3130        if (slot > 0) {
3131                btrfs_item_key(eb, &disk_key, slot - 1);
3132                if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
3133                        btrfs_crit(fs_info,
3134                "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
3135                                   slot, btrfs_disk_key_objectid(&disk_key),
3136                                   btrfs_disk_key_type(&disk_key),
3137                                   btrfs_disk_key_offset(&disk_key),
3138                                   new_key->objectid, new_key->type,
3139                                   new_key->offset);
3140                        btrfs_print_leaf(eb);
3141                        BUG();
3142                }
3143        }
3144        if (slot < btrfs_header_nritems(eb) - 1) {
3145                btrfs_item_key(eb, &disk_key, slot + 1);
3146                if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
3147                        btrfs_crit(fs_info,
3148                "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
3149                                   slot, btrfs_disk_key_objectid(&disk_key),
3150                                   btrfs_disk_key_type(&disk_key),
3151                                   btrfs_disk_key_offset(&disk_key),
3152                                   new_key->objectid, new_key->type,
3153                                   new_key->offset);
3154                        btrfs_print_leaf(eb);
3155                        BUG();
3156                }
3157        }
3158
3159        btrfs_cpu_key_to_disk(&disk_key, new_key);
3160        btrfs_set_item_key(eb, &disk_key, slot);
3161        btrfs_mark_buffer_dirty(eb);
3162        if (slot == 0)
3163                fixup_low_keys(path, &disk_key, 1);
3164}
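/*
 * Annotation (not in the original): the two comp_keys() checks above
 * enforce the leaf ordering invariant prev_key < new_key < next_key.
 * E.g. with neighboring keys (258 INODE_ITEM 0) and (260 INODE_ITEM 0),
 * a new key of (259 INODE_ITEM 0) is accepted, while (258 INODE_ITEM 0)
 * would trip the first check and (260 INODE_ITEM 0) the second.
 */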
3165
3166/*
3167 * try to push data from one node into the next node left in the
3168 * tree.
3169 *
3170 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3171 * error, and > 0 if there was no room in the left hand block.
3172 */
3173static int push_node_left(struct btrfs_trans_handle *trans,
3174                          struct extent_buffer *dst,
3175                          struct extent_buffer *src, int empty)
3176{
3177        struct btrfs_fs_info *fs_info = trans->fs_info;
3178        int push_items = 0;
3179        int src_nritems;
3180        int dst_nritems;
3181        int ret = 0;
3182
3183        src_nritems = btrfs_header_nritems(src);
3184        dst_nritems = btrfs_header_nritems(dst);
3185        push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3186        WARN_ON(btrfs_header_generation(src) != trans->transid);
3187        WARN_ON(btrfs_header_generation(dst) != trans->transid);
3188
3189        if (!empty && src_nritems <= 8)
3190                return 1;
3191
3192        if (push_items <= 0)
3193                return 1;
3194
3195        if (empty) {
3196                push_items = min(src_nritems, push_items);
3197                if (push_items < src_nritems) {
3198                        /* leave at least 8 pointers in the node if
3199                         * we aren't going to empty it
3200                         */
3201                        if (src_nritems - push_items < 8) {
3202                                if (push_items <= 8)
3203                                        return 1;
3204                                push_items -= 8;
3205                        }
3206                }
3207        } else
3208                push_items = min(src_nritems - 8, push_items);
3209
3210        ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
3211        if (ret) {
3212                btrfs_abort_transaction(trans, ret);
3213                return ret;
3214        }
3215        copy_extent_buffer(dst, src,
3216                           btrfs_node_key_ptr_offset(dst_nritems),
3217                           btrfs_node_key_ptr_offset(0),
3218                           push_items * sizeof(struct btrfs_key_ptr));
3219
3220        if (push_items < src_nritems) {
3221                /*
3222                 * Don't call tree_mod_log_insert_move here, key removal was
3223                 * already fully logged by tree_mod_log_eb_copy above.
3224                 */
3225                memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3226                                      btrfs_node_key_ptr_offset(push_items),
3227                                      (src_nritems - push_items) *
3228                                      sizeof(struct btrfs_key_ptr));
3229        }
3230        btrfs_set_header_nritems(src, src_nritems - push_items);
3231        btrfs_set_header_nritems(dst, dst_nritems + push_items);
3232        btrfs_mark_buffer_dirty(src);
3233        btrfs_mark_buffer_dirty(dst);
3234
3235        return ret;
3236}
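/*
 * Worked example (illustrative, assuming a hypothetical node capacity of
 * 121 pointers): with dst_nritems = 100 and src_nritems = 30, push_items
 * starts at 121 - 100 = 21; in the non-empty case it is capped at
 * min(30 - 8, 21) = 21, so src keeps 9 pointers, honoring the
 * leave-at-least-8 rule above.
 */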
3237
3238/*
3239 * try to push data from one node into the next node right in the
3240 * tree.
3241 *
3242 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3243 * error, and > 0 if there was no room in the right hand block.
3244 *
3245 * this will only push up to 1/2 the contents of the left node over
3246 */
3247static int balance_node_right(struct btrfs_trans_handle *trans,
3248                              struct extent_buffer *dst,
3249                              struct extent_buffer *src)
3250{
3251        struct btrfs_fs_info *fs_info = trans->fs_info;
3252        int push_items = 0;
3253        int max_push;
3254        int src_nritems;
3255        int dst_nritems;
3256        int ret = 0;
3257
3258        WARN_ON(btrfs_header_generation(src) != trans->transid);
3259        WARN_ON(btrfs_header_generation(dst) != trans->transid);
3260
3261        src_nritems = btrfs_header_nritems(src);
3262        dst_nritems = btrfs_header_nritems(dst);
3263        push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3264        if (push_items <= 0)
3265                return 1;
3266
3267        if (src_nritems < 4)
3268                return 1;
3269
3270        max_push = src_nritems / 2 + 1;
3271        /* don't try to empty the node */
3272        if (max_push >= src_nritems)
3273                return 1;
3274
3275        if (max_push < push_items)
3276                push_items = max_push;
3277
3278        ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
3279        BUG_ON(ret < 0);
3280        memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3281                                      btrfs_node_key_ptr_offset(0),
3282                                      (dst_nritems) *
3283                                      sizeof(struct btrfs_key_ptr));
3284
3285        ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
3286                                   push_items);
3287        if (ret) {
3288                btrfs_abort_transaction(trans, ret);
3289                return ret;
3290        }
3291        copy_extent_buffer(dst, src,
3292                           btrfs_node_key_ptr_offset(0),
3293                           btrfs_node_key_ptr_offset(src_nritems - push_items),
3294                           push_items * sizeof(struct btrfs_key_ptr));
3295
3296        btrfs_set_header_nritems(src, src_nritems - push_items);
3297        btrfs_set_header_nritems(dst, dst_nritems + push_items);
3298
3299        btrfs_mark_buffer_dirty(src);
3300        btrfs_mark_buffer_dirty(dst);
3301
3302        return ret;
3303}
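/*
 * Worked example (illustrative): with src_nritems = 10 and plenty of room
 * in dst, max_push is 10 / 2 + 1 = 6, so at most 6 of the 10 pointers move
 * right and src keeps at least 4, honoring the "don't empty the node" rule.
 */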
3304
3305/*
3306 * helper function to insert a new root level in the tree.
3307 * A new node is allocated, and a single item is inserted to
3308 * point to the existing root
3309 *
3310 * returns zero on success or < 0 on failure.
3311 */
3312static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3313                           struct btrfs_root *root,
3314                           struct btrfs_path *path, int level)
3315{
3316        struct btrfs_fs_info *fs_info = root->fs_info;
3317        u64 lower_gen;
3318        struct extent_buffer *lower;
3319        struct extent_buffer *c;
3320        struct extent_buffer *old;
3321        struct btrfs_disk_key lower_key;
3322        int ret;
3323
3324        BUG_ON(path->nodes[level]);
3325        BUG_ON(path->nodes[level-1] != root->node);
3326
3327        lower = path->nodes[level-1];
3328        if (level == 1)
3329                btrfs_item_key(lower, &lower_key, 0);
3330        else
3331                btrfs_node_key(lower, &lower_key, 0);
3332
3333        c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
3334                                         root->node->start, 0);
3335        if (IS_ERR(c))
3336                return PTR_ERR(c);
3337
3338        root_add_used(root, fs_info->nodesize);
3339
3340        btrfs_set_header_nritems(c, 1);
3341        btrfs_set_node_key(c, &lower_key, 0);
3342        btrfs_set_node_blockptr(c, 0, lower->start);
3343        lower_gen = btrfs_header_generation(lower);
3344        WARN_ON(lower_gen != trans->transid);
3345
3346        btrfs_set_node_ptr_generation(c, 0, lower_gen);
3347
3348        btrfs_mark_buffer_dirty(c);
3349
3350        old = root->node;
3351        ret = tree_mod_log_insert_root(root->node, c, 0);
3352        BUG_ON(ret < 0);
3353        rcu_assign_pointer(root->node, c);
3354
3355        /* the super has an extra ref to root->node */
3356        free_extent_buffer(old);
3357
3358        add_root_to_dirty_list(root);
3359        atomic_inc(&c->refs);
3360        path->nodes[level] = c;
3361        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3362        path->slots[level] = 0;
3363        return 0;
3364}
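/*
 * Illustrative before/after (added annotation) for growing the tree by one
 * level, e.g. when a full level 0 root must be split:
 *
 *	before:  root = leaf L          after:  root = node C (level 1)
 *	                                          |
 *	                                        key_ptr[0] --> leaf L
 *
 * C starts out with a single pointer; the split of L that usually follows
 * adds the second pointer through insert_ptr().
 */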
3365
3366/*
3367 * worker function to insert a single pointer in a node.
3368 * the node should have enough room for the pointer already
3369 *
3370 * slot and level indicate where you want the key to go, and
3371 * blocknr is the block the key points to.
3372 */
3373static void insert_ptr(struct btrfs_trans_handle *trans,
3374                       struct btrfs_path *path,
3375                       struct btrfs_disk_key *key, u64 bytenr,
3376                       int slot, int level)
3377{
3378        struct extent_buffer *lower;
3379        int nritems;
3380        int ret;
3381
3382        BUG_ON(!path->nodes[level]);
3383        btrfs_assert_tree_locked(path->nodes[level]);
3384        lower = path->nodes[level];
3385        nritems = btrfs_header_nritems(lower);
3386        BUG_ON(slot > nritems);
3387        BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
3388        if (slot != nritems) {
3389                if (level) {
3390                        ret = tree_mod_log_insert_move(lower, slot + 1, slot,
3391                                        nritems - slot);
3392                        BUG_ON(ret < 0);
3393                }
3394                memmove_extent_buffer(lower,
3395                              btrfs_node_key_ptr_offset(slot + 1),
3396                              btrfs_node_key_ptr_offset(slot),
3397                              (nritems - slot) * sizeof(struct btrfs_key_ptr));
3398        }
3399        if (level) {
3400                ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
3401                                GFP_NOFS);
3402                BUG_ON(ret < 0);
3403        }
3404        btrfs_set_node_key(lower, key, slot);
3405        btrfs_set_node_blockptr(lower, slot, bytenr);
3406        WARN_ON(trans->transid == 0);
3407        btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3408        btrfs_set_header_nritems(lower, nritems + 1);
3409        btrfs_mark_buffer_dirty(lower);
3410}
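/*
 * Illustrative example (added annotation): inserting key B at slot 1 of a
 * node holding [A C D] first shifts the tail right by one btrfs_key_ptr,
 * then writes the new key and block pointer:
 *
 *	[A C D .] --memmove--> [A . C D] --set key/blockptr--> [A B C D]
 */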
3411
3412/*
3413 * split the node at the specified level in path in two.
3414 * The path is corrected to point to the appropriate node after the split
3415 *
3416 * Before splitting this tries to make some room in the node by pushing
3417 * left and right, if either one works, it returns right away.
3418 *
3419 * returns 0 on success and < 0 on failure
3420 */
3421static noinline int split_node(struct btrfs_trans_handle *trans,
3422                               struct btrfs_root *root,
3423                               struct btrfs_path *path, int level)
3424{
3425        struct btrfs_fs_info *fs_info = root->fs_info;
3426        struct extent_buffer *c;
3427        struct extent_buffer *split;
3428        struct btrfs_disk_key disk_key;
3429        int mid;
3430        int ret;
3431        u32 c_nritems;
3432
3433        c = path->nodes[level];
3434        WARN_ON(btrfs_header_generation(c) != trans->transid);
3435        if (c == root->node) {
3436                /*
3437                 * trying to split the root, let's make a new one
3438                 *
3439                 * tree mod log: We don't log the removal of the old root in
3440                 * insert_new_root, because that root buffer will be kept as a
3441                 * normal node. We are going to log removal of half of the
3442                 * elements below with tree_mod_log_eb_copy. We're holding a
3443                 * tree lock on the buffer, which is why we cannot race with
3444                 * other tree_mod_log users.
3445                 */
3446                ret = insert_new_root(trans, root, path, level + 1);
3447                if (ret)
3448                        return ret;
3449        } else {
3450                ret = push_nodes_for_insert(trans, root, path, level);
3451                c = path->nodes[level];
3452                if (!ret && btrfs_header_nritems(c) <
3453                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3454                        return 0;
3455                if (ret < 0)
3456                        return ret;
3457        }
3458
3459        c_nritems = btrfs_header_nritems(c);
3460        mid = (c_nritems + 1) / 2;
3461        btrfs_node_key(c, &disk_key, mid);
3462
3463        split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
3464                                             c->start, 0);
3465        if (IS_ERR(split))
3466                return PTR_ERR(split);
3467
3468        root_add_used(root, fs_info->nodesize);
3469        ASSERT(btrfs_header_level(c) == level);
3470
3471        ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
3472        if (ret) {
3473                btrfs_abort_transaction(trans, ret);
3474                return ret;
3475        }
3476        copy_extent_buffer(split, c,
3477                           btrfs_node_key_ptr_offset(0),
3478                           btrfs_node_key_ptr_offset(mid),
3479                           (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3480        btrfs_set_header_nritems(split, c_nritems - mid);
3481        btrfs_set_header_nritems(c, mid);
3482        ret = 0;
3483
3484        btrfs_mark_buffer_dirty(c);
3485        btrfs_mark_buffer_dirty(split);
3486
3487        insert_ptr(trans, path, &disk_key, split->start,
3488                   path->slots[level + 1] + 1, level + 1);
3489
3490        if (path->slots[level] >= mid) {
3491                path->slots[level] -= mid;
3492                btrfs_tree_unlock(c);
3493                free_extent_buffer(c);
3494                path->nodes[level] = split;
3495                path->slots[level + 1] += 1;
3496        } else {
3497                btrfs_tree_unlock(split);
3498                free_extent_buffer(split);
3499        }
3500        return ret;
3501}
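/*
 * Worked example (illustrative): splitting a node with c_nritems = 101
 * gives mid = (101 + 1) / 2 = 51, so slots 0-50 stay in 'c' and slots
 * 51-100 move to 'split'; a path slot of 60 becomes slot 60 - 51 = 9 in
 * 'split' and the parent slot advances by one.
 */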
3502
3503/*
3504 * how many bytes are required to store the items in a leaf.  start
3505 * and nr indicate which items in the leaf to check.  This totals up the
3506 * space used both by the item structs and the item data
3507 */
3508static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3509{
3510        struct btrfs_item *start_item;
3511        struct btrfs_item *end_item;
3512        int data_len;
3513        int nritems = btrfs_header_nritems(l);
3514        int end = min(nritems, start + nr) - 1;
3515
3516        if (!nr)
3517                return 0;
3518        start_item = btrfs_item_nr(start);
3519        end_item = btrfs_item_nr(end);
3520        data_len = btrfs_item_offset(l, start_item) +
3521                   btrfs_item_size(l, start_item);
3522        data_len = data_len - btrfs_item_offset(l, end_item);
3523        data_len += sizeof(struct btrfs_item) * nr;
3524        WARN_ON(data_len < 0);
3525        return data_len;
3526}
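/*
 * Worked example (illustrative): if the data of item 'start' ends at
 * offset 4000 (offset 3600 + size 400) and item 'end' starts at offset
 * 3000, then data_len is 4000 - 3000 = 1000 bytes of item data plus
 * nr * sizeof(struct btrfs_item) bytes of item headers.
 */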
3527
3528/*
3529 * The space between the end of the leaf items and
3530 * the start of the leaf data.  IOW, how much room
3531 * the leaf has left for both items and data
3532 */
3533noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
3534{
3535        struct btrfs_fs_info *fs_info = leaf->fs_info;
3536        int nritems = btrfs_header_nritems(leaf);
3537        int ret;
3538
3539        ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3540        if (ret < 0) {
3541                btrfs_crit(fs_info,
3542                           "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3543                           ret,
3544                           (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3545                           leaf_space_used(leaf, 0, nritems), nritems);
3546        }
3547        return ret;
3548}
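/*
 * Illustrative numbers (added annotation): with a 16K nodesize,
 * BTRFS_LEAF_DATA_SIZE is the nodesize minus the header, a bit under 16K;
 * a leaf whose items and data consume 10000 bytes would report roughly 6K
 * of free space here.
 */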
3549
3550/*
3551 * min slot controls the lowest index we're willing to push to the
3552 * right.  We'll push up to and including min_slot, but no lower
3553 */
3554static noinline int __push_leaf_right(struct btrfs_path *path,
3555                                      int data_size, int empty,
3556                                      struct extent_buffer *right,
3557                                      int free_space, u32 left_nritems,
3558                                      u32 min_slot)
3559{
3560        struct btrfs_fs_info *fs_info = right->fs_info;
3561        struct extent_buffer *left = path->nodes[0];
3562        struct extent_buffer *upper = path->nodes[1];
3563        struct btrfs_map_token token;
3564        struct btrfs_disk_key disk_key;
3565        int slot;
3566        u32 i;
3567        int push_space = 0;
3568        int push_items = 0;
3569        struct btrfs_item *item;
3570        u32 nr;
3571        u32 right_nritems;
3572        u32 data_end;
3573        u32 this_item_size;
3574
3575        if (empty)
3576                nr = 0;
3577        else
3578                nr = max_t(u32, 1, min_slot);
3579
3580        if (path->slots[0] >= left_nritems)
3581                push_space += data_size;
3582
3583        slot = path->slots[1];
3584        i = left_nritems - 1;
3585        while (i >= nr) {
3586                item = btrfs_item_nr(i);
3587
3588                if (!empty && push_items > 0) {
3589                        if (path->slots[0] > i)
3590                                break;
3591                        if (path->slots[0] == i) {
3592                                int space = btrfs_leaf_free_space(left);
3593
3594                                if (space + push_space * 2 > free_space)
3595                                        break;
3596                        }
3597                }
3598
3599                if (path->slots[0] == i)
3600                        push_space += data_size;
3601
3602                this_item_size = btrfs_item_size(left, item);
3603                if (this_item_size + sizeof(*item) + push_space > free_space)
3604                        break;
3605
3606                push_items++;
3607                push_space += this_item_size + sizeof(*item);
3608                if (i == 0)
3609                        break;
3610                i--;
3611        }
3612
3613        if (push_items == 0)
3614                goto out_unlock;
3615
3616        WARN_ON(!empty && push_items == left_nritems);
3617
3618        /* push left to right */
3619        right_nritems = btrfs_header_nritems(right);
3620
3621        push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3622        push_space -= leaf_data_end(left);
3623
3624        /* make room in the right data area */
3625        data_end = leaf_data_end(right);
3626        memmove_extent_buffer(right,
3627                              BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
3628                              BTRFS_LEAF_DATA_OFFSET + data_end,
3629                              BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3630
3631        /* copy from the left data area */
3632        copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
3633                     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3634                     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
3635                     push_space);
3636
3637        memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3638                              btrfs_item_nr_offset(0),
3639                              right_nritems * sizeof(struct btrfs_item));
3640
3641        /* copy the items from left to right */
3642        copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3643                   btrfs_item_nr_offset(left_nritems - push_items),
3644                   push_items * sizeof(struct btrfs_item));
3645
3646        /* update the item pointers */
3647        btrfs_init_map_token(&token, right);
3648        right_nritems += push_items;
3649        btrfs_set_header_nritems(right, right_nritems);
3650        push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3651        for (i = 0; i < right_nritems; i++) {
3652                item = btrfs_item_nr(i);
3653                push_space -= btrfs_token_item_size(&token, item);
3654                btrfs_set_token_item_offset(&token, item, push_space);
3655        }
3656
3657        left_nritems -= push_items;
3658        btrfs_set_header_nritems(left, left_nritems);
3659
3660        if (left_nritems)
3661                btrfs_mark_buffer_dirty(left);
3662        else
3663                btrfs_clean_tree_block(left);
3664
3665        btrfs_mark_buffer_dirty(right);
3666
3667        btrfs_item_key(right, &disk_key, 0);
3668        btrfs_set_node_key(upper, &disk_key, slot + 1);
3669        btrfs_mark_buffer_dirty(upper);
3670
3671        /* then fixup the leaf pointer in the path */
3672        if (path->slots[0] >= left_nritems) {
3673                path->slots[0] -= left_nritems;
3674                if (btrfs_header_nritems(path->nodes[0]) == 0)
3675                        btrfs_clean_tree_block(path->nodes[0]);
3676                btrfs_tree_unlock(path->nodes[0]);
3677                free_extent_buffer(path->nodes[0]);
3678                path->nodes[0] = right;
3679                path->slots[1] += 1;
3680        } else {
3681                btrfs_tree_unlock(right);
3682                free_extent_buffer(right);
3683        }
3684        return 0;
3685
3686out_unlock:
3687        btrfs_tree_unlock(right);
3688        free_extent_buffer(right);
3689        return 1;
3690}
3691
3692/*
3693 * push some data in the path leaf to the right, trying to free up at
3694 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3695 *
3696 * returns 1 if the push failed because the other node didn't have enough
3697 * room, 0 if everything worked out and < 0 if there were major errors.
3698 *
3699 * this will push starting from min_slot to the end of the leaf.  It won't
3700 * push any slot lower than min_slot
3701 */
3702static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3703                           *root, struct btrfs_path *path,
3704                           int min_data_size, int data_size,
3705                           int empty, u32 min_slot)
3706{
3707        struct extent_buffer *left = path->nodes[0];
3708        struct extent_buffer *right;
3709        struct extent_buffer *upper;
3710        int slot;
3711        int free_space;
3712        u32 left_nritems;
3713        int ret;
3714
3715        if (!path->nodes[1])
3716                return 1;
3717
3718        slot = path->slots[1];
3719        upper = path->nodes[1];
3720        if (slot >= btrfs_header_nritems(upper) - 1)
3721                return 1;
3722
3723        btrfs_assert_tree_locked(path->nodes[1]);
3724
3725        right = btrfs_read_node_slot(upper, slot + 1);
3726        /*
3727         * If slot + 1 is not valid or we failed to read the right
3728         * node, it's no big deal, just return.
3729         */
3730        if (IS_ERR(right))
3731                return 1;
3732
3733        btrfs_tree_lock(right);
3734        btrfs_set_lock_blocking_write(right);
3735
3736        free_space = btrfs_leaf_free_space(right);
3737        if (free_space < data_size)
3738                goto out_unlock;
3739
3740        /* cow and double check */
3741        ret = btrfs_cow_block(trans, root, right, upper,
3742                              slot + 1, &right);
3743        if (ret)
3744                goto out_unlock;
3745
3746        free_space = btrfs_leaf_free_space(right);
3747        if (free_space < data_size)
3748                goto out_unlock;
3749
3750        left_nritems = btrfs_header_nritems(left);
3751        if (left_nritems == 0)
3752                goto out_unlock;
3753
3754        if (path->slots[0] == left_nritems && !empty) {
3755                /* Key greater than all keys in the leaf, right neighbor has
3756                 * enough room for it and we're not emptying our leaf to delete
3757                 * it, therefore use right neighbor to insert the new item and
3758                 * no need to touch/dirty our left leaf. */
3759                btrfs_tree_unlock(left);
3760                free_extent_buffer(left);
3761                path->nodes[0] = right;
3762                path->slots[0] = 0;
3763                path->slots[1]++;
3764                return 0;
3765        }
3766
3767        return __push_leaf_right(path, min_data_size, empty,
3768                                right, free_space, left_nritems, min_slot);
3769out_unlock:
3770        btrfs_tree_unlock(right);
3771        free_extent_buffer(right);
3772        return 1;
3773}
3774
3775/*
3776 * push some data in the path leaf to the left, trying to free up at
3777 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3778 *
3779 * max_slot can put a limit on how far into the leaf we'll push items.  The
3780 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3781 * items
3782 */
3783static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
3784                                     int empty, struct extent_buffer *left,
3785                                     int free_space, u32 right_nritems,
3786                                     u32 max_slot)
3787{
3788        struct btrfs_fs_info *fs_info = left->fs_info;
3789        struct btrfs_disk_key disk_key;
3790        struct extent_buffer *right = path->nodes[0];
3791        int i;
3792        int push_space = 0;
3793        int push_items = 0;
3794        struct btrfs_item *item;
3795        u32 old_left_nritems;
3796        u32 nr;
3797        int ret = 0;
3798        u32 this_item_size;
3799        u32 old_left_item_size;
3800        struct btrfs_map_token token;
3801
3802        if (empty)
3803                nr = min(right_nritems, max_slot);
3804        else
3805                nr = min(right_nritems - 1, max_slot);
3806
3807        for (i = 0; i < nr; i++) {
3808                item = btrfs_item_nr(i);
3809
3810                if (!empty && push_items > 0) {
3811                        if (path->slots[0] < i)
3812                                break;
3813                        if (path->slots[0] == i) {
3814                                int space = btrfs_leaf_free_space(right);
3815
3816                                if (space + push_space * 2 > free_space)
3817                                        break;
3818                        }
3819                }
3820
3821                if (path->slots[0] == i)
3822                        push_space += data_size;
3823
3824                this_item_size = btrfs_item_size(right, item);
3825                if (this_item_size + sizeof(*item) + push_space > free_space)
3826                        break;
3827
3828                push_items++;
3829                push_space += this_item_size + sizeof(*item);
3830        }
3831
3832        if (push_items == 0) {
3833                ret = 1;
3834                goto out;
3835        }
3836        WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3837
3838        /* push data from right to left */
3839        copy_extent_buffer(left, right,
3840                           btrfs_item_nr_offset(btrfs_header_nritems(left)),
3841                           btrfs_item_nr_offset(0),
3842                           push_items * sizeof(struct btrfs_item));
3843
3844        push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3845                     btrfs_item_offset_nr(right, push_items - 1);
3846
3847        copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
3848                     leaf_data_end(left) - push_space,
3849                     BTRFS_LEAF_DATA_OFFSET +
3850                     btrfs_item_offset_nr(right, push_items - 1),
3851                     push_space);
3852        old_left_nritems = btrfs_header_nritems(left);
3853        BUG_ON(old_left_nritems <= 0);
3854
3855        btrfs_init_map_token(&token, left);
3856        old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3857        for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3858                u32 ioff;
3859
3860                item = btrfs_item_nr(i);
3861
3862                ioff = btrfs_token_item_offset(&token, item);
3863                btrfs_set_token_item_offset(&token, item,
3864                      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
3865        }
3866        btrfs_set_header_nritems(left, old_left_nritems + push_items);
3867
3868        /* fixup right node */
3869        if (push_items > right_nritems)
3870                WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3871                       right_nritems);
3872
3873        if (push_items < right_nritems) {
3874                push_space = btrfs_item_offset_nr(right, push_items - 1) -
3875                                                  leaf_data_end(right);
3876                memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
3877                                      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3878                                      BTRFS_LEAF_DATA_OFFSET +
3879                                      leaf_data_end(right), push_space);
3880
3881                memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3882                              btrfs_item_nr_offset(push_items),
3883                             (btrfs_header_nritems(right) - push_items) *
3884                             sizeof(struct btrfs_item));
3885        }
3886
3887        btrfs_init_map_token(&token, right);
3888        right_nritems -= push_items;
3889        btrfs_set_header_nritems(right, right_nritems);
3890        push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3891        for (i = 0; i < right_nritems; i++) {
3892                item = btrfs_item_nr(i);
3893
3894                push_space = push_space - btrfs_token_item_size(&token, item);
3895                btrfs_set_token_item_offset(&token, item, push_space);
3896        }
3897
3898        btrfs_mark_buffer_dirty(left);
3899        if (right_nritems)
3900                btrfs_mark_buffer_dirty(right);
3901        else
3902                btrfs_clean_tree_block(right);
3903
3904        btrfs_item_key(right, &disk_key, 0);
3905        fixup_low_keys(path, &disk_key, 1);
3906
3907        /* then fixup the leaf pointer in the path */
3908        if (path->slots[0] < push_items) {
3909                path->slots[0] += old_left_nritems;
3910                btrfs_tree_unlock(path->nodes[0]);
3911                free_extent_buffer(path->nodes[0]);
3912                path->nodes[0] = left;
3913                path->slots[1] -= 1;
3914        } else {
3915                btrfs_tree_unlock(left);
3916                free_extent_buffer(left);
3917                path->slots[0] -= push_items;
3918        }
3919        BUG_ON(path->slots[0] < 0);
3920        return ret;
3921out:
3922        btrfs_tree_unlock(left);
3923        free_extent_buffer(left);
3924        return ret;
3925}
3926
3927/*
3928 * push some data in the path leaf to the left, trying to free up at
3929 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3930 *
3931 * max_slot can put a limit on how far into the leaf we'll push items.  The
3932 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3933 * items
3934 */
3935static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3936                          *root, struct btrfs_path *path, int min_data_size,
3937                          int data_size, int empty, u32 max_slot)
3938{
3939        struct extent_buffer *right = path->nodes[0];
3940        struct extent_buffer *left;
3941        int slot;
3942        int free_space;
3943        u32 right_nritems;
3944        int ret = 0;
3945
3946        slot = path->slots[1];
3947        if (slot == 0)
3948                return 1;
3949        if (!path->nodes[1])
3950                return 1;
3951
3952        right_nritems = btrfs_header_nritems(right);
3953        if (right_nritems == 0)
3954                return 1;
3955
3956        btrfs_assert_tree_locked(path->nodes[1]);
3957
3958        left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3959        /*
3960         * If slot - 1 is not valid or we failed to read the left
3961         * node, it's no big deal, just return.
3962         */
3963        if (IS_ERR(left))
3964                return 1;
3965
3966        btrfs_tree_lock(left);
3967        btrfs_set_lock_blocking_write(left);
3968
3969        free_space = btrfs_leaf_free_space(left);
3970        if (free_space < data_size) {
3971                ret = 1;
3972                goto out;
3973        }
3974
3975        /* cow and double check */
3976        ret = btrfs_cow_block(trans, root, left,
3977                              path->nodes[1], slot - 1, &left);
3978        if (ret) {
3979                /* we hit -ENOSPC, but it isn't fatal here */
3980                if (ret == -ENOSPC)
3981                        ret = 1;
3982                goto out;
3983        }
3984
3985        free_space = btrfs_leaf_free_space(left);
3986        if (free_space < data_size) {
3987                ret = 1;
3988                goto out;
3989        }
3990
3991        return __push_leaf_left(path, min_data_size,
3992                               empty, left, free_space, right_nritems,
3993                               max_slot);
3994out:
3995        btrfs_tree_unlock(left);
3996        free_extent_buffer(left);
3997        return ret;
3998}
3999
4000/*
4001 * Copy the items from slot 'mid' onwards of leaf 'l' into the new leaf
4002 * 'right', and fix up the path; this is the second half of a leaf split.
4003 */
4004static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4005                                    struct btrfs_path *path,
4006                                    struct extent_buffer *l,
4007                                    struct extent_buffer *right,
4008                                    int slot, int mid, int nritems)
4009{
4010        struct btrfs_fs_info *fs_info = trans->fs_info;
4011        int data_copy_size;
4012        int rt_data_off;
4013        int i;
4014        struct btrfs_disk_key disk_key;
4015        struct btrfs_map_token token;
4016
4017        nritems = nritems - mid;
4018        btrfs_set_header_nritems(right, nritems);
4019        data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
4020
4021        copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4022                           btrfs_item_nr_offset(mid),
4023                           nritems * sizeof(struct btrfs_item));
4024
4025        copy_extent_buffer(right, l,
4026                     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
4027                     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
4028                     leaf_data_end(l), data_copy_size);
4029
4030        rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
4031
4032        btrfs_init_map_token(&token, right);
4033        for (i = 0; i < nritems; i++) {
4034                struct btrfs_item *item = btrfs_item_nr(i);
4035                u32 ioff;
4036
4037                ioff = btrfs_token_item_offset(&token, item);
4038                btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
4039        }
4040
4041        btrfs_set_header_nritems(l, mid);
4042        btrfs_item_key(right, &disk_key, 0);
4043        insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
4044
4045        btrfs_mark_buffer_dirty(right);
4046        btrfs_mark_buffer_dirty(l);
4047        BUG_ON(path->slots[0] != slot);
4048
4049        if (mid <= slot) {
4050                btrfs_tree_unlock(path->nodes[0]);
4051                free_extent_buffer(path->nodes[0]);
4052                path->nodes[0] = right;
4053                path->slots[0] -= mid;
4054                path->slots[1] += 1;
4055        } else {
4056                btrfs_tree_unlock(right);
4057                free_extent_buffer(right);
4058        }
4059
4060        BUG_ON(path->slots[0] < 0);
4061}
4062
4063/*
4064 * double splits happen when we need to insert a big item in the middle
4065 * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4066 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4067 *          A                 B                 C
4068 *
4069 * We avoid this by trying to push the items on either side of our target
4070 * into the adjacent leaves.  If all goes well we can avoid the double split
4071 * completely.
4072 */
4073static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4074                                          struct btrfs_root *root,
4075                                          struct btrfs_path *path,
4076                                          int data_size)
4077{
4078        int ret;
4079        int progress = 0;
4080        int slot;
4081        u32 nritems;
4082        int space_needed = data_size;
4083
4084        slot = path->slots[0];
4085        if (slot < btrfs_header_nritems(path->nodes[0]))
4086                space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4087
4088        /*
4089         * try to push all the items after our slot into the
4090         * right leaf
4091         */
4092        ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4093        if (ret < 0)
4094                return ret;
4095
4096        if (ret == 0)
4097                progress++;
4098
4099        nritems = btrfs_header_nritems(path->nodes[0]);
4100        /*
4101         * our goal is to get our slot at the start or end of a leaf.  If
4102         * we've done so we're done
4103         */
4104        if (path->slots[0] == 0 || path->slots[0] == nritems)
4105                return 0;
4106
4107        if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4108                return 0;
4109
4110        /* try to push all the items before our slot into the left leaf */
4111        slot = path->slots[0];
4112        space_needed = data_size;
4113        if (slot > 0)
4114                space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4115        ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4116        if (ret < 0)
4117                return ret;
4118
4119        if (ret == 0)
4120                progress++;
4121
4122        if (progress)
4123                return 0;
4124        return 1;
4125}
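/*
 * Annotation (not in the original): 'progress' records whether either push
 * moved anything; split_leaf ignores this function's return value and
 * simply rechecks the leaf's free space after it runs.
 */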
4126
4127/*
4128 * split the path's leaf in two, making sure there is at least data_size
4129 * available for the resulting leaf level of the path.
4130 *
4131 * returns 0 if all went well and < 0 on failure.
4132 */
4133static noinline int split_leaf(struct btrfs_trans_handle *trans,
4134                               struct btrfs_root *root,
4135                               const struct btrfs_key *ins_key,
4136                               struct btrfs_path *path, int data_size,
4137                               int extend)
4138{
4139        struct btrfs_disk_key disk_key;
4140        struct extent_buffer *l;
4141        u32 nritems;
4142        int mid;
4143        int slot;
4144        struct extent_buffer *right;
4145        struct btrfs_fs_info *fs_info = root->fs_info;
4146        int ret = 0;
4147        int wret;
4148        int split;
4149        int num_doubles = 0;
4150        int tried_avoid_double = 0;
4151
4152        l = path->nodes[0];
4153        slot = path->slots[0];
4154        if (extend && data_size + btrfs_item_size_nr(l, slot) +
4155            sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
4156                return -EOVERFLOW;
4157
4158        /* first try to make some room by pushing left and right */
4159        if (data_size && path->nodes[1]) {
4160                int space_needed = data_size;
4161
4162                if (slot < btrfs_header_nritems(l))
4163                        space_needed -= btrfs_leaf_free_space(l);
4164
4165                wret = push_leaf_right(trans, root, path, space_needed,
4166                                       space_needed, 0, 0);
4167                if (wret < 0)
4168                        return wret;
4169                if (wret) {
4170                        space_needed = data_size;
4171                        if (slot > 0)
4172                                space_needed -= btrfs_leaf_free_space(l);
4173                        wret = push_leaf_left(trans, root, path, space_needed,
4174                                              space_needed, 0, (u32)-1);
4175                        if (wret < 0)
4176                                return wret;
4177                }
4178                l = path->nodes[0];
4179
4180                /* did the pushes work? */
4181                if (btrfs_leaf_free_space(l) >= data_size)
4182                        return 0;
4183        }
4184
4185        if (!path->nodes[1]) {
4186                ret = insert_new_root(trans, root, path, 1);
4187                if (ret)
4188                        return ret;
4189        }
4190again:
4191        split = 1;
4192        l = path->nodes[0];
4193        slot = path->slots[0];
4194        nritems = btrfs_header_nritems(l);
4195        mid = (nritems + 1) / 2;
4196
4197        if (mid <= slot) {
4198                if (nritems == 1 ||
4199                    leaf_space_used(l, mid, nritems - mid) + data_size >
4200                        BTRFS_LEAF_DATA_SIZE(fs_info)) {
4201                        if (slot >= nritems) {
4202                                split = 0;
4203                        } else {
4204                                mid = slot;
4205                                if (mid != nritems &&
4206                                    leaf_space_used(l, mid, nritems - mid) +
4207                                    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4208                                        if (data_size && !tried_avoid_double)
4209                                                goto push_for_double;
4210                                        split = 2;
4211                                }
4212                        }
4213                }
4214        } else {
4215                if (leaf_space_used(l, 0, mid) + data_size >
4216                        BTRFS_LEAF_DATA_SIZE(fs_info)) {
4217                        if (!extend && data_size && slot == 0) {
4218                                split = 0;
4219                        } else if ((extend || !data_size) && slot == 0) {
4220                                mid = 1;
4221                        } else {
4222                                mid = slot;
4223                                if (mid != nritems &&
4224                                    leaf_space_used(l, mid, nritems - mid) +
4225                                    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4226                                        if (data_size && !tried_avoid_double)
4227                                                goto push_for_double;
4228                                        split = 2;
4229                                }
4230                        }
4231                }
4232        }
4233
4234        if (split == 0)
4235                btrfs_cpu_key_to_disk(&disk_key, ins_key);
4236        else
4237                btrfs_item_key(l, &disk_key, mid);
4238
4239        right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
4240                                             l->start, 0);
4241        if (IS_ERR(right))
4242                return PTR_ERR(right);
4243
4244        root_add_used(root, fs_info->nodesize);
4245
4246        if (split == 0) {
4247                if (mid <= slot) {
4248                        btrfs_set_header_nritems(right, 0);
4249                        insert_ptr(trans, path, &disk_key,
4250                                   right->start, path->slots[1] + 1, 1);
4251                        btrfs_tree_unlock(path->nodes[0]);
4252                        free_extent_buffer(path->nodes[0]);
4253                        path->nodes[0] = right;
4254                        path->slots[0] = 0;
4255                        path->slots[1] += 1;
4256                } else {
4257                        btrfs_set_header_nritems(right, 0);
4258                        insert_ptr(trans, path, &disk_key,
4259                                   right->start, path->slots[1], 1);
4260                        btrfs_tree_unlock(path->nodes[0]);
4261                        free_extent_buffer(path->nodes[0]);
4262                        path->nodes[0] = right;
4263                        path->slots[0] = 0;
4264                        if (path->slots[1] == 0)
4265                                fixup_low_keys(path, &disk_key, 1);
4266                }
4267                /*
4268                 * We create a new leaf 'right' for the required ins_len and
4269                 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
4270                 * ins_len worth of content into 'right'.
4271                 */
4272                return ret;
4273        }
4274
4275        copy_for_split(trans, path, l, right, slot, mid, nritems);
4276
4277        if (split == 2) {
4278                BUG_ON(num_doubles != 0);
4279                num_doubles++;
4280                goto again;
4281        }
4282
4283        return 0;
4284
4285push_for_double:
4286        push_for_double_split(trans, root, path, data_size);
4287        tried_avoid_double = 1;
4288        if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4289                return 0;
4290        goto again;
4291}
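
/*
 * Summary of the 'split' modes chosen above: 0 means the new item gets a
 * brand new empty leaf (the edge-slot cases), 1 is a single split at
 * 'mid', and 2 means a second pass is needed because neither half of the
 * first split would leave enough room at the target slot.
 */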
4292
4293static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4294                                         struct btrfs_root *root,
4295                                         struct btrfs_path *path, int ins_len)
4296{
4297        struct btrfs_key key;
4298        struct extent_buffer *leaf;
4299        struct btrfs_file_extent_item *fi;
4300        u64 extent_len = 0;
4301        u32 item_size;
4302        int ret;
4303
4304        leaf = path->nodes[0];
4305        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4306
4307        BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4308               key.type != BTRFS_EXTENT_CSUM_KEY);
4309
4310        if (btrfs_leaf_free_space(leaf) >= ins_len)
4311                return 0;
4312
4313        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4314        if (key.type == BTRFS_EXTENT_DATA_KEY) {
4315                fi = btrfs_item_ptr(leaf, path->slots[0],
4316                                    struct btrfs_file_extent_item);
4317                extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4318        }
4319        btrfs_release_path(path);
4320
4321        path->keep_locks = 1;
4322        path->search_for_split = 1;
4323        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4324        path->search_for_split = 0;
4325        if (ret > 0)
4326                ret = -EAGAIN;
4327        if (ret < 0)
4328                goto err;
4329
4330        ret = -EAGAIN;
4331        leaf = path->nodes[0];
4332        /* if our item isn't there, return now */
4333        if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4334                goto err;
4335
4336        /* the leaf has changed, it now has room.  return now */
4337        if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
4338                goto err;
4339
4340        if (key.type == BTRFS_EXTENT_DATA_KEY) {
4341                fi = btrfs_item_ptr(leaf, path->slots[0],
4342                                    struct btrfs_file_extent_item);
4343                if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4344                        goto err;
4345        }
4346
4347        btrfs_set_path_blocking(path);
4348        ret = split_leaf(trans, root, &key, path, ins_len, 1);
4349        if (ret)
4350                goto err;
4351
4352        path->keep_locks = 0;
4353        btrfs_unlock_up_safe(path, 1);
4354        return 0;
4355err:
4356        path->keep_locks = 0;
4357        return ret;
4358}
4359
4360static noinline int split_item(struct btrfs_path *path,
4361                               const struct btrfs_key *new_key,
4362                               unsigned long split_offset)
4363{
4364        struct extent_buffer *leaf;
4365        struct btrfs_item *item;
4366        struct btrfs_item *new_item;
4367        int slot;
4368        char *buf;
4369        u32 nritems;
4370        u32 item_size;
4371        u32 orig_offset;
4372        struct btrfs_disk_key disk_key;
4373
4374        leaf = path->nodes[0];
4375        BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
4376
4377        btrfs_set_path_blocking(path);
4378
4379        item = btrfs_item_nr(path->slots[0]);
4380        orig_offset = btrfs_item_offset(leaf, item);
4381        item_size = btrfs_item_size(leaf, item);
4382
4383        buf = kmalloc(item_size, GFP_NOFS);
4384        if (!buf)
4385                return -ENOMEM;
4386
4387        read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4388                            path->slots[0]), item_size);
4389
4390        slot = path->slots[0] + 1;
4391        nritems = btrfs_header_nritems(leaf);
4392        if (slot != nritems) {
4393                /* shift the items */
4394                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4395                                btrfs_item_nr_offset(slot),
4396                                (nritems - slot) * sizeof(struct btrfs_item));
4397        }
4398
4399        btrfs_cpu_key_to_disk(&disk_key, new_key);
4400        btrfs_set_item_key(leaf, &disk_key, slot);
4401
4402        new_item = btrfs_item_nr(slot);
4403
4404        btrfs_set_item_offset(leaf, new_item, orig_offset);
4405        btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4406
4407        btrfs_set_item_offset(leaf, item,
4408                              orig_offset + item_size - split_offset);
4409        btrfs_set_item_size(leaf, item, split_offset);
4410
4411        btrfs_set_header_nritems(leaf, nritems + 1);
4412
4413        /* write the data for the start of the original item */
4414        write_extent_buffer(leaf, buf,
4415                            btrfs_item_ptr_offset(leaf, path->slots[0]),
4416                            split_offset);
4417
4418        /* write the data for the new item */
4419        write_extent_buffer(leaf, buf + split_offset,
4420                            btrfs_item_ptr_offset(leaf, slot),
4421                            item_size - split_offset);
4422        btrfs_mark_buffer_dirty(leaf);
4423
4424        BUG_ON(btrfs_leaf_free_space(leaf) < 0);
4425        kfree(buf);
4426        return 0;
4427}
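
/*
 * Data layout sketch for split_item() above: splitting an item of size S
 * at byte F leaves the first F bytes under the original key, now stored
 * at data offset (orig + S - F), while the trailing S - F bytes are
 * republished under 'new_key' in the next slot at the original data
 * offset 'orig'.  Both halves are rewritten from the kmalloc'd copy.
 */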
4428
4429/*
4430 * This function splits a single item into two items,
4431 * giving 'new_key' to the new item and splitting the
4432 * old one at split_offset (from the start of the item).
4433 *
4434 * The path may be released by this operation.  After
4435 * the split, the path is pointing to the old item.  The
4436 * new item is going to be in the same node as the old one.
4437 *
4438 * Note, the item being split must be small enough to live alone on
4439 * a tree block with room for one extra struct btrfs_item
4440 *
4441 * This allows us to split the item in place, keeping a lock on the
4442 * leaf the entire time.
4443 */
4444int btrfs_split_item(struct btrfs_trans_handle *trans,
4445                     struct btrfs_root *root,
4446                     struct btrfs_path *path,
4447                     const struct btrfs_key *new_key,
4448                     unsigned long split_offset)
4449{
4450        int ret;
4451        ret = setup_leaf_for_split(trans, root, path,
4452                                   sizeof(struct btrfs_item));
4453        if (ret)
4454                return ret;
4455
4456        ret = split_item(path, new_key, split_offset);
4457        return ret;
4458}
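
/*
 * A minimal caller sketch for btrfs_split_item(); 'example_split_in_half'
 * is hypothetical, not a btrfs API.  It assumes the caller has already
 * searched to an EXTENT_DATA or EXTENT_CSUM item (the only types
 * setup_leaf_for_split() accepts) with a write-locked path.
 */
static int example_split_in_half(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_path *path)
{
        struct btrfs_key key;
        u32 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);

        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
        /*
         * hypothetical key for the second half; real callers derive it
         * from the data being split
         */
        key.offset += item_size / 2;

        /* may return -EAGAIN if the leaf changed underneath us */
        return btrfs_split_item(trans, root, path, &key, item_size / 2);
}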
4459
4460/*
4461 * This function duplicates an item, giving 'new_key' to the new item.
4462 * It guarantees both items live in the same tree leaf and the new item
4463 * is contiguous with the original item.
4464 *
4465 * This allows us to split a file extent in place, keeping a lock on the
4466 * leaf the entire time.
4467 */
4468int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4469                         struct btrfs_root *root,
4470                         struct btrfs_path *path,
4471                         const struct btrfs_key *new_key)
4472{
4473        struct extent_buffer *leaf;
4474        int ret;
4475        u32 item_size;
4476
4477        leaf = path->nodes[0];
4478        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4479        ret = setup_leaf_for_split(trans, root, path,
4480                                   item_size + sizeof(struct btrfs_item));
4481        if (ret)
4482                return ret;
4483
4484        path->slots[0]++;
4485        setup_items_for_insert(root, path, new_key, &item_size,
4486                               item_size, item_size +
4487                               sizeof(struct btrfs_item), 1);
4488        leaf = path->nodes[0];
4489        memcpy_extent_buffer(leaf,
4490                             btrfs_item_ptr_offset(leaf, path->slots[0]),
4491                             btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4492                             item_size);
4493        return 0;
4494}
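
/*
 * A minimal caller sketch for btrfs_duplicate_item(); hypothetical helper,
 * loosely modeled on how the file extent code splits an extent item at a
 * given file offset.  After the call, path->slots[0] points at the copy.
 */
static int example_dup_file_extent(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   struct btrfs_path *path, u64 new_offset)
{
        struct btrfs_key new_key;

        btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
        new_key.offset = new_offset;
        return btrfs_duplicate_item(trans, root, path, &new_key);
}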
4495
4496/*
4497 * make the item pointed to by the path smaller.  new_size indicates
4498 * how small to make it, and from_end tells us if we just chop bytes
4499 * off the end of the item or if we shift the item to chop bytes off
4500 * the front.
4501 */
4502void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
4503{
4504        int slot;
4505        struct extent_buffer *leaf;
4506        struct btrfs_item *item;
4507        u32 nritems;
4508        unsigned int data_end;
4509        unsigned int old_data_start;
4510        unsigned int old_size;
4511        unsigned int size_diff;
4512        int i;
4513        struct btrfs_map_token token;
4514
4515        leaf = path->nodes[0];
4516        slot = path->slots[0];
4517
4518        old_size = btrfs_item_size_nr(leaf, slot);
4519        if (old_size == new_size)
4520                return;
4521
4522        nritems = btrfs_header_nritems(leaf);
4523        data_end = leaf_data_end(leaf);
4524
4525        old_data_start = btrfs_item_offset_nr(leaf, slot);
4526
4527        size_diff = old_size - new_size;
4528
4529        BUG_ON(slot < 0);
4530        BUG_ON(slot >= nritems);
4531
4532        /*
4533         * item0..itemN ... dataN.offset..dataN.size .. data0.size
4534         */
4535        /* first correct the data pointers */
4536        btrfs_init_map_token(&token, leaf);
4537        for (i = slot; i < nritems; i++) {
4538                u32 ioff;
4539                item = btrfs_item_nr(i);
4540
4541                ioff = btrfs_token_item_offset(&token, item);
4542                btrfs_set_token_item_offset(&token, item, ioff + size_diff);
4543        }
4544
4545        /* shift the data */
4546        if (from_end) {
4547                memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4548                              data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4549                              data_end, old_data_start + new_size - data_end);
4550        } else {
4551                struct btrfs_disk_key disk_key;
4552                u64 offset;
4553
4554                btrfs_item_key(leaf, &disk_key, slot);
4555
4556                if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4557                        unsigned long ptr;
4558                        struct btrfs_file_extent_item *fi;
4559
4560                        fi = btrfs_item_ptr(leaf, slot,
4561                                            struct btrfs_file_extent_item);
4562                        fi = (struct btrfs_file_extent_item *)(
4563                             (unsigned long)fi - size_diff);
4564
4565                        if (btrfs_file_extent_type(leaf, fi) ==
4566                            BTRFS_FILE_EXTENT_INLINE) {
4567                                ptr = btrfs_item_ptr_offset(leaf, slot);
4568                                memmove_extent_buffer(leaf, ptr,
4569                                      (unsigned long)fi,
4570                                      BTRFS_FILE_EXTENT_INLINE_DATA_START);
4571                        }
4572                }
4573
4574                memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4575                              data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4576                              data_end, old_data_start - data_end);
4577
4578                offset = btrfs_disk_key_offset(&disk_key);
4579                btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4580                btrfs_set_item_key(leaf, &disk_key, slot);
4581                if (slot == 0)
4582                        fixup_low_keys(path, &disk_key, 1);
4583        }
4584
4585        item = btrfs_item_nr(slot);
4586        btrfs_set_item_size(leaf, item, new_size);
4587        btrfs_mark_buffer_dirty(leaf);
4588
4589        if (btrfs_leaf_free_space(leaf) < 0) {
4590                btrfs_print_leaf(leaf);
4591                BUG();
4592        }
4593}
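
/*
 * A minimal caller sketch for btrfs_truncate_item(); 'example_chop_tail'
 * is hypothetical.  It trims 'bytes' off the end of the current item,
 * assuming a write-locked path to it (from_end = 1 keeps the item's key
 * untouched; from_end = 0 would also bump the key offset).
 */
static void example_chop_tail(struct btrfs_path *path, u32 bytes)
{
        u32 size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);

        if (bytes && bytes < size)
                btrfs_truncate_item(path, size - bytes, 1);
}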
4594
4595/*
4596 * make the item pointed to by the path bigger, data_size is the added size.
4597 */
4598void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
4599{
4600        int slot;
4601        struct extent_buffer *leaf;
4602        struct btrfs_item *item;
4603        u32 nritems;
4604        unsigned int data_end;
4605        unsigned int old_data;
4606        unsigned int old_size;
4607        int i;
4608        struct btrfs_map_token token;
4609
4610        leaf = path->nodes[0];
4611
4612        nritems = btrfs_header_nritems(leaf);
4613        data_end = leaf_data_end(leaf);
4614
4615        if (btrfs_leaf_free_space(leaf) < data_size) {
4616                btrfs_print_leaf(leaf);
4617                BUG();
4618        }
4619        slot = path->slots[0];
4620        old_data = btrfs_item_end_nr(leaf, slot);
4621
4622        BUG_ON(slot < 0);
4623        if (slot >= nritems) {
4624                btrfs_print_leaf(leaf);
4625                btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
4626                           slot, nritems);
4627                BUG();
4628        }
4629
4630        /*
4631         * item0..itemN ... dataN.offset..dataN.size .. data0.size
4632         */
4633        /* first correct the data pointers */
4634        btrfs_init_map_token(&token, leaf);
4635        for (i = slot; i < nritems; i++) {
4636                u32 ioff;
4637                item = btrfs_item_nr(i);
4638
4639                ioff = btrfs_token_item_offset(&token, item);
4640                btrfs_set_token_item_offset(&token, item, ioff - data_size);
4641        }
4642
4643        /* shift the data */
4644        memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4645                      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
4646                      data_end, old_data - data_end);
4647
4648        data_end = old_data;
4649        old_size = btrfs_item_size_nr(leaf, slot);
4650        item = btrfs_item_nr(slot);
4651        btrfs_set_item_size(leaf, item, old_size + data_size);
4652        btrfs_mark_buffer_dirty(leaf);
4653
4654        if (btrfs_leaf_free_space(leaf) < 0) {
4655                btrfs_print_leaf(leaf);
4656                BUG();
4657        }
4658}
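
/*
 * A minimal caller sketch for btrfs_extend_item(); hypothetical helper.
 * The leaf must already have 'grow' bytes of free space, which is
 * guaranteed here by searching with ins_len = grow (btrfs_search_slot()
 * splits the leaf if needed).  The caller releases the path.
 */
static int example_grow_item(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             const struct btrfs_key *key,
                             struct btrfs_path *path, u32 grow)
{
        int ret;

        ret = btrfs_search_slot(trans, root, key, path, grow, 1);
        if (ret < 0)
                return ret;
        if (ret > 0)
                return -ENOENT; /* the item must already exist */

        btrfs_extend_item(path, grow);
        return 0;
}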
4659
4660/*
4661 * this is a helper for btrfs_insert_empty_items; the main goal here is
4662 * to save stack depth by doing the bulk of the work in a function
4663 * that doesn't call btrfs_search_slot
4664 */
4665void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4666                            const struct btrfs_key *cpu_key, u32 *data_size,
4667                            u32 total_data, u32 total_size, int nr)
4668{
4669        struct btrfs_fs_info *fs_info = root->fs_info;
4670        struct btrfs_item *item;
4671        int i;
4672        u32 nritems;
4673        unsigned int data_end;
4674        struct btrfs_disk_key disk_key;
4675        struct extent_buffer *leaf;
4676        int slot;
4677        struct btrfs_map_token token;
4678
4679        if (path->slots[0] == 0) {
4680                btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4681                fixup_low_keys(path, &disk_key, 1);
4682        }
4683        btrfs_unlock_up_safe(path, 1);
4684
4685        leaf = path->nodes[0];
4686        slot = path->slots[0];
4687
4688        nritems = btrfs_header_nritems(leaf);
4689        data_end = leaf_data_end(leaf);
4690
4691        if (btrfs_leaf_free_space(leaf) < total_size) {
4692                btrfs_print_leaf(leaf);
4693                btrfs_crit(fs_info, "not enough freespace need %u have %d",
4694                           total_size, btrfs_leaf_free_space(leaf));
4695                BUG();
4696        }
4697
4698        btrfs_init_map_token(&token, leaf);
4699        if (slot != nritems) {
4700                unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4701
4702                if (old_data < data_end) {
4703                        btrfs_print_leaf(leaf);
4704                        btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
4705                                   slot, old_data, data_end);
4706                        BUG();
4707                }
4708                /*
4709                 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4710                 */
4711                /* first correct the data pointers */
4712                for (i = slot; i < nritems; i++) {
4713                        u32 ioff;
4714
4715                        item = btrfs_item_nr(i);
4716                        ioff = btrfs_token_item_offset(&token, item);
4717                        btrfs_set_token_item_offset(&token, item,
4718                                                    ioff - total_data);
4719                }
4720                /* shift the items */
4721                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4722                              btrfs_item_nr_offset(slot),
4723                              (nritems - slot) * sizeof(struct btrfs_item));
4724
4725                /* shift the data */
4726                memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4727                              data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
4728                              data_end, old_data - data_end);
4729                data_end = old_data;
4730        }
4731
4732        /* setup the item for the new data */
4733        for (i = 0; i < nr; i++) {
4734                btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4735                btrfs_set_item_key(leaf, &disk_key, slot + i);
4736                item = btrfs_item_nr(slot + i);
4737                btrfs_set_token_item_offset(&token, item, data_end - data_size[i]);
4738                data_end -= data_size[i];
4739                btrfs_set_token_item_size(&token, item, data_size[i]);
4740        }
4741
4742        btrfs_set_header_nritems(leaf, nritems + nr);
4743        btrfs_mark_buffer_dirty(leaf);
4744
4745        if (btrfs_leaf_free_space(leaf) < 0) {
4746                btrfs_print_leaf(leaf);
4747                BUG();
4748        }
4749}
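
/*
 * Note on the size arguments above: total_data is the sum of the
 * data_size[] payloads, while total_size additionally counts one
 * struct btrfs_item header per new item; the free space check and the
 * callers' space reservation both use the latter.
 */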
4750
4751/*
4752 * Given a key and some data, insert items into the tree.
4753 * This does all the path init required, making room in the tree if needed.
4754 */
4755int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4756                            struct btrfs_root *root,
4757                            struct btrfs_path *path,
4758                            const struct btrfs_key *cpu_key, u32 *data_size,
4759                            int nr)
4760{
4761        int ret = 0;
4762        int slot;
4763        int i;
4764        u32 total_size = 0;
4765        u32 total_data = 0;
4766
4767        for (i = 0; i < nr; i++)
4768                total_data += data_size[i];
4769
4770        total_size = total_data + (nr * sizeof(struct btrfs_item));
4771        ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4772        if (ret == 0)
4773                return -EEXIST;
4774        if (ret < 0)
4775                return ret;
4776
4777        slot = path->slots[0];
4778        BUG_ON(slot < 0);
4779
4780        setup_items_for_insert(root, path, cpu_key, data_size,
4781                               total_data, total_size, nr);
4782        return 0;
4783}
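
/*
 * A minimal caller sketch for btrfs_insert_empty_items(); hypothetical
 * keys and sizes.  The keys must be sorted and all items must fit in one
 * leaf; on success the path points at the first new (still empty) item.
 */
static int example_insert_two(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct btrfs_path *path,
                              struct btrfs_key *keys /* two sorted keys */)
{
        u32 sizes[2] = { 16, 32 };      /* arbitrary payload sizes */
        int ret;

        ret = btrfs_insert_empty_items(trans, root, path, keys, sizes, 2);
        if (ret)
                return ret;     /* -EEXIST if keys[0] is already present */

        /*
         * Fill the items via write_extent_buffer() at
         * btrfs_item_ptr_offset(path->nodes[0], path->slots[0] + i),
         * then mark the leaf dirty, as btrfs_insert_item() does below.
         */
        return 0;
}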
4784
4785/*
4786 * Given a key and some data, insert an item into the tree.
4787 * This does all the path init required, making room in the tree if needed.
4788 */
4789int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4790                      const struct btrfs_key *cpu_key, void *data,
4791                      u32 data_size)
4792{
4793        int ret = 0;
4794        struct btrfs_path *path;
4795        struct extent_buffer *leaf;
4796        unsigned long ptr;
4797
4798        path = btrfs_alloc_path();
4799        if (!path)
4800                return -ENOMEM;
4801        ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4802        if (!ret) {
4803                leaf = path->nodes[0];
4804                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4805                write_extent_buffer(leaf, data, ptr, data_size);
4806                btrfs_mark_buffer_dirty(leaf);
4807        }
4808        btrfs_free_path(path);
4809        return ret;
4810}
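
/*
 * A minimal caller sketch for btrfs_insert_item(); the key choice is
 * hypothetical (any item type the caller owns would do).
 */
static int example_insert_blob(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root, u64 objectid,
                               void *blob, u32 len)
{
        struct btrfs_key key;

        key.objectid = objectid;
        key.type = BTRFS_TEMPORARY_ITEM_KEY;    /* assumption for the demo */
        key.offset = 0;

        return btrfs_insert_item(trans, root, &key, blob, len);
}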
4811
4812/*
4813 * delete the pointer from a given node.
4814 *
4815 * the tree should have been previously balanced so the deletion does not
4816 * empty a node.
4817 */
4818static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4819                    int level, int slot)
4820{
4821        struct extent_buffer *parent = path->nodes[level];
4822        u32 nritems;
4823        int ret;
4824
4825        nritems = btrfs_header_nritems(parent);
4826        if (slot != nritems - 1) {
4827                if (level) {
4828                        ret = tree_mod_log_insert_move(parent, slot, slot + 1,
4829                                        nritems - slot - 1);
4830                        BUG_ON(ret < 0);
4831                }
4832                memmove_extent_buffer(parent,
4833                              btrfs_node_key_ptr_offset(slot),
4834                              btrfs_node_key_ptr_offset(slot + 1),
4835                              sizeof(struct btrfs_key_ptr) *
4836                              (nritems - slot - 1));
4837        } else if (level) {
4838                ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
4839                                GFP_NOFS);
4840                BUG_ON(ret < 0);
4841        }
4842
4843        nritems--;
4844        btrfs_set_header_nritems(parent, nritems);
4845        if (nritems == 0 && parent == root->node) {
4846                BUG_ON(btrfs_header_level(root->node) != 1);
4847                /* just turn the root into a leaf and break */
4848                btrfs_set_header_level(root->node, 0);
4849        } else if (slot == 0) {
4850                struct btrfs_disk_key disk_key;
4851
4852                btrfs_node_key(parent, &disk_key, 0);
4853                fixup_low_keys(path, &disk_key, level + 1);
4854        }
4855        btrfs_mark_buffer_dirty(parent);
4856}
4857
4858/*
4859 * a helper function to delete the leaf pointed to by path->slots[1] and
4860 * path->nodes[1].
4861 * This deletes the pointer in path->nodes[1] and frees the leaf
4862 * block extent.
4863 * block extent.  zero is returned if it all worked out, < 0 otherwise.
4864 *
4865 * The path must have already been setup for deleting the leaf, including
4866 * all the proper balancing.  path->nodes[1] must be locked.
4867 */
4868static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4869                                    struct btrfs_root *root,
4870                                    struct btrfs_path *path,
4871                                    struct extent_buffer *leaf)
4872{
4873        WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4874        del_ptr(root, path, 1, path->slots[1]);
4875
4876        /*
4877         * btrfs_free_extent is expensive, we want to make sure we
4878         * aren't holding any locks when we call it
4879         */
4880        btrfs_unlock_up_safe(path, 0);
4881
4882        root_sub_used(root, leaf->len);
4883
4884        atomic_inc(&leaf->refs);
4885        btrfs_free_tree_block(trans, root, leaf, 0, 1);
4886        free_extent_buffer_stale(leaf);
4887}

4888/*
4889 * delete the item at the leaf level in path.  If that empties
4890 * the leaf, remove it from the tree
4891 */
4892int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4893                    struct btrfs_path *path, int slot, int nr)
4894{
4895        struct btrfs_fs_info *fs_info = root->fs_info;
4896        struct extent_buffer *leaf;
4897        struct btrfs_item *item;
4898        u32 last_off;
4899        u32 dsize = 0;
4900        int ret = 0;
4901        int wret;
4902        int i;
4903        u32 nritems;
4904
4905        leaf = path->nodes[0];
4906        last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4907
4908        for (i = 0; i < nr; i++)
4909                dsize += btrfs_item_size_nr(leaf, slot + i);
4910
4911        nritems = btrfs_header_nritems(leaf);
4912
4913        if (slot + nr != nritems) {
4914                int data_end = leaf_data_end(leaf);
4915                struct btrfs_map_token token;
4916
4917                memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4918                              data_end + dsize,
4919                              BTRFS_LEAF_DATA_OFFSET + data_end,
4920                              last_off - data_end);
4921
4922                btrfs_init_map_token(&token, leaf);
4923                for (i = slot + nr; i < nritems; i++) {
4924                        u32 ioff;
4925
4926                        item = btrfs_item_nr(i);
4927                        ioff = btrfs_token_item_offset(&token, item);
4928                        btrfs_set_token_item_offset(&token, item, ioff + dsize);
4929                }
4930
4931                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4932                              btrfs_item_nr_offset(slot + nr),
4933                              sizeof(struct btrfs_item) *
4934                              (nritems - slot - nr));
4935        }
4936        btrfs_set_header_nritems(leaf, nritems - nr);
4937        nritems -= nr;
4938
4939        /* delete the leaf if we've emptied it */
4940        if (nritems == 0) {
4941                if (leaf == root->node) {
4942                        btrfs_set_header_level(leaf, 0);
4943                } else {
4944                        btrfs_set_path_blocking(path);
4945                        btrfs_clean_tree_block(leaf);
4946                        btrfs_del_leaf(trans, root, path, leaf);
4947                }
4948        } else {
4949                int used = leaf_space_used(leaf, 0, nritems);
4950                if (slot == 0) {
4951                        struct btrfs_disk_key disk_key;
4952
4953                        btrfs_item_key(leaf, &disk_key, 0);
4954                        fixup_low_keys(path, &disk_key, 1);
4955                }
4956
4957                /* delete the leaf if it is mostly empty */
4958                if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
4959                        /* push_leaf_left fixes the path.
4960                         * make sure the path still points to our leaf
4961                         * for possible call to del_ptr below
4962                         */
4963                        slot = path->slots[1];
4964                        atomic_inc(&leaf->refs);
4965
4966                        btrfs_set_path_blocking(path);
4967                        wret = push_leaf_left(trans, root, path, 1, 1,
4968                                              1, (u32)-1);
4969                        if (wret < 0 && wret != -ENOSPC)
4970                                ret = wret;
4971
4972                        if (path->nodes[0] == leaf &&
4973                            btrfs_header_nritems(leaf)) {
4974                                wret = push_leaf_right(trans, root, path, 1,
4975                                                       1, 1, 0);
4976                                if (wret < 0 && wret != -ENOSPC)
4977                                        ret = wret;
4978                        }
4979
4980                        if (btrfs_header_nritems(leaf) == 0) {
4981                                path->slots[1] = slot;
4982                                btrfs_del_leaf(trans, root, path, leaf);
4983                                free_extent_buffer(leaf);
4984                                ret = 0;
4985                        } else {
4986                                /* if we're still in the path, make sure
4987                                 * we're dirty.  Otherwise, one of the
4988                                 * push_leaf functions must have already
4989                                 * dirtied this buffer
4990                                 */
4991                                if (path->nodes[0] == leaf)
4992                                        btrfs_mark_buffer_dirty(leaf);
4993                                free_extent_buffer(leaf);
4994                        }
4995                } else {
4996                        btrfs_mark_buffer_dirty(leaf);
4997                }
4998        }
4999        return ret;
5000}
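
/*
 * A minimal caller sketch for btrfs_del_items(); deleting a single item
 * this way is what the btrfs_del_item() wrapper in ctree.h does.  Callers
 * usually release the path afterwards.
 */
static int example_del_current(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct btrfs_path *path)
{
        return btrfs_del_items(trans, root, path, path->slots[0], 1);
}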
5001
5002/*
5003 * search the tree again to find a leaf with lesser keys
5004 * returns 0 if it found something or 1 if there are no lesser leaves.
5005 * returns < 0 on io errors.
5006 *
5007 * This may release the path, and so you may lose any locks held at the
5008 * time you call it.
5009 */
5010int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5011{
5012        struct btrfs_key key;
5013        struct btrfs_disk_key found_key;
5014        int ret;
5015
5016        btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5017
5018        if (key.offset > 0) {
5019                key.offset--;
5020        } else if (key.type > 0) {
5021                key.type--;
5022                key.offset = (u64)-1;
5023        } else if (key.objectid > 0) {
5024                key.objectid--;
5025                key.type = (u8)-1;
5026                key.offset = (u64)-1;
5027        } else {
5028                return 1;
5029        }
5030
5031        btrfs_release_path(path);
5032        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5033        if (ret < 0)
5034                return ret;
5035        btrfs_item_key(path->nodes[0], &found_key, 0);
5036        ret = comp_keys(&found_key, &key);
5037        /*
5038         * We might have had an item with the previous key in the tree right
5039         * before we released our path. And after we released our path, that
5040         * item might have been pushed to the first slot (0) of the leaf we
5041         * were holding due to a tree balance. Alternatively, an item with the
5042         * previous key can exist as the only element of a leaf (big fat item).
5043         * Therefore account for these 2 cases, so that our callers (like
5044         * btrfs_previous_item) don't miss an existing item with a key matching
5045         * the previous key we computed above.
5046         */
5047        if (ret <= 0)
5048                return 0;
5049        return 1;
5050}
5051
5052/*
5053 * A helper function to walk down the tree starting at min_key, and looking
5054 * for nodes or leaves that have a transaction id of at least min_trans.
5055 * This is used by the btree defrag code, and tree logging
5056 *
5057 * This does not cow, but it does stuff the starting key it finds back
5058 * into min_key, so you can call btrfs_search_slot with cow=1 on the
5059 * key and get a writable path.
5060 *
5061 * This honors path->lowest_level to prevent descent past a given level
5062 * of the tree.
5063 *
5064 * min_trans indicates the oldest transaction that you are interested
5065 * in walking through.  Any nodes or leaves older than min_trans are
5066 * skipped over (without reading them).
5067 *
5068 * returns zero if something useful was found, < 0 on error and 1 if there
5069 * was nothing in the tree that matched the search criteria.
5070 */
5071int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5072                         struct btrfs_path *path,
5073                         u64 min_trans)
5074{
5075        struct extent_buffer *cur;
5076        struct btrfs_key found_key;
5077        int slot;
5078        int sret;
5079        u32 nritems;
5080        int level;
5081        int ret = 1;
5082        int keep_locks = path->keep_locks;
5083
5084        path->keep_locks = 1;
5085again:
5086        cur = btrfs_read_lock_root_node(root);
5087        level = btrfs_header_level(cur);
5088        WARN_ON(path->nodes[level]);
5089        path->nodes[level] = cur;
5090        path->locks[level] = BTRFS_READ_LOCK;
5091
5092        if (btrfs_header_generation(cur) < min_trans) {
5093                ret = 1;
5094                goto out;
5095        }
5096        while (1) {
5097                nritems = btrfs_header_nritems(cur);
5098                level = btrfs_header_level(cur);
5099                sret = btrfs_bin_search(cur, min_key, &slot);
5100                if (sret < 0) {
5101                        ret = sret;
5102                        goto out;
5103                }
5104
5105                /* at the lowest level we're done; set up the path and exit */
5106                if (level == path->lowest_level) {
5107                        if (slot >= nritems)
5108                                goto find_next_key;
5109                        ret = 0;
5110                        path->slots[level] = slot;
5111                        btrfs_item_key_to_cpu(cur, &found_key, slot);
5112                        goto out;
5113                }
5114                if (sret && slot > 0)
5115                        slot--;
5116                /*
5117                 * check this node pointer against the min_trans parameter.
5118                 * If it is too old, skip to the next one.
5119                 */
5120                while (slot < nritems) {
5121                        u64 gen;
5122
5123                        gen = btrfs_node_ptr_generation(cur, slot);
5124                        if (gen < min_trans) {
5125                                slot++;
5126                                continue;
5127                        }
5128                        break;
5129                }
5130find_next_key:
5131                /*
5132                 * we didn't find a candidate key in this node, walk forward
5133                 * and find another one
5134                 */
5135                if (slot >= nritems) {
5136                        path->slots[level] = slot;
5137                        btrfs_set_path_blocking(path);
5138                        sret = btrfs_find_next_key(root, path, min_key, level,
5139                                                  min_trans);
5140                        if (sret == 0) {
5141                                btrfs_release_path(path);
5142                                goto again;
5143                        } else {
5144                                goto out;
5145                        }
5146                }
5147                /* save our key so we can return it to the caller */
5148                btrfs_node_key_to_cpu(cur, &found_key, slot);
5149                path->slots[level] = slot;
5150                if (level == path->lowest_level) {
5151                        ret = 0;
5152                        goto out;
5153                }
5154                btrfs_set_path_blocking(path);
5155                cur = btrfs_read_node_slot(cur, slot);
5156                if (IS_ERR(cur)) {
5157                        ret = PTR_ERR(cur);
5158                        goto out;
5159                }
5160
5161                btrfs_tree_read_lock(cur);
5162
5163                path->locks[level - 1] = BTRFS_READ_LOCK;
5164                path->nodes[level - 1] = cur;
5165                unlock_up(path, level, 1, 0, NULL);
5166        }
5167out:
5168        path->keep_locks = keep_locks;
5169        if (ret == 0) {
5170                btrfs_unlock_up_safe(path, path->lowest_level + 1);
5171                btrfs_set_path_blocking(path);
5172                memcpy(min_key, &found_key, sizeof(found_key));
5173        }
5174        return ret;
5175}
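
/*
 * A minimal caller sketch for btrfs_search_forward(); hypothetical walker
 * in the style of the defrag and tree-log scans.  Advancing by bumping
 * key.offset is a simplification: a production walker also handles
 * offset/type wraparound.
 */
static int example_walk_newer(struct btrfs_root *root, u64 min_trans)
{
        struct btrfs_key min_key = { 0 };       /* start at the smallest key */
        struct btrfs_path *path;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        while (1) {
                ret = btrfs_search_forward(root, &min_key, path, min_trans);
                if (ret)        /* 1: nothing newer left, < 0: error */
                        break;

                /* ... process path->nodes[0], path->slots[0] ... */

                if (min_key.offset == (u64)-1)
                        break;
                min_key.offset++;
                btrfs_release_path(path);
        }
        btrfs_free_path(path);
        return ret < 0 ? ret : 0;
}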
5176
5177/*
5178 * this is similar to btrfs_next_leaf, but does not try to preserve
5179 * and fixup the path.  It looks for and returns the next key in the
5180 * tree based on the current path and the min_trans parameters.
5181 *
5182 * 0 is returned if another key is found, < 0 if there are any errors
5183 * and 1 is returned if there are no higher keys in the tree
5184 *
5185 * path->keep_locks should be set to 1 on the search made before
5186 * calling this function.
5187 */
5188int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5189                        struct btrfs_key *key, int level, u64 min_trans)
5190{
5191        int slot;
5192        struct extent_buffer *c;
5193
5194        WARN_ON(!path->keep_locks && !path->skip_locking);
5195        while (level < BTRFS_MAX_LEVEL) {
5196                if (!path->nodes[level])
5197                        return 1;
5198
5199                slot = path->slots[level] + 1;
5200                c = path->nodes[level];
5201next:
5202                if (slot >= btrfs_header_nritems(c)) {
5203                        int ret;
5204                        int orig_lowest;
5205                        struct btrfs_key cur_key;
5206                        if (level + 1 >= BTRFS_MAX_LEVEL ||
5207                            !path->nodes[level + 1])
5208                                return 1;
5209
5210                        if (path->locks[level + 1] || path->skip_locking) {
5211                                level++;
5212                                continue;
5213                        }
5214
5215                        slot = btrfs_header_nritems(c) - 1;
5216                        if (level == 0)
5217                                btrfs_item_key_to_cpu(c, &cur_key, slot);
5218                        else
5219                                btrfs_node_key_to_cpu(c, &cur_key, slot);
5220
5221                        orig_lowest = path->lowest_level;
5222                        btrfs_release_path(path);
5223                        path->lowest_level = level;
5224                        ret = btrfs_search_slot(NULL, root, &cur_key, path,
5225                                                0, 0);
5226                        path->lowest_level = orig_lowest;
5227                        if (ret < 0)
5228                                return ret;
5229
5230                        c = path->nodes[level];
5231                        slot = path->slots[level];
5232                        if (ret == 0)
5233                                slot++;
5234                        goto next;
5235                }
5236
5237                if (level == 0)
5238                        btrfs_item_key_to_cpu(c, key, slot);
5239                else {
5240                        u64 gen = btrfs_node_ptr_generation(c, slot);
5241
5242                        if (gen < min_trans) {
5243                                slot++;
5244                                goto next;
5245                        }
5246                        btrfs_node_key_to_cpu(c, key, slot);
5247                }
5248                return 0;
5249        }
5250        return 1;
5251}
5252
5253/*
5254 * search the tree again to find a leaf with greater keys
5255 * returns 0 if it found something or 1 if there are no greater leaves.
5256 * returns < 0 on io errors.
5257 */
5258int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5259{
5260        return btrfs_next_old_leaf(root, path, 0);
5261}
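
/*
 * The canonical forward-iteration pattern built on btrfs_next_leaf();
 * hypothetical helper, 'max_objectid' just bounds the walk.  Assumes the
 * path was positioned by a prior btrfs_search_slot().
 */
static int example_iterate(struct btrfs_root *root, struct btrfs_path *path,
                           u64 max_objectid)
{
        struct extent_buffer *leaf;
        struct btrfs_key key;
        int ret;

        while (1) {
                leaf = path->nodes[0];
                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret)        /* 1: end of tree, < 0: error */
                                return ret;
                        continue;
                }
                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                if (key.objectid > max_objectid)
                        return 0;       /* walked past the range */
                /* ... process the item ... */
                path->slots[0]++;
        }
}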
5262
5263int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5264                        u64 time_seq)
5265{
5266        int slot;
5267        int level;
5268        struct extent_buffer *c;
5269        struct extent_buffer *next;
5270        struct btrfs_key key;
5271        u32 nritems;
5272        int ret;
5273        int old_spinning = path->leave_spinning;
5274        int next_rw_lock = 0;
5275
5276        nritems = btrfs_header_nritems(path->nodes[0]);
5277        if (nritems == 0)
5278                return 1;
5279
5280        btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5281again:
5282        level = 1;
5283        next = NULL;
5284        next_rw_lock = 0;
5285        btrfs_release_path(path);
5286
5287        path->keep_locks = 1;
5288        path->leave_spinning = 1;
5289
5290        if (time_seq)
5291                ret = btrfs_search_old_slot(root, &key, path, time_seq);
5292        else
5293                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5294        path->keep_locks = 0;
5295
5296        if (ret < 0)
5297                return ret;
5298
5299        nritems = btrfs_header_nritems(path->nodes[0]);
5300        /*
5301         * by releasing the path above we dropped all our locks.  A balance
5302         * could have added more items next to the key that used to be
5303         * at the very end of the block.  So, check again here and
5304         * advance the path if there are now more items available.
5305         */
5306        if (nritems > 0 && path->slots[0] < nritems - 1) {
5307                if (ret == 0)
5308                        path->slots[0]++;
5309                ret = 0;
5310                goto done;
5311        }
5312        /*
5313         * So the above check misses one case:
5314         * - after releasing the path above, someone has removed the item that
5315         *   used to be at the very end of the block, and balance between leafs
5316         *   gets another one with bigger key.offset to replace it.
5317         *
5318         * This one should be returned as well, or we can get leaf corruption
5319         * later (esp. in __btrfs_drop_extents()).
5320         *
5321         * And a bit more explanation about this check,
5322         * with ret > 0, the key isn't found, the path points to the slot
5323         * where it should be inserted, so the path->slots[0] item must be the
5324         * bigger one.
5325         */
5326        if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
5327                ret = 0;
5328                goto done;
5329        }
5330
5331        while (level < BTRFS_MAX_LEVEL) {
5332                if (!path->nodes[level]) {
5333                        ret = 1;
5334                        goto done;
5335                }
5336
5337                slot = path->slots[level] + 1;
5338                c = path->nodes[level];
5339                if (slot >= btrfs_header_nritems(c)) {
5340                        level++;
5341                        if (level == BTRFS_MAX_LEVEL) {
5342                                ret = 1;
5343                                goto done;
5344                        }
5345                        continue;
5346                }
5347
5348                if (next) {
5349                        btrfs_tree_unlock_rw(next, next_rw_lock);
5350                        free_extent_buffer(next);
5351                }
5352
5353                next = c;
5354                next_rw_lock = path->locks[level];
5355                ret = read_block_for_search(root, path, &next, level,
5356                                            slot, &key);
5357                if (ret == -EAGAIN)
5358                        goto again;
5359
5360                if (ret < 0) {
5361                        btrfs_release_path(path);
5362                        goto done;
5363                }
5364
5365                if (!path->skip_locking) {
5366                        ret = btrfs_try_tree_read_lock(next);
5367                        if (!ret && time_seq) {
5368                                /*
5369                                 * If we don't get the lock, we may be racing
5370                                 * with push_leaf_left, holding that lock while
5371                                 * itself waiting for the leaf we've currently
5372                                 * locked. To solve this situation, we give up
5373                                 * on our lock and cycle.
5374                                 */
5375                                free_extent_buffer(next);
5376                                btrfs_release_path(path);
5377                                cond_resched();
5378                                goto again;
5379                        }
5380                        if (!ret) {
5381                                btrfs_set_path_blocking(path);
5382                                btrfs_tree_read_lock(next);
5383                        }
5384                        next_rw_lock = BTRFS_READ_LOCK;
5385                }
5386                break;
5387        }
5388        path->slots[level] = slot;
5389        while (1) {
5390                level--;
5391                c = path->nodes[level];
5392                if (path->locks[level])
5393                        btrfs_tree_unlock_rw(c, path->locks[level]);
5394
5395                free_extent_buffer(c);
5396                path->nodes[level] = next;
5397                path->slots[level] = 0;
5398                if (!path->skip_locking)
5399                        path->locks[level] = next_rw_lock;
5400                if (!level)
5401                        break;
5402
5403                ret = read_block_for_search(root, path, &next, level,
5404                                            0, &key);
5405                if (ret == -EAGAIN)
5406                        goto again;
5407
5408                if (ret < 0) {
5409                        btrfs_release_path(path);
5410                        goto done;
5411                }
5412
5413                if (!path->skip_locking) {
5414                        ret = btrfs_try_tree_read_lock(next);
5415                        if (!ret) {
5416                                btrfs_set_path_blocking(path);
5417                                btrfs_tree_read_lock(next);
5418                        }
5419                        next_rw_lock = BTRFS_READ_LOCK;
5420                }
5421        }
5422        ret = 0;
5423done:
5424        unlock_up(path, 0, 1, 0, NULL);
5425        path->leave_spinning = old_spinning;
5426        if (!old_spinning)
5427                btrfs_set_path_blocking(path);
5428
5429        return ret;
5430}
5431
5432/*
5433 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5434 * searching until it gets past min_objectid or finds an item of 'type'
5435 *
5436 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5437 */
5438int btrfs_previous_item(struct btrfs_root *root,
5439                        struct btrfs_path *path, u64 min_objectid,
5440                        int type)
5441{
5442        struct btrfs_key found_key;
5443        struct extent_buffer *leaf;
5444        u32 nritems;
5445        int ret;
5446
5447        while (1) {
5448                if (path->slots[0] == 0) {
5449                        btrfs_set_path_blocking(path);
5450                        ret = btrfs_prev_leaf(root, path);
5451                        if (ret != 0)
5452                                return ret;
5453                } else {
5454                        path->slots[0]--;
5455                }
5456                leaf = path->nodes[0];
5457                nritems = btrfs_header_nritems(leaf);
5458                if (nritems == 0)
5459                        return 1;
5460                if (path->slots[0] == nritems)
5461                        path->slots[0]--;
5462
5463                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5464                if (found_key.objectid < min_objectid)
5465                        break;
5466                if (found_key.type == type)
5467                        return 0;
5468                if (found_key.objectid == min_objectid &&
5469                    found_key.type < type)
5470                        break;
5471        }
5472        return 1;
5473}
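
/*
 * A minimal caller sketch for btrfs_previous_item(); hypothetical helper
 * stepping back to the closest preceding inode item.  Assumes the path
 * already points at or after the area of interest, e.g. from a
 * btrfs_search_slot() that returned 1.
 */
static int example_prev_inode_item(struct btrfs_root *root,
                                   struct btrfs_path *path,
                                   u64 min_objectid,
                                   struct btrfs_key *key_ret)
{
        int ret;

        ret = btrfs_previous_item(root, path, min_objectid,
                                  BTRFS_INODE_ITEM_KEY);
        if (ret == 0)
                btrfs_item_key_to_cpu(path->nodes[0], key_ret,
                                      path->slots[0]);
        return ret;     /* 0: found, 1: nothing before min_objectid */
}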
5474
5475/*
5476 * search in the extent tree to find a previous Metadata/Data extent item
5477 * with min objectid.
5478 *
5479 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5480 */
5481int btrfs_previous_extent_item(struct btrfs_root *root,
5482                        struct btrfs_path *path, u64 min_objectid)
5483{
5484        struct btrfs_key found_key;
5485        struct extent_buffer *leaf;
5486        u32 nritems;
5487        int ret;
5488
5489        while (1) {
5490                if (path->slots[0] == 0) {
5491                        btrfs_set_path_blocking(path);
5492                        ret = btrfs_prev_leaf(root, path);
5493                        if (ret != 0)
5494                                return ret;
5495                } else {
5496                        path->slots[0]--;
5497                }
5498                leaf = path->nodes[0];
5499                nritems = btrfs_header_nritems(leaf);
5500                if (nritems == 0)
5501                        return 1;
5502                if (path->slots[0] == nritems)
5503                        path->slots[0]--;
5504
5505                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5506                if (found_key.objectid < min_objectid)
5507                        break;
5508                if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5509                    found_key.type == BTRFS_METADATA_ITEM_KEY)
5510                        return 0;
5511                if (found_key.objectid == min_objectid &&
5512                    found_key.type < BTRFS_EXTENT_ITEM_KEY)
5513                        break;
5514        }
5515        return 1;
5516}
5517