linux/fs/btrfs/ctree.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
   4 */
   5
   6#include <linux/sched.h>
   7#include <linux/slab.h>
   8#include <linux/rbtree.h>
   9#include <linux/mm.h>
  10#include "ctree.h"
  11#include "disk-io.h"
  12#include "transaction.h"
  13#include "print-tree.h"
  14#include "locking.h"
  15#include "volumes.h"
  16#include "qgroup.h"
  17#include "tree-mod-log.h"
  18
  19static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
  20                      *root, struct btrfs_path *path, int level);
  21static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
  22                      const struct btrfs_key *ins_key, struct btrfs_path *path,
  23                      int data_size, int extend);
  24static int push_node_left(struct btrfs_trans_handle *trans,
  25                          struct extent_buffer *dst,
  26                          struct extent_buffer *src, int empty);
  27static int balance_node_right(struct btrfs_trans_handle *trans,
  28                              struct extent_buffer *dst_buf,
  29                              struct extent_buffer *src_buf);
  30static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
  31                    int level, int slot);
  32
  33static const struct btrfs_csums {
  34        u16             size;
  35        const char      name[10];
  36        const char      driver[12];
  37} btrfs_csums[] = {
  38        [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
  39        [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
  40        [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
  41        [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
  42                                     .driver = "blake2b-256" },
  43};
  44
  45int btrfs_super_csum_size(const struct btrfs_super_block *s)
  46{
  47        u16 t = btrfs_super_csum_type(s);
  48        /*
  49         * csum type is validated at mount time
  50         */
  51        return btrfs_csums[t].size;
  52}
  53
  54const char *btrfs_super_csum_name(u16 csum_type)
  55{
  56        /* csum type is validated at mount time */
  57        return btrfs_csums[csum_type].name;
  58}
  59
  60/*
  61 * Return driver name if defined, otherwise the name that's also a valid driver
  62 * name
  63 */
  64const char *btrfs_super_csum_driver(u16 csum_type)
  65{
  66        /* csum type is validated at mount time */
  67        return btrfs_csums[csum_type].driver[0] ?
  68                btrfs_csums[csum_type].driver :
  69                btrfs_csums[csum_type].name;
  70}
  71
  72size_t __attribute_const__ btrfs_get_num_csums(void)
  73{
  74        return ARRAY_SIZE(btrfs_csums);
  75}
  76
  77struct btrfs_path *btrfs_alloc_path(void)
  78{
  79        return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
  80}
  81
  82/* this also releases the path */
  83void btrfs_free_path(struct btrfs_path *p)
  84{
  85        if (!p)
  86                return;
  87        btrfs_release_path(p);
  88        kmem_cache_free(btrfs_path_cachep, p);
  89}
  90
  91/*
  92 * path release drops references on the extent buffers in the path
  93 * and it drops any locks held by this path
  94 *
  95 * It is safe to call this on paths that no locks or extent buffers held.
  96 */
  97noinline void btrfs_release_path(struct btrfs_path *p)
  98{
  99        int i;
 100
 101        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
 102                p->slots[i] = 0;
 103                if (!p->nodes[i])
 104                        continue;
 105                if (p->locks[i]) {
 106                        btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
 107                        p->locks[i] = 0;
 108                }
 109                free_extent_buffer(p->nodes[i]);
 110                p->nodes[i] = NULL;
 111        }
 112}
 113
 114/*
 115 * safely gets a reference on the root node of a tree.  A lock
 116 * is not taken, so a concurrent writer may put a different node
 117 * at the root of the tree.  See btrfs_lock_root_node for the
 118 * looping required.
 119 *
 120 * The extent buffer returned by this has a reference taken, so
 121 * it won't disappear.  It may stop being the root of the tree
 122 * at any time because there are no locks held.
 123 */
 124struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
 125{
 126        struct extent_buffer *eb;
 127
 128        while (1) {
 129                rcu_read_lock();
 130                eb = rcu_dereference(root->node);
 131
 132                /*
 133                 * RCU really hurts here, we could free up the root node because
 134                 * it was COWed but we may not get the new root node yet so do
 135                 * the inc_not_zero dance and if it doesn't work then
 136                 * synchronize_rcu and try again.
 137                 */
 138                if (atomic_inc_not_zero(&eb->refs)) {
 139                        rcu_read_unlock();
 140                        break;
 141                }
 142                rcu_read_unlock();
 143                synchronize_rcu();
 144        }
 145        return eb;
 146}
 147
 148/*
 149 * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
 150 * just get put onto a simple dirty list.  Transaction walks this list to make
 151 * sure they get properly updated on disk.
 152 */
 153static void add_root_to_dirty_list(struct btrfs_root *root)
 154{
 155        struct btrfs_fs_info *fs_info = root->fs_info;
 156
 157        if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
 158            !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
 159                return;
 160
 161        spin_lock(&fs_info->trans_lock);
 162        if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
 163                /* Want the extent tree to be the last on the list */
 164                if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
 165                        list_move_tail(&root->dirty_list,
 166                                       &fs_info->dirty_cowonly_roots);
 167                else
 168                        list_move(&root->dirty_list,
 169                                  &fs_info->dirty_cowonly_roots);
 170        }
 171        spin_unlock(&fs_info->trans_lock);
 172}
 173
 174/*
 175 * used by snapshot creation to make a copy of a root for a tree with
 176 * a given objectid.  The buffer with the new root node is returned in
 177 * cow_ret, and this func returns zero on success or a negative error code.
 178 */
 179int btrfs_copy_root(struct btrfs_trans_handle *trans,
 180                      struct btrfs_root *root,
 181                      struct extent_buffer *buf,
 182                      struct extent_buffer **cow_ret, u64 new_root_objectid)
 183{
 184        struct btrfs_fs_info *fs_info = root->fs_info;
 185        struct extent_buffer *cow;
 186        int ret = 0;
 187        int level;
 188        struct btrfs_disk_key disk_key;
 189
 190        WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
 191                trans->transid != fs_info->running_transaction->transid);
 192        WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
 193                trans->transid != root->last_trans);
 194
 195        level = btrfs_header_level(buf);
 196        if (level == 0)
 197                btrfs_item_key(buf, &disk_key, 0);
 198        else
 199                btrfs_node_key(buf, &disk_key, 0);
 200
 201        cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
 202                                     &disk_key, level, buf->start, 0,
 203                                     BTRFS_NESTING_NEW_ROOT);
 204        if (IS_ERR(cow))
 205                return PTR_ERR(cow);
 206
 207        copy_extent_buffer_full(cow, buf);
 208        btrfs_set_header_bytenr(cow, cow->start);
 209        btrfs_set_header_generation(cow, trans->transid);
 210        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
 211        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
 212                                     BTRFS_HEADER_FLAG_RELOC);
 213        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
 214                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
 215        else
 216                btrfs_set_header_owner(cow, new_root_objectid);
 217
 218        write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
 219
 220        WARN_ON(btrfs_header_generation(buf) > trans->transid);
 221        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
 222                ret = btrfs_inc_ref(trans, root, cow, 1);
 223        else
 224                ret = btrfs_inc_ref(trans, root, cow, 0);
 225        if (ret) {
 226                btrfs_tree_unlock(cow);
 227                free_extent_buffer(cow);
 228                btrfs_abort_transaction(trans, ret);
 229                return ret;
 230        }
 231
 232        btrfs_mark_buffer_dirty(cow);
 233        *cow_ret = cow;
 234        return 0;
 235}
 236
 237/*
 238 * check if the tree block can be shared by multiple trees
 239 */
 240int btrfs_block_can_be_shared(struct btrfs_root *root,
 241                              struct extent_buffer *buf)
 242{
 243        /*
 244         * Tree blocks not in shareable trees and tree roots are never shared.
 245         * If a block was allocated after the last snapshot and the block was
 246         * not allocated by tree relocation, we know the block is not shared.
 247         */
 248        if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
 249            buf != root->node && buf != root->commit_root &&
 250            (btrfs_header_generation(buf) <=
 251             btrfs_root_last_snapshot(&root->root_item) ||
 252             btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
 253                return 1;
 254
 255        return 0;
 256}
 257
 258static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 259                                       struct btrfs_root *root,
 260                                       struct extent_buffer *buf,
 261                                       struct extent_buffer *cow,
 262                                       int *last_ref)
 263{
 264        struct btrfs_fs_info *fs_info = root->fs_info;
 265        u64 refs;
 266        u64 owner;
 267        u64 flags;
 268        u64 new_flags = 0;
 269        int ret;
 270
 271        /*
 272         * Backrefs update rules:
 273         *
 274         * Always use full backrefs for extent pointers in tree block
 275         * allocated by tree relocation.
 276         *
 277         * If a shared tree block is no longer referenced by its owner
 278         * tree (btrfs_header_owner(buf) == root->root_key.objectid),
 279         * use full backrefs for extent pointers in tree block.
 280         *
 281         * If a tree block is been relocating
 282         * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
 283         * use full backrefs for extent pointers in tree block.
 284         * The reason for this is some operations (such as drop tree)
 285         * are only allowed for blocks use full backrefs.
 286         */
 287
 288        if (btrfs_block_can_be_shared(root, buf)) {
 289                ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
 290                                               btrfs_header_level(buf), 1,
 291                                               &refs, &flags);
 292                if (ret)
 293                        return ret;
 294                if (refs == 0) {
 295                        ret = -EROFS;
 296                        btrfs_handle_fs_error(fs_info, ret, NULL);
 297                        return ret;
 298                }
 299        } else {
 300                refs = 1;
 301                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
 302                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
 303                        flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
 304                else
 305                        flags = 0;
 306        }
 307
 308        owner = btrfs_header_owner(buf);
 309        BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
 310               !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
 311
 312        if (refs > 1) {
 313                if ((owner == root->root_key.objectid ||
 314                     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
 315                    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
 316                        ret = btrfs_inc_ref(trans, root, buf, 1);
 317                        if (ret)
 318                                return ret;
 319
 320                        if (root->root_key.objectid ==
 321                            BTRFS_TREE_RELOC_OBJECTID) {
 322                                ret = btrfs_dec_ref(trans, root, buf, 0);
 323                                if (ret)
 324                                        return ret;
 325                                ret = btrfs_inc_ref(trans, root, cow, 1);
 326                                if (ret)
 327                                        return ret;
 328                        }
 329                        new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
 330                } else {
 331
 332                        if (root->root_key.objectid ==
 333                            BTRFS_TREE_RELOC_OBJECTID)
 334                                ret = btrfs_inc_ref(trans, root, cow, 1);
 335                        else
 336                                ret = btrfs_inc_ref(trans, root, cow, 0);
 337                        if (ret)
 338                                return ret;
 339                }
 340                if (new_flags != 0) {
 341                        int level = btrfs_header_level(buf);
 342
 343                        ret = btrfs_set_disk_extent_flags(trans, buf,
 344                                                          new_flags, level, 0);
 345                        if (ret)
 346                                return ret;
 347                }
 348        } else {
 349                if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
 350                        if (root->root_key.objectid ==
 351                            BTRFS_TREE_RELOC_OBJECTID)
 352                                ret = btrfs_inc_ref(trans, root, cow, 1);
 353                        else
 354                                ret = btrfs_inc_ref(trans, root, cow, 0);
 355                        if (ret)
 356                                return ret;
 357                        ret = btrfs_dec_ref(trans, root, buf, 1);
 358                        if (ret)
 359                                return ret;
 360                }
 361                btrfs_clean_tree_block(buf);
 362                *last_ref = 1;
 363        }
 364        return 0;
 365}
 366
 367/*
 368 * does the dirty work in cow of a single block.  The parent block (if
 369 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 370 * dirty and returned locked.  If you modify the block it needs to be marked
 371 * dirty again.
 372 *
 373 * search_start -- an allocation hint for the new block
 374 *
 375 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 376 * bytes the allocator should try to find free next to the block it returns.
 377 * This is just a hint and may be ignored by the allocator.
 378 */
 379static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 380                             struct btrfs_root *root,
 381                             struct extent_buffer *buf,
 382                             struct extent_buffer *parent, int parent_slot,
 383                             struct extent_buffer **cow_ret,
 384                             u64 search_start, u64 empty_size,
 385                             enum btrfs_lock_nesting nest)
 386{
 387        struct btrfs_fs_info *fs_info = root->fs_info;
 388        struct btrfs_disk_key disk_key;
 389        struct extent_buffer *cow;
 390        int level, ret;
 391        int last_ref = 0;
 392        int unlock_orig = 0;
 393        u64 parent_start = 0;
 394
 395        if (*cow_ret == buf)
 396                unlock_orig = 1;
 397
 398        btrfs_assert_tree_locked(buf);
 399
 400        WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
 401                trans->transid != fs_info->running_transaction->transid);
 402        WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
 403                trans->transid != root->last_trans);
 404
 405        level = btrfs_header_level(buf);
 406
 407        if (level == 0)
 408                btrfs_item_key(buf, &disk_key, 0);
 409        else
 410                btrfs_node_key(buf, &disk_key, 0);
 411
 412        if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
 413                parent_start = parent->start;
 414
 415        cow = btrfs_alloc_tree_block(trans, root, parent_start,
 416                                     root->root_key.objectid, &disk_key, level,
 417                                     search_start, empty_size, nest);
 418        if (IS_ERR(cow))
 419                return PTR_ERR(cow);
 420
 421        /* cow is set to blocking by btrfs_init_new_buffer */
 422
 423        copy_extent_buffer_full(cow, buf);
 424        btrfs_set_header_bytenr(cow, cow->start);
 425        btrfs_set_header_generation(cow, trans->transid);
 426        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
 427        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
 428                                     BTRFS_HEADER_FLAG_RELOC);
 429        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
 430                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
 431        else
 432                btrfs_set_header_owner(cow, root->root_key.objectid);
 433
 434        write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
 435
 436        ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
 437        if (ret) {
 438                btrfs_tree_unlock(cow);
 439                free_extent_buffer(cow);
 440                btrfs_abort_transaction(trans, ret);
 441                return ret;
 442        }
 443
 444        if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
 445                ret = btrfs_reloc_cow_block(trans, root, buf, cow);
 446                if (ret) {
 447                        btrfs_tree_unlock(cow);
 448                        free_extent_buffer(cow);
 449                        btrfs_abort_transaction(trans, ret);
 450                        return ret;
 451                }
 452        }
 453
 454        if (buf == root->node) {
 455                WARN_ON(parent && parent != buf);
 456                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
 457                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
 458                        parent_start = buf->start;
 459
 460                atomic_inc(&cow->refs);
 461                ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
 462                BUG_ON(ret < 0);
 463                rcu_assign_pointer(root->node, cow);
 464
 465                btrfs_free_tree_block(trans, root, buf, parent_start,
 466                                      last_ref);
 467                free_extent_buffer(buf);
 468                add_root_to_dirty_list(root);
 469        } else {
 470                WARN_ON(trans->transid != btrfs_header_generation(parent));
 471                btrfs_tree_mod_log_insert_key(parent, parent_slot,
 472                                              BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
 473                btrfs_set_node_blockptr(parent, parent_slot,
 474                                        cow->start);
 475                btrfs_set_node_ptr_generation(parent, parent_slot,
 476                                              trans->transid);
 477                btrfs_mark_buffer_dirty(parent);
 478                if (last_ref) {
 479                        ret = btrfs_tree_mod_log_free_eb(buf);
 480                        if (ret) {
 481                                btrfs_tree_unlock(cow);
 482                                free_extent_buffer(cow);
 483                                btrfs_abort_transaction(trans, ret);
 484                                return ret;
 485                        }
 486                }
 487                btrfs_free_tree_block(trans, root, buf, parent_start,
 488                                      last_ref);
 489        }
 490        if (unlock_orig)
 491                btrfs_tree_unlock(buf);
 492        free_extent_buffer_stale(buf);
 493        btrfs_mark_buffer_dirty(cow);
 494        *cow_ret = cow;
 495        return 0;
 496}
 497
 498static inline int should_cow_block(struct btrfs_trans_handle *trans,
 499                                   struct btrfs_root *root,
 500                                   struct extent_buffer *buf)
 501{
 502        if (btrfs_is_testing(root->fs_info))
 503                return 0;
 504
 505        /* Ensure we can see the FORCE_COW bit */
 506        smp_mb__before_atomic();
 507
 508        /*
 509         * We do not need to cow a block if
 510         * 1) this block is not created or changed in this transaction;
 511         * 2) this block does not belong to TREE_RELOC tree;
 512         * 3) the root is not forced COW.
 513         *
 514         * What is forced COW:
 515         *    when we create snapshot during committing the transaction,
 516         *    after we've finished copying src root, we must COW the shared
 517         *    block to ensure the metadata consistency.
 518         */
 519        if (btrfs_header_generation(buf) == trans->transid &&
 520            !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
 521            !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
 522              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
 523            !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
 524                return 0;
 525        return 1;
 526}
 527
 528/*
 529 * cows a single block, see __btrfs_cow_block for the real work.
 530 * This version of it has extra checks so that a block isn't COWed more than
 531 * once per transaction, as long as it hasn't been written yet
 532 */
 533noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 534                    struct btrfs_root *root, struct extent_buffer *buf,
 535                    struct extent_buffer *parent, int parent_slot,
 536                    struct extent_buffer **cow_ret,
 537                    enum btrfs_lock_nesting nest)
 538{
 539        struct btrfs_fs_info *fs_info = root->fs_info;
 540        u64 search_start;
 541        int ret;
 542
 543        if (test_bit(BTRFS_ROOT_DELETING, &root->state))
 544                btrfs_err(fs_info,
 545                        "COW'ing blocks on a fs root that's being dropped");
 546
 547        if (trans->transaction != fs_info->running_transaction)
 548                WARN(1, KERN_CRIT "trans %llu running %llu\n",
 549                       trans->transid,
 550                       fs_info->running_transaction->transid);
 551
 552        if (trans->transid != fs_info->generation)
 553                WARN(1, KERN_CRIT "trans %llu running %llu\n",
 554                       trans->transid, fs_info->generation);
 555
 556        if (!should_cow_block(trans, root, buf)) {
 557                *cow_ret = buf;
 558                return 0;
 559        }
 560
 561        search_start = buf->start & ~((u64)SZ_1G - 1);
 562
 563        /*
 564         * Before CoWing this block for later modification, check if it's
 565         * the subtree root and do the delayed subtree trace if needed.
 566         *
 567         * Also We don't care about the error, as it's handled internally.
 568         */
 569        btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
 570        ret = __btrfs_cow_block(trans, root, buf, parent,
 571                                 parent_slot, cow_ret, search_start, 0, nest);
 572
 573        trace_btrfs_cow_block(root, buf, *cow_ret);
 574
 575        return ret;
 576}
 577ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
 578
 579/*
 580 * helper function for defrag to decide if two blocks pointed to by a
 581 * node are actually close by
 582 */
 583static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
 584{
 585        if (blocknr < other && other - (blocknr + blocksize) < 32768)
 586                return 1;
 587        if (blocknr > other && blocknr - (other + blocksize) < 32768)
 588                return 1;
 589        return 0;
 590}
 591
 592#ifdef __LITTLE_ENDIAN
 593
 594/*
 595 * Compare two keys, on little-endian the disk order is same as CPU order and
 596 * we can avoid the conversion.
 597 */
 598static int comp_keys(const struct btrfs_disk_key *disk_key,
 599                     const struct btrfs_key *k2)
 600{
 601        const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
 602
 603        return btrfs_comp_cpu_keys(k1, k2);
 604}
 605
 606#else
 607
 608/*
 609 * compare two keys in a memcmp fashion
 610 */
 611static int comp_keys(const struct btrfs_disk_key *disk,
 612                     const struct btrfs_key *k2)
 613{
 614        struct btrfs_key k1;
 615
 616        btrfs_disk_key_to_cpu(&k1, disk);
 617
 618        return btrfs_comp_cpu_keys(&k1, k2);
 619}
 620#endif
 621
 622/*
 623 * same as comp_keys only with two btrfs_key's
 624 */
 625int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
 626{
 627        if (k1->objectid > k2->objectid)
 628                return 1;
 629        if (k1->objectid < k2->objectid)
 630                return -1;
 631        if (k1->type > k2->type)
 632                return 1;
 633        if (k1->type < k2->type)
 634                return -1;
 635        if (k1->offset > k2->offset)
 636                return 1;
 637        if (k1->offset < k2->offset)
 638                return -1;
 639        return 0;
 640}
 641
 642/*
 643 * this is used by the defrag code to go through all the
 644 * leaves pointed to by a node and reallocate them so that
 645 * disk order is close to key order
 646 */
 647int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 648                       struct btrfs_root *root, struct extent_buffer *parent,
 649                       int start_slot, u64 *last_ret,
 650                       struct btrfs_key *progress)
 651{
 652        struct btrfs_fs_info *fs_info = root->fs_info;
 653        struct extent_buffer *cur;
 654        u64 blocknr;
 655        u64 search_start = *last_ret;
 656        u64 last_block = 0;
 657        u64 other;
 658        u32 parent_nritems;
 659        int end_slot;
 660        int i;
 661        int err = 0;
 662        u32 blocksize;
 663        int progress_passed = 0;
 664        struct btrfs_disk_key disk_key;
 665
 666        WARN_ON(trans->transaction != fs_info->running_transaction);
 667        WARN_ON(trans->transid != fs_info->generation);
 668
 669        parent_nritems = btrfs_header_nritems(parent);
 670        blocksize = fs_info->nodesize;
 671        end_slot = parent_nritems - 1;
 672
 673        if (parent_nritems <= 1)
 674                return 0;
 675
 676        for (i = start_slot; i <= end_slot; i++) {
 677                int close = 1;
 678
 679                btrfs_node_key(parent, &disk_key, i);
 680                if (!progress_passed && comp_keys(&disk_key, progress) < 0)
 681                        continue;
 682
 683                progress_passed = 1;
 684                blocknr = btrfs_node_blockptr(parent, i);
 685                if (last_block == 0)
 686                        last_block = blocknr;
 687
 688                if (i > 0) {
 689                        other = btrfs_node_blockptr(parent, i - 1);
 690                        close = close_blocks(blocknr, other, blocksize);
 691                }
 692                if (!close && i < end_slot) {
 693                        other = btrfs_node_blockptr(parent, i + 1);
 694                        close = close_blocks(blocknr, other, blocksize);
 695                }
 696                if (close) {
 697                        last_block = blocknr;
 698                        continue;
 699                }
 700
 701                cur = btrfs_read_node_slot(parent, i);
 702                if (IS_ERR(cur))
 703                        return PTR_ERR(cur);
 704                if (search_start == 0)
 705                        search_start = last_block;
 706
 707                btrfs_tree_lock(cur);
 708                err = __btrfs_cow_block(trans, root, cur, parent, i,
 709                                        &cur, search_start,
 710                                        min(16 * blocksize,
 711                                            (end_slot - i) * blocksize),
 712                                        BTRFS_NESTING_COW);
 713                if (err) {
 714                        btrfs_tree_unlock(cur);
 715                        free_extent_buffer(cur);
 716                        break;
 717                }
 718                search_start = cur->start;
 719                last_block = cur->start;
 720                *last_ret = search_start;
 721                btrfs_tree_unlock(cur);
 722                free_extent_buffer(cur);
 723        }
 724        return err;
 725}
 726
 727/*
 728 * search for key in the extent_buffer.  The items start at offset p,
 729 * and they are item_size apart.  There are 'max' items in p.
 730 *
 731 * the slot in the array is returned via slot, and it points to
 732 * the place where you would insert key if it is not found in
 733 * the array.
 734 *
 735 * slot may point to max if the key is bigger than all of the keys
 736 */
 737static noinline int generic_bin_search(struct extent_buffer *eb,
 738                                       unsigned long p, int item_size,
 739                                       const struct btrfs_key *key,
 740                                       int max, int *slot)
 741{
 742        int low = 0;
 743        int high = max;
 744        int ret;
 745        const int key_size = sizeof(struct btrfs_disk_key);
 746
 747        if (low > high) {
 748                btrfs_err(eb->fs_info,
 749                 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
 750                          __func__, low, high, eb->start,
 751                          btrfs_header_owner(eb), btrfs_header_level(eb));
 752                return -EINVAL;
 753        }
 754
 755        while (low < high) {
 756                unsigned long oip;
 757                unsigned long offset;
 758                struct btrfs_disk_key *tmp;
 759                struct btrfs_disk_key unaligned;
 760                int mid;
 761
 762                mid = (low + high) / 2;
 763                offset = p + mid * item_size;
 764                oip = offset_in_page(offset);
 765
 766                if (oip + key_size <= PAGE_SIZE) {
 767                        const unsigned long idx = get_eb_page_index(offset);
 768                        char *kaddr = page_address(eb->pages[idx]);
 769
 770                        oip = get_eb_offset_in_page(eb, offset);
 771                        tmp = (struct btrfs_disk_key *)(kaddr + oip);
 772                } else {
 773                        read_extent_buffer(eb, &unaligned, offset, key_size);
 774                        tmp = &unaligned;
 775                }
 776
 777                ret = comp_keys(tmp, key);
 778
 779                if (ret < 0)
 780                        low = mid + 1;
 781                else if (ret > 0)
 782                        high = mid;
 783                else {
 784                        *slot = mid;
 785                        return 0;
 786                }
 787        }
 788        *slot = low;
 789        return 1;
 790}
 791
 792/*
 793 * simple bin_search frontend that does the right thing for
 794 * leaves vs nodes
 795 */
 796int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
 797                     int *slot)
 798{
 799        if (btrfs_header_level(eb) == 0)
 800                return generic_bin_search(eb,
 801                                          offsetof(struct btrfs_leaf, items),
 802                                          sizeof(struct btrfs_item),
 803                                          key, btrfs_header_nritems(eb),
 804                                          slot);
 805        else
 806                return generic_bin_search(eb,
 807                                          offsetof(struct btrfs_node, ptrs),
 808                                          sizeof(struct btrfs_key_ptr),
 809                                          key, btrfs_header_nritems(eb),
 810                                          slot);
 811}
 812
 813static void root_add_used(struct btrfs_root *root, u32 size)
 814{
 815        spin_lock(&root->accounting_lock);
 816        btrfs_set_root_used(&root->root_item,
 817                            btrfs_root_used(&root->root_item) + size);
 818        spin_unlock(&root->accounting_lock);
 819}
 820
 821static void root_sub_used(struct btrfs_root *root, u32 size)
 822{
 823        spin_lock(&root->accounting_lock);
 824        btrfs_set_root_used(&root->root_item,
 825                            btrfs_root_used(&root->root_item) - size);
 826        spin_unlock(&root->accounting_lock);
 827}
 828
 829/* given a node and slot number, this reads the blocks it points to.  The
 830 * extent buffer is returned with a reference taken (but unlocked).
 831 */
 832struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
 833                                           int slot)
 834{
 835        int level = btrfs_header_level(parent);
 836        struct extent_buffer *eb;
 837        struct btrfs_key first_key;
 838
 839        if (slot < 0 || slot >= btrfs_header_nritems(parent))
 840                return ERR_PTR(-ENOENT);
 841
 842        BUG_ON(level == 0);
 843
 844        btrfs_node_key_to_cpu(parent, &first_key, slot);
 845        eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
 846                             btrfs_header_owner(parent),
 847                             btrfs_node_ptr_generation(parent, slot),
 848                             level - 1, &first_key);
 849        if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
 850                free_extent_buffer(eb);
 851                eb = ERR_PTR(-EIO);
 852        }
 853
 854        return eb;
 855}
 856
 857/*
 858 * node level balancing, used to make sure nodes are in proper order for
 859 * item deletion.  We balance from the top down, so we have to make sure
 860 * that a deletion won't leave an node completely empty later on.
 861 */
 862static noinline int balance_level(struct btrfs_trans_handle *trans,
 863                         struct btrfs_root *root,
 864                         struct btrfs_path *path, int level)
 865{
 866        struct btrfs_fs_info *fs_info = root->fs_info;
 867        struct extent_buffer *right = NULL;
 868        struct extent_buffer *mid;
 869        struct extent_buffer *left = NULL;
 870        struct extent_buffer *parent = NULL;
 871        int ret = 0;
 872        int wret;
 873        int pslot;
 874        int orig_slot = path->slots[level];
 875        u64 orig_ptr;
 876
 877        ASSERT(level > 0);
 878
 879        mid = path->nodes[level];
 880
 881        WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
 882        WARN_ON(btrfs_header_generation(mid) != trans->transid);
 883
 884        orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 885
 886        if (level < BTRFS_MAX_LEVEL - 1) {
 887                parent = path->nodes[level + 1];
 888                pslot = path->slots[level + 1];
 889        }
 890
 891        /*
 892         * deal with the case where there is only one pointer in the root
 893         * by promoting the node below to a root
 894         */
 895        if (!parent) {
 896                struct extent_buffer *child;
 897
 898                if (btrfs_header_nritems(mid) != 1)
 899                        return 0;
 900
 901                /* promote the child to a root */
 902                child = btrfs_read_node_slot(mid, 0);
 903                if (IS_ERR(child)) {
 904                        ret = PTR_ERR(child);
 905                        btrfs_handle_fs_error(fs_info, ret, NULL);
 906                        goto enospc;
 907                }
 908
 909                btrfs_tree_lock(child);
 910                ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
 911                                      BTRFS_NESTING_COW);
 912                if (ret) {
 913                        btrfs_tree_unlock(child);
 914                        free_extent_buffer(child);
 915                        goto enospc;
 916                }
 917
 918                ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
 919                BUG_ON(ret < 0);
 920                rcu_assign_pointer(root->node, child);
 921
 922                add_root_to_dirty_list(root);
 923                btrfs_tree_unlock(child);
 924
 925                path->locks[level] = 0;
 926                path->nodes[level] = NULL;
 927                btrfs_clean_tree_block(mid);
 928                btrfs_tree_unlock(mid);
 929                /* once for the path */
 930                free_extent_buffer(mid);
 931
 932                root_sub_used(root, mid->len);
 933                btrfs_free_tree_block(trans, root, mid, 0, 1);
 934                /* once for the root ptr */
 935                free_extent_buffer_stale(mid);
 936                return 0;
 937        }
 938        if (btrfs_header_nritems(mid) >
 939            BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
 940                return 0;
 941
 942        left = btrfs_read_node_slot(parent, pslot - 1);
 943        if (IS_ERR(left))
 944                left = NULL;
 945
 946        if (left) {
 947                __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
 948                wret = btrfs_cow_block(trans, root, left,
 949                                       parent, pslot - 1, &left,
 950                                       BTRFS_NESTING_LEFT_COW);
 951                if (wret) {
 952                        ret = wret;
 953                        goto enospc;
 954                }
 955        }
 956
 957        right = btrfs_read_node_slot(parent, pslot + 1);
 958        if (IS_ERR(right))
 959                right = NULL;
 960
 961        if (right) {
 962                __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
 963                wret = btrfs_cow_block(trans, root, right,
 964                                       parent, pslot + 1, &right,
 965                                       BTRFS_NESTING_RIGHT_COW);
 966                if (wret) {
 967                        ret = wret;
 968                        goto enospc;
 969                }
 970        }
 971
 972        /* first, try to make some room in the middle buffer */
 973        if (left) {
 974                orig_slot += btrfs_header_nritems(left);
 975                wret = push_node_left(trans, left, mid, 1);
 976                if (wret < 0)
 977                        ret = wret;
 978        }
 979
 980        /*
 981         * then try to empty the right most buffer into the middle
 982         */
 983        if (right) {
 984                wret = push_node_left(trans, mid, right, 1);
 985                if (wret < 0 && wret != -ENOSPC)
 986                        ret = wret;
 987                if (btrfs_header_nritems(right) == 0) {
 988                        btrfs_clean_tree_block(right);
 989                        btrfs_tree_unlock(right);
 990                        del_ptr(root, path, level + 1, pslot + 1);
 991                        root_sub_used(root, right->len);
 992                        btrfs_free_tree_block(trans, root, right, 0, 1);
 993                        free_extent_buffer_stale(right);
 994                        right = NULL;
 995                } else {
 996                        struct btrfs_disk_key right_key;
 997                        btrfs_node_key(right, &right_key, 0);
 998                        ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
 999                                        BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1000                        BUG_ON(ret < 0);
1001                        btrfs_set_node_key(parent, &right_key, pslot + 1);
1002                        btrfs_mark_buffer_dirty(parent);
1003                }
1004        }
1005        if (btrfs_header_nritems(mid) == 1) {
1006                /*
1007                 * we're not allowed to leave a node with one item in the
1008                 * tree during a delete.  A deletion from lower in the tree
1009                 * could try to delete the only pointer in this node.
1010                 * So, pull some keys from the left.
1011                 * There has to be a left pointer at this point because
1012                 * otherwise we would have pulled some pointers from the
1013                 * right
1014                 */
1015                if (!left) {
1016                        ret = -EROFS;
1017                        btrfs_handle_fs_error(fs_info, ret, NULL);
1018                        goto enospc;
1019                }
1020                wret = balance_node_right(trans, mid, left);
1021                if (wret < 0) {
1022                        ret = wret;
1023                        goto enospc;
1024                }
1025                if (wret == 1) {
1026                        wret = push_node_left(trans, left, mid, 1);
1027                        if (wret < 0)
1028                                ret = wret;
1029                }
1030                BUG_ON(wret == 1);
1031        }
1032        if (btrfs_header_nritems(mid) == 0) {
1033                btrfs_clean_tree_block(mid);
1034                btrfs_tree_unlock(mid);
1035                del_ptr(root, path, level + 1, pslot);
1036                root_sub_used(root, mid->len);
1037                btrfs_free_tree_block(trans, root, mid, 0, 1);
1038                free_extent_buffer_stale(mid);
1039                mid = NULL;
1040        } else {
1041                /* update the parent key to reflect our changes */
1042                struct btrfs_disk_key mid_key;
1043                btrfs_node_key(mid, &mid_key, 0);
1044                ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1045                                BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1046                BUG_ON(ret < 0);
1047                btrfs_set_node_key(parent, &mid_key, pslot);
1048                btrfs_mark_buffer_dirty(parent);
1049        }
1050
1051        /* update the path */
1052        if (left) {
1053                if (btrfs_header_nritems(left) > orig_slot) {
1054                        atomic_inc(&left->refs);
1055                        /* left was locked after cow */
1056                        path->nodes[level] = left;
1057                        path->slots[level + 1] -= 1;
1058                        path->slots[level] = orig_slot;
1059                        if (mid) {
1060                                btrfs_tree_unlock(mid);
1061                                free_extent_buffer(mid);
1062                        }
1063                } else {
1064                        orig_slot -= btrfs_header_nritems(left);
1065                        path->slots[level] = orig_slot;
1066                }
1067        }
1068        /* double check we haven't messed things up */
1069        if (orig_ptr !=
1070            btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1071                BUG();
1072enospc:
1073        if (right) {
1074                btrfs_tree_unlock(right);
1075                free_extent_buffer(right);
1076        }
1077        if (left) {
1078                if (path->nodes[level] != left)
1079                        btrfs_tree_unlock(left);
1080                free_extent_buffer(left);
1081        }
1082        return ret;
1083}
1084
1085/* Node balancing for insertion.  Here we only split or push nodes around
1086 * when they are completely full.  This is also done top down, so we
1087 * have to be pessimistic.
1088 */
1089static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1090                                          struct btrfs_root *root,
1091                                          struct btrfs_path *path, int level)
1092{
1093        struct btrfs_fs_info *fs_info = root->fs_info;
1094        struct extent_buffer *right = NULL;
1095        struct extent_buffer *mid;
1096        struct extent_buffer *left = NULL;
1097        struct extent_buffer *parent = NULL;
1098        int ret = 0;
1099        int wret;
1100        int pslot;
1101        int orig_slot = path->slots[level];
1102
1103        if (level == 0)
1104                return 1;
1105
1106        mid = path->nodes[level];
1107        WARN_ON(btrfs_header_generation(mid) != trans->transid);
1108
1109        if (level < BTRFS_MAX_LEVEL - 1) {
1110                parent = path->nodes[level + 1];
1111                pslot = path->slots[level + 1];
1112        }
1113
1114        if (!parent)
1115                return 1;
1116
1117        left = btrfs_read_node_slot(parent, pslot - 1);
1118        if (IS_ERR(left))
1119                left = NULL;
1120
1121        /* first, try to make some room in the middle buffer */
1122        if (left) {
1123                u32 left_nr;
1124
1125                __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1126
1127                left_nr = btrfs_header_nritems(left);
1128                if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
1129                        wret = 1;
1130                } else {
1131                        ret = btrfs_cow_block(trans, root, left, parent,
1132                                              pslot - 1, &left,
1133                                              BTRFS_NESTING_LEFT_COW);
1134                        if (ret)
1135                                wret = 1;
1136                        else {
1137                                wret = push_node_left(trans, left, mid, 0);
1138                        }
1139                }
1140                if (wret < 0)
1141                        ret = wret;
1142                if (wret == 0) {
1143                        struct btrfs_disk_key disk_key;
1144                        orig_slot += left_nr;
1145                        btrfs_node_key(mid, &disk_key, 0);
1146                        ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1147                                        BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1148                        BUG_ON(ret < 0);
1149                        btrfs_set_node_key(parent, &disk_key, pslot);
1150                        btrfs_mark_buffer_dirty(parent);
1151                        if (btrfs_header_nritems(left) > orig_slot) {
1152                                path->nodes[level] = left;
1153                                path->slots[level + 1] -= 1;
1154                                path->slots[level] = orig_slot;
1155                                btrfs_tree_unlock(mid);
1156                                free_extent_buffer(mid);
1157                        } else {
1158                                orig_slot -=
1159                                        btrfs_header_nritems(left);
1160                                path->slots[level] = orig_slot;
1161                                btrfs_tree_unlock(left);
1162                                free_extent_buffer(left);
1163                        }
1164                        return 0;
1165                }
1166                btrfs_tree_unlock(left);
1167                free_extent_buffer(left);
1168        }
1169        right = btrfs_read_node_slot(parent, pslot + 1);
1170        if (IS_ERR(right))
1171                right = NULL;
1172
1173        /*
1174         * then try to empty the right most buffer into the middle
1175         */
1176        if (right) {
1177                u32 right_nr;
1178
1179                __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1180
1181                right_nr = btrfs_header_nritems(right);
1182                if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
1183                        wret = 1;
1184                } else {
1185                        ret = btrfs_cow_block(trans, root, right,
1186                                              parent, pslot + 1,
1187                                              &right, BTRFS_NESTING_RIGHT_COW);
1188                        if (ret)
1189                                wret = 1;
1190                        else {
1191                                wret = balance_node_right(trans, right, mid);
1192                        }
1193                }
1194                if (wret < 0)
1195                        ret = wret;
1196                if (wret == 0) {
1197                        struct btrfs_disk_key disk_key;
1198
1199                        btrfs_node_key(right, &disk_key, 0);
1200                        ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1201                                        BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
1202                        BUG_ON(ret < 0);
1203                        btrfs_set_node_key(parent, &disk_key, pslot + 1);
1204                        btrfs_mark_buffer_dirty(parent);
1205
1206                        if (btrfs_header_nritems(mid) <= orig_slot) {
1207                                path->nodes[level] = right;
1208                                path->slots[level + 1] += 1;
1209                                path->slots[level] = orig_slot -
1210                                        btrfs_header_nritems(mid);
1211                                btrfs_tree_unlock(mid);
1212                                free_extent_buffer(mid);
1213                        } else {
1214                                btrfs_tree_unlock(right);
1215                                free_extent_buffer(right);
1216                        }
1217                        return 0;
1218                }
1219                btrfs_tree_unlock(right);
1220                free_extent_buffer(right);
1221        }
1222        return 1;
1223}
1224
1225/*
1226 * readahead one full node of leaves, finding things that are close
1227 * to the block in 'slot', and triggering ra on them.
1228 */
1229static void reada_for_search(struct btrfs_fs_info *fs_info,
1230                             struct btrfs_path *path,
1231                             int level, int slot, u64 objectid)
1232{
1233        struct extent_buffer *node;
1234        struct btrfs_disk_key disk_key;
1235        u32 nritems;
1236        u64 search;
1237        u64 target;
1238        u64 nread = 0;
1239        u64 nread_max;
1240        struct extent_buffer *eb;
1241        u32 nr;
1242        u32 blocksize;
1243        u32 nscan = 0;
1244
1245        if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
1246                return;
1247
1248        if (!path->nodes[level])
1249                return;
1250
1251        node = path->nodes[level];
1252
1253        /*
1254         * Since the time between visiting leaves is much shorter than the time
1255         * between visiting nodes, limit read ahead of nodes to 1, to avoid too
1256         * much IO at once (possibly random).
1257         */
1258        if (path->reada == READA_FORWARD_ALWAYS) {
1259                if (level > 1)
1260                        nread_max = node->fs_info->nodesize;
1261                else
1262                        nread_max = SZ_128K;
1263        } else {
1264                nread_max = SZ_64K;
1265        }
1266
1267        search = btrfs_node_blockptr(node, slot);
1268        blocksize = fs_info->nodesize;
1269        eb = find_extent_buffer(fs_info, search);
1270        if (eb) {
1271                free_extent_buffer(eb);
1272                return;
1273        }
1274
1275        target = search;
1276
1277        nritems = btrfs_header_nritems(node);
1278        nr = slot;
1279
1280        while (1) {
1281                if (path->reada == READA_BACK) {
1282                        if (nr == 0)
1283                                break;
1284                        nr--;
1285                } else if (path->reada == READA_FORWARD ||
1286                           path->reada == READA_FORWARD_ALWAYS) {
1287                        nr++;
1288                        if (nr >= nritems)
1289                                break;
1290                }
1291                if (path->reada == READA_BACK && objectid) {
1292                        btrfs_node_key(node, &disk_key, nr);
1293                        if (btrfs_disk_key_objectid(&disk_key) != objectid)
1294                                break;
1295                }
1296                search = btrfs_node_blockptr(node, nr);
1297                if (path->reada == READA_FORWARD_ALWAYS ||
1298                    (search <= target && target - search <= 65536) ||
1299                    (search > target && search - target <= 65536)) {
1300                        btrfs_readahead_node_child(node, nr);
1301                        nread += blocksize;
1302                }
1303                nscan++;
1304                if (nread > nread_max || nscan > 32)
1305                        break;
1306        }
1307}
1308
1309static noinline void reada_for_balance(struct btrfs_path *path, int level)
1310{
1311        struct extent_buffer *parent;
1312        int slot;
1313        int nritems;
1314
1315        parent = path->nodes[level + 1];
1316        if (!parent)
1317                return;
1318
1319        nritems = btrfs_header_nritems(parent);
1320        slot = path->slots[level + 1];
1321
1322        if (slot > 0)
1323                btrfs_readahead_node_child(parent, slot - 1);
1324        if (slot + 1 < nritems)
1325                btrfs_readahead_node_child(parent, slot + 1);
1326}
1327
1328
1329/*
1330 * when we walk down the tree, it is usually safe to unlock the higher layers
1331 * in the tree.  The exceptions are when our path goes through slot 0, because
1332 * operations on the tree might require changing key pointers higher up in the
1333 * tree.
1334 *
1335 * callers might also have set path->keep_locks, which tells this code to keep
1336 * the lock if the path points to the last slot in the block.  This is part of
1337 * walking through the tree, and selecting the next slot in the higher block.
1338 *
1339 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
1340 * if lowest_unlock is 1, level 0 won't be unlocked
1341 */
1342static noinline void unlock_up(struct btrfs_path *path, int level,
1343                               int lowest_unlock, int min_write_lock_level,
1344                               int *write_lock_level)
1345{
1346        int i;
1347        int skip_level = level;
1348        int no_skips = 0;
1349        struct extent_buffer *t;
1350
1351        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1352                if (!path->nodes[i])
1353                        break;
1354                if (!path->locks[i])
1355                        break;
1356                if (!no_skips && path->slots[i] == 0) {
1357                        skip_level = i + 1;
1358                        continue;
1359                }
1360                if (!no_skips && path->keep_locks) {
1361                        u32 nritems;
1362                        t = path->nodes[i];
1363                        nritems = btrfs_header_nritems(t);
1364                        if (nritems < 1 || path->slots[i] >= nritems - 1) {
1365                                skip_level = i + 1;
1366                                continue;
1367                        }
1368                }
1369                if (skip_level < i && i >= lowest_unlock)
1370                        no_skips = 1;
1371
1372                t = path->nodes[i];
1373                if (i >= lowest_unlock && i > skip_level) {
1374                        btrfs_tree_unlock_rw(t, path->locks[i]);
1375                        path->locks[i] = 0;
1376                        if (write_lock_level &&
1377                            i > min_write_lock_level &&
1378                            i <= *write_lock_level) {
1379                                *write_lock_level = i - 1;
1380                        }
1381                }
1382        }
1383}
1384
1385/*
1386 * helper function for btrfs_search_slot.  The goal is to find a block
1387 * in cache without setting the path to blocking.  If we find the block
1388 * we return zero and the path is unchanged.
1389 *
1390 * If we can't find the block, we set the path blocking and do some
1391 * reada.  -EAGAIN is returned and the search must be repeated.
1392 */
1393static int
1394read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
1395                      struct extent_buffer **eb_ret, int level, int slot,
1396                      const struct btrfs_key *key)
1397{
1398        struct btrfs_fs_info *fs_info = root->fs_info;
1399        u64 blocknr;
1400        u64 gen;
1401        struct extent_buffer *tmp;
1402        struct btrfs_key first_key;
1403        int ret;
1404        int parent_level;
1405
1406        blocknr = btrfs_node_blockptr(*eb_ret, slot);
1407        gen = btrfs_node_ptr_generation(*eb_ret, slot);
1408        parent_level = btrfs_header_level(*eb_ret);
1409        btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
1410
1411        tmp = find_extent_buffer(fs_info, blocknr);
1412        if (tmp) {
1413                if (p->reada == READA_FORWARD_ALWAYS)
1414                        reada_for_search(fs_info, p, level, slot, key->objectid);
1415
1416                /* first we do an atomic uptodate check */
1417                if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
1418                        /*
1419                         * Do extra check for first_key, eb can be stale due to
1420                         * being cached, read from scrub, or have multiple
1421                         * parents (shared tree blocks).
1422                         */
1423                        if (btrfs_verify_level_key(tmp,
1424                                        parent_level - 1, &first_key, gen)) {
1425                                free_extent_buffer(tmp);
1426                                return -EUCLEAN;
1427                        }
1428                        *eb_ret = tmp;
1429                        return 0;
1430                }
1431
1432                /* now we're allowed to do a blocking uptodate check */
1433                ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
1434                if (!ret) {
1435                        *eb_ret = tmp;
1436                        return 0;
1437                }
1438                free_extent_buffer(tmp);
1439                btrfs_release_path(p);
1440                return -EIO;
1441        }
1442
1443        /*
1444         * reduce lock contention at high levels
1445         * of the btree by dropping locks before
1446         * we read.  Don't release the lock on the current
1447         * level because we need to walk this node to figure
1448         * out which blocks to read.
1449         */
1450        btrfs_unlock_up_safe(p, level + 1);
1451
1452        if (p->reada != READA_NONE)
1453                reada_for_search(fs_info, p, level, slot, key->objectid);
1454
1455        ret = -EAGAIN;
1456        tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
1457                              gen, parent_level - 1, &first_key);
1458        if (!IS_ERR(tmp)) {
1459                /*
1460                 * If the read above didn't mark this buffer up to date,
1461                 * it will never end up being up to date.  Set ret to EIO now
1462                 * and give up so that our caller doesn't loop forever
1463                 * on our EAGAINs.
1464                 */
1465                if (!extent_buffer_uptodate(tmp))
1466                        ret = -EIO;
1467                free_extent_buffer(tmp);
1468        } else {
1469                ret = PTR_ERR(tmp);
1470        }
1471
1472        btrfs_release_path(p);
1473        return ret;
1474}
1475
1476/*
1477 * helper function for btrfs_search_slot.  This does all of the checks
1478 * for node-level blocks and does any balancing required based on
1479 * the ins_len.
1480 *
1481 * If no extra work was required, zero is returned.  If we had to
1482 * drop the path, -EAGAIN is returned and btrfs_search_slot must
1483 * start over
1484 */
1485static int
1486setup_nodes_for_search(struct btrfs_trans_handle *trans,
1487                       struct btrfs_root *root, struct btrfs_path *p,
1488                       struct extent_buffer *b, int level, int ins_len,
1489                       int *write_lock_level)
1490{
1491        struct btrfs_fs_info *fs_info = root->fs_info;
1492        int ret = 0;
1493
1494        if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1495            BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
1496
1497                if (*write_lock_level < level + 1) {
1498                        *write_lock_level = level + 1;
1499                        btrfs_release_path(p);
1500                        return -EAGAIN;
1501                }
1502
1503                reada_for_balance(p, level);
1504                ret = split_node(trans, root, p, level);
1505
1506                b = p->nodes[level];
1507        } else if (ins_len < 0 && btrfs_header_nritems(b) <
1508                   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
1509
1510                if (*write_lock_level < level + 1) {
1511                        *write_lock_level = level + 1;
1512                        btrfs_release_path(p);
1513                        return -EAGAIN;
1514                }
1515
1516                reada_for_balance(p, level);
1517                ret = balance_level(trans, root, p, level);
1518                if (ret)
1519                        return ret;
1520
1521                b = p->nodes[level];
1522                if (!b) {
1523                        btrfs_release_path(p);
1524                        return -EAGAIN;
1525                }
1526                BUG_ON(btrfs_header_nritems(b) == 1);
1527        }
1528        return ret;
1529}
1530
1531int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
1532                u64 iobjectid, u64 ioff, u8 key_type,
1533                struct btrfs_key *found_key)
1534{
1535        int ret;
1536        struct btrfs_key key;
1537        struct extent_buffer *eb;
1538
1539        ASSERT(path);
1540        ASSERT(found_key);
1541
1542        key.type = key_type;
1543        key.objectid = iobjectid;
1544        key.offset = ioff;
1545
1546        ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
1547        if (ret < 0)
1548                return ret;
1549
1550        eb = path->nodes[0];
1551        if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
1552                ret = btrfs_next_leaf(fs_root, path);
1553                if (ret)
1554                        return ret;
1555                eb = path->nodes[0];
1556        }
1557
1558        btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
1559        if (found_key->type != key.type ||
1560                        found_key->objectid != key.objectid)
1561                return 1;
1562
1563        return 0;
1564}
1565
1566static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
1567                                                        struct btrfs_path *p,
1568                                                        int write_lock_level)
1569{
1570        struct btrfs_fs_info *fs_info = root->fs_info;
1571        struct extent_buffer *b;
1572        int root_lock;
1573        int level = 0;
1574
1575        /* We try very hard to do read locks on the root */
1576        root_lock = BTRFS_READ_LOCK;
1577
1578        if (p->search_commit_root) {
1579                /*
1580                 * The commit roots are read only so we always do read locks,
1581                 * and we always must hold the commit_root_sem when doing
1582                 * searches on them, the only exception is send where we don't
1583                 * want to block transaction commits for a long time, so
1584                 * we need to clone the commit root in order to avoid races
1585                 * with transaction commits that create a snapshot of one of
1586                 * the roots used by a send operation.
1587                 */
1588                if (p->need_commit_sem) {
1589                        down_read(&fs_info->commit_root_sem);
1590                        b = btrfs_clone_extent_buffer(root->commit_root);
1591                        up_read(&fs_info->commit_root_sem);
1592                        if (!b)
1593                                return ERR_PTR(-ENOMEM);
1594
1595                } else {
1596                        b = root->commit_root;
1597                        atomic_inc(&b->refs);
1598                }
1599                level = btrfs_header_level(b);
1600                /*
1601                 * Ensure that all callers have set skip_locking when
1602                 * p->search_commit_root = 1.
1603                 */
1604                ASSERT(p->skip_locking == 1);
1605
1606                goto out;
1607        }
1608
1609        if (p->skip_locking) {
1610                b = btrfs_root_node(root);
1611                level = btrfs_header_level(b);
1612                goto out;
1613        }
1614
1615        /*
1616         * If the level is set to maximum, we can skip trying to get the read
1617         * lock.
1618         */
1619        if (write_lock_level < BTRFS_MAX_LEVEL) {
1620                /*
1621                 * We don't know the level of the root node until we actually
1622                 * have it read locked
1623                 */
1624                b = btrfs_read_lock_root_node(root);
1625                level = btrfs_header_level(b);
1626                if (level > write_lock_level)
1627                        goto out;
1628
1629                /* Whoops, must trade for write lock */
1630                btrfs_tree_read_unlock(b);
1631                free_extent_buffer(b);
1632        }
1633
1634        b = btrfs_lock_root_node(root);
1635        root_lock = BTRFS_WRITE_LOCK;
1636
1637        /* The level might have changed, check again */
1638        level = btrfs_header_level(b);
1639
1640out:
1641        p->nodes[level] = b;
1642        if (!p->skip_locking)
1643                p->locks[level] = root_lock;
1644        /*
1645         * Callers are responsible for dropping b's references.
1646         */
1647        return b;
1648}
1649
1650
1651/*
1652 * btrfs_search_slot - look for a key in a tree and perform necessary
1653 * modifications to preserve tree invariants.
1654 *
1655 * @trans:      Handle of transaction, used when modifying the tree
1656 * @p:          Holds all btree nodes along the search path
1657 * @root:       The root node of the tree
1658 * @key:        The key we are looking for
1659 * @ins_len:    Indicates purpose of search:
1660 *              >0  for inserts it's size of item inserted (*)
1661 *              <0  for deletions
1662 *               0  for plain searches, not modifying the tree
1663 *
1664 *              (*) If size of item inserted doesn't include
1665 *              sizeof(struct btrfs_item), then p->search_for_extension must
1666 *              be set.
1667 * @cow:        boolean should CoW operations be performed. Must always be 1
1668 *              when modifying the tree.
1669 *
1670 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
1671 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
1672 *
1673 * If @key is found, 0 is returned and you can find the item in the leaf level
1674 * of the path (level 0)
1675 *
1676 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
1677 * points to the slot where it should be inserted
1678 *
1679 * If an error is encountered while searching the tree a negative error number
1680 * is returned
1681 */
1682int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1683                      const struct btrfs_key *key, struct btrfs_path *p,
1684                      int ins_len, int cow)
1685{
1686        struct extent_buffer *b;
1687        int slot;
1688        int ret;
1689        int err;
1690        int level;
1691        int lowest_unlock = 1;
1692        /* everything at write_lock_level or lower must be write locked */
1693        int write_lock_level = 0;
1694        u8 lowest_level = 0;
1695        int min_write_lock_level;
1696        int prev_cmp;
1697
1698        lowest_level = p->lowest_level;
1699        WARN_ON(lowest_level && ins_len > 0);
1700        WARN_ON(p->nodes[0] != NULL);
1701        BUG_ON(!cow && ins_len);
1702
1703        if (ins_len < 0) {
1704                lowest_unlock = 2;
1705
1706                /* when we are removing items, we might have to go up to level
1707                 * two as we update tree pointers  Make sure we keep write
1708                 * for those levels as well
1709                 */
1710                write_lock_level = 2;
1711        } else if (ins_len > 0) {
1712                /*
1713                 * for inserting items, make sure we have a write lock on
1714                 * level 1 so we can update keys
1715                 */
1716                write_lock_level = 1;
1717        }
1718
1719        if (!cow)
1720                write_lock_level = -1;
1721
1722        if (cow && (p->keep_locks || p->lowest_level))
1723                write_lock_level = BTRFS_MAX_LEVEL;
1724
1725        min_write_lock_level = write_lock_level;
1726
1727again:
1728        prev_cmp = -1;
1729        b = btrfs_search_slot_get_root(root, p, write_lock_level);
1730        if (IS_ERR(b)) {
1731                ret = PTR_ERR(b);
1732                goto done;
1733        }
1734
1735        while (b) {
1736                int dec = 0;
1737
1738                level = btrfs_header_level(b);
1739
1740                if (cow) {
1741                        bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
1742
1743                        /*
1744                         * if we don't really need to cow this block
1745                         * then we don't want to set the path blocking,
1746                         * so we test it here
1747                         */
1748                        if (!should_cow_block(trans, root, b))
1749                                goto cow_done;
1750
1751                        /*
1752                         * must have write locks on this node and the
1753                         * parent
1754                         */
1755                        if (level > write_lock_level ||
1756                            (level + 1 > write_lock_level &&
1757                            level + 1 < BTRFS_MAX_LEVEL &&
1758                            p->nodes[level + 1])) {
1759                                write_lock_level = level + 1;
1760                                btrfs_release_path(p);
1761                                goto again;
1762                        }
1763
1764                        if (last_level)
1765                                err = btrfs_cow_block(trans, root, b, NULL, 0,
1766                                                      &b,
1767                                                      BTRFS_NESTING_COW);
1768                        else
1769                                err = btrfs_cow_block(trans, root, b,
1770                                                      p->nodes[level + 1],
1771                                                      p->slots[level + 1], &b,
1772                                                      BTRFS_NESTING_COW);
1773                        if (err) {
1774                                ret = err;
1775                                goto done;
1776                        }
1777                }
1778cow_done:
1779                p->nodes[level] = b;
1780                /*
1781                 * Leave path with blocking locks to avoid massive
1782                 * lock context switch, this is made on purpose.
1783                 */
1784
1785                /*
1786                 * we have a lock on b and as long as we aren't changing
1787                 * the tree, there is no way to for the items in b to change.
1788                 * It is safe to drop the lock on our parent before we
1789                 * go through the expensive btree search on b.
1790                 *
1791                 * If we're inserting or deleting (ins_len != 0), then we might
1792                 * be changing slot zero, which may require changing the parent.
1793                 * So, we can't drop the lock until after we know which slot
1794                 * we're operating on.
1795                 */
1796                if (!ins_len && !p->keep_locks) {
1797                        int u = level + 1;
1798
1799                        if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
1800                                btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
1801                                p->locks[u] = 0;
1802                        }
1803                }
1804
1805                /*
1806                 * If btrfs_bin_search returns an exact match (prev_cmp == 0)
1807                 * we can safely assume the target key will always be in slot 0
1808                 * on lower levels due to the invariants BTRFS' btree provides,
1809                 * namely that a btrfs_key_ptr entry always points to the
1810                 * lowest key in the child node, thus we can skip searching
1811                 * lower levels
1812                 */
1813                if (prev_cmp == 0) {
1814                        slot = 0;
1815                        ret = 0;
1816                } else {
1817                        ret = btrfs_bin_search(b, key, &slot);
1818                        prev_cmp = ret;
1819                        if (ret < 0)
1820                                goto done;
1821                }
1822
1823                if (level == 0) {
1824                        p->slots[level] = slot;
1825                        /*
1826                         * Item key already exists. In this case, if we are
1827                         * allowed to insert the item (for example, in dir_item
1828                         * case, item key collision is allowed), it will be
1829                         * merged with the original item. Only the item size
1830                         * grows, no new btrfs item will be added. If
1831                         * search_for_extension is not set, ins_len already
1832                         * accounts the size btrfs_item, deduct it here so leaf
1833                         * space check will be correct.
1834                         */
1835                        if (ret == 0 && ins_len > 0 && !p->search_for_extension) {
1836                                ASSERT(ins_len >= sizeof(struct btrfs_item));
1837                                ins_len -= sizeof(struct btrfs_item);
1838                        }
1839                        if (ins_len > 0 &&
1840                            btrfs_leaf_free_space(b) < ins_len) {
1841                                if (write_lock_level < 1) {
1842                                        write_lock_level = 1;
1843                                        btrfs_release_path(p);
1844                                        goto again;
1845                                }
1846
1847                                err = split_leaf(trans, root, key,
1848                                                 p, ins_len, ret == 0);
1849
1850                                BUG_ON(err > 0);
1851                                if (err) {
1852                                        ret = err;
1853                                        goto done;
1854                                }
1855                        }
1856                        if (!p->search_for_split)
1857                                unlock_up(p, level, lowest_unlock,
1858                                          min_write_lock_level, NULL);
1859                        goto done;
1860                }
1861                if (ret && slot > 0) {
1862                        dec = 1;
1863                        slot--;
1864                }
1865                p->slots[level] = slot;
1866                err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
1867                                             &write_lock_level);
1868                if (err == -EAGAIN)
1869                        goto again;
1870                if (err) {
1871                        ret = err;
1872                        goto done;
1873                }
1874                b = p->nodes[level];
1875                slot = p->slots[level];
1876
1877                /*
1878                 * Slot 0 is special, if we change the key we have to update
1879                 * the parent pointer which means we must have a write lock on
1880                 * the parent
1881                 */
1882                if (slot == 0 && ins_len && write_lock_level < level + 1) {
1883                        write_lock_level = level + 1;
1884                        btrfs_release_path(p);
1885                        goto again;
1886                }
1887
1888                unlock_up(p, level, lowest_unlock, min_write_lock_level,
1889                          &write_lock_level);
1890
1891                if (level == lowest_level) {
1892                        if (dec)
1893                                p->slots[level]++;
1894                        goto done;
1895                }
1896
1897                err = read_block_for_search(root, p, &b, level, slot, key);
1898                if (err == -EAGAIN)
1899                        goto again;
1900                if (err) {
1901                        ret = err;
1902                        goto done;
1903                }
1904
1905                if (!p->skip_locking) {
1906                        level = btrfs_header_level(b);
1907                        if (level <= write_lock_level) {
1908                                btrfs_tree_lock(b);
1909                                p->locks[level] = BTRFS_WRITE_LOCK;
1910                        } else {
1911                                btrfs_tree_read_lock(b);
1912                                p->locks[level] = BTRFS_READ_LOCK;
1913                        }
1914                        p->nodes[level] = b;
1915                }
1916        }
1917        ret = 1;
1918done:
1919        if (ret < 0 && !p->skip_release_on_error)
1920                btrfs_release_path(p);
1921        return ret;
1922}
1923ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
1924
1925/*
1926 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
1927 * current state of the tree together with the operations recorded in the tree
1928 * modification log to search for the key in a previous version of this tree, as
1929 * denoted by the time_seq parameter.
1930 *
1931 * Naturally, there is no support for insert, delete or cow operations.
1932 *
1933 * The resulting path and return value will be set up as if we called
1934 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
1935 */
1936int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
1937                          struct btrfs_path *p, u64 time_seq)
1938{
1939        struct btrfs_fs_info *fs_info = root->fs_info;
1940        struct extent_buffer *b;
1941        int slot;
1942        int ret;
1943        int err;
1944        int level;
1945        int lowest_unlock = 1;
1946        u8 lowest_level = 0;
1947
1948        lowest_level = p->lowest_level;
1949        WARN_ON(p->nodes[0] != NULL);
1950
1951        if (p->search_commit_root) {
1952                BUG_ON(time_seq);
1953                return btrfs_search_slot(NULL, root, key, p, 0, 0);
1954        }
1955
1956again:
1957        b = btrfs_get_old_root(root, time_seq);
1958        if (!b) {
1959                ret = -EIO;
1960                goto done;
1961        }
1962        level = btrfs_header_level(b);
1963        p->locks[level] = BTRFS_READ_LOCK;
1964
1965        while (b) {
1966                int dec = 0;
1967
1968                level = btrfs_header_level(b);
1969                p->nodes[level] = b;
1970
1971                /*
1972                 * we have a lock on b and as long as we aren't changing
1973                 * the tree, there is no way to for the items in b to change.
1974                 * It is safe to drop the lock on our parent before we
1975                 * go through the expensive btree search on b.
1976                 */
1977                btrfs_unlock_up_safe(p, level + 1);
1978
1979                ret = btrfs_bin_search(b, key, &slot);
1980                if (ret < 0)
1981                        goto done;
1982
1983                if (level == 0) {
1984                        p->slots[level] = slot;
1985                        unlock_up(p, level, lowest_unlock, 0, NULL);
1986                        goto done;
1987                }
1988
1989                if (ret && slot > 0) {
1990                        dec = 1;
1991                        slot--;
1992                }
1993                p->slots[level] = slot;
1994                unlock_up(p, level, lowest_unlock, 0, NULL);
1995
1996                if (level == lowest_level) {
1997                        if (dec)
1998                                p->slots[level]++;
1999                        goto done;
2000                }
2001
2002                err = read_block_for_search(root, p, &b, level, slot, key);
2003                if (err == -EAGAIN)
2004                        goto again;
2005                if (err) {
2006                        ret = err;
2007                        goto done;
2008                }
2009
2010                level = btrfs_header_level(b);
2011                btrfs_tree_read_lock(b);
2012                b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
2013                if (!b) {
2014                        ret = -ENOMEM;
2015                        goto done;
2016                }
2017                p->locks[level] = BTRFS_READ_LOCK;
2018                p->nodes[level] = b;
2019        }
2020        ret = 1;
2021done:
2022        if (ret < 0)
2023                btrfs_release_path(p);
2024
2025        return ret;
2026}
2027
2028/*
2029 * helper to use instead of search slot if no exact match is needed but
2030 * instead the next or previous item should be returned.
2031 * When find_higher is true, the next higher item is returned, the next lower
2032 * otherwise.
2033 * When return_any and find_higher are both true, and no higher item is found,
2034 * return the next lower instead.
2035 * When return_any is true and find_higher is false, and no lower item is found,
2036 * return the next higher instead.
2037 * It returns 0 if any item is found, 1 if none is found (tree empty), and
2038 * < 0 on error
2039 */
2040int btrfs_search_slot_for_read(struct btrfs_root *root,
2041                               const struct btrfs_key *key,
2042                               struct btrfs_path *p, int find_higher,
2043                               int return_any)
2044{
2045        int ret;
2046        struct extent_buffer *leaf;
2047
2048again:
2049        ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2050        if (ret <= 0)
2051                return ret;
2052        /*
2053         * a return value of 1 means the path is at the position where the
2054         * item should be inserted. Normally this is the next bigger item,
2055         * but in case the previous item is the last in a leaf, path points
2056         * to the first free slot in the previous leaf, i.e. at an invalid
2057         * item.
2058         */
2059        leaf = p->nodes[0];
2060
2061        if (find_higher) {
2062                if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2063                        ret = btrfs_next_leaf(root, p);
2064                        if (ret <= 0)
2065                                return ret;
2066                        if (!return_any)
2067                                return 1;
2068                        /*
2069                         * no higher item found, return the next
2070                         * lower instead
2071                         */
2072                        return_any = 0;
2073                        find_higher = 0;
2074                        btrfs_release_path(p);
2075                        goto again;
2076                }
2077        } else {
2078                if (p->slots[0] == 0) {
2079                        ret = btrfs_prev_leaf(root, p);
2080                        if (ret < 0)
2081                                return ret;
2082                        if (!ret) {
2083                                leaf = p->nodes[0];
2084                                if (p->slots[0] == btrfs_header_nritems(leaf))
2085                                        p->slots[0]--;
2086                                return 0;
2087                        }
2088                        if (!return_any)
2089                                return 1;
2090                        /*
2091                         * no lower item found, return the next
2092                         * higher instead
2093                         */
2094                        return_any = 0;
2095                        find_higher = 1;
2096                        btrfs_release_path(p);
2097                        goto again;
2098                } else {
2099                        --p->slots[0];
2100                }
2101        }
2102        return 0;
2103}
2104
2105/*
2106 * adjust the pointers going up the tree, starting at level
2107 * making sure the right key of each node is points to 'key'.
2108 * This is used after shifting pointers to the left, so it stops
2109 * fixing up pointers when a given leaf/node is not in slot 0 of the
2110 * higher levels
2111 *
2112 */
2113static void fixup_low_keys(struct btrfs_path *path,
2114                           struct btrfs_disk_key *key, int level)
2115{
2116        int i;
2117        struct extent_buffer *t;
2118        int ret;
2119
2120        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2121                int tslot = path->slots[i];
2122
2123                if (!path->nodes[i])
2124                        break;
2125                t = path->nodes[i];
2126                ret = btrfs_tree_mod_log_insert_key(t, tslot,
2127                                BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
2128                BUG_ON(ret < 0);
2129                btrfs_set_node_key(t, key, tslot);
2130                btrfs_mark_buffer_dirty(path->nodes[i]);
2131                if (tslot != 0)
2132                        break;
2133        }
2134}
2135
2136/*
2137 * update item key.
2138 *
2139 * This function isn't completely safe. It's the caller's responsibility
2140 * that the new key won't break the order
2141 */
2142void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
2143                             struct btrfs_path *path,
2144                             const struct btrfs_key *new_key)
2145{
2146        struct btrfs_disk_key disk_key;
2147        struct extent_buffer *eb;
2148        int slot;
2149
2150        eb = path->nodes[0];
2151        slot = path->slots[0];
2152        if (slot > 0) {
2153                btrfs_item_key(eb, &disk_key, slot - 1);
2154                if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
2155                        btrfs_crit(fs_info,
2156                "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2157                                   slot, btrfs_disk_key_objectid(&disk_key),
2158                                   btrfs_disk_key_type(&disk_key),
2159                                   btrfs_disk_key_offset(&disk_key),
2160                                   new_key->objectid, new_key->type,
2161                                   new_key->offset);
2162                        btrfs_print_leaf(eb);
2163                        BUG();
2164                }
2165        }
2166        if (slot < btrfs_header_nritems(eb) - 1) {
2167                btrfs_item_key(eb, &disk_key, slot + 1);
2168                if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
2169                        btrfs_crit(fs_info,
2170                "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2171                                   slot, btrfs_disk_key_objectid(&disk_key),
2172                                   btrfs_disk_key_type(&disk_key),
2173                                   btrfs_disk_key_offset(&disk_key),
2174                                   new_key->objectid, new_key->type,
2175                                   new_key->offset);
2176                        btrfs_print_leaf(eb);
2177                        BUG();
2178                }
2179        }
2180
2181        btrfs_cpu_key_to_disk(&disk_key, new_key);
2182        btrfs_set_item_key(eb, &disk_key, slot);
2183        btrfs_mark_buffer_dirty(eb);
2184        if (slot == 0)
2185                fixup_low_keys(path, &disk_key, 1);
2186}
2187
2188/*
2189 * Check key order of two sibling extent buffers.
2190 *
2191 * Return true if something is wrong.
2192 * Return false if everything is fine.
2193 *
2194 * Tree-checker only works inside one tree block, thus the following
2195 * corruption can not be detected by tree-checker:
2196 *
2197 * Leaf @left                   | Leaf @right
2198 * --------------------------------------------------------------
2199 * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
2200 *
2201 * Key f6 in leaf @left itself is valid, but not valid when the next
2202 * key in leaf @right is 7.
2203 * This can only be checked at tree block merge time.
2204 * And since tree checker has ensured all key order in each tree block
2205 * is correct, we only need to bother the last key of @left and the first
2206 * key of @right.
2207 */
2208static bool check_sibling_keys(struct extent_buffer *left,
2209                               struct extent_buffer *right)
2210{
2211        struct btrfs_key left_last;
2212        struct btrfs_key right_first;
2213        int level = btrfs_header_level(left);
2214        int nr_left = btrfs_header_nritems(left);
2215        int nr_right = btrfs_header_nritems(right);
2216
2217        /* No key to check in one of the tree blocks */
2218        if (!nr_left || !nr_right)
2219                return false;
2220
2221        if (level) {
2222                btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2223                btrfs_node_key_to_cpu(right, &right_first, 0);
2224        } else {
2225                btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2226                btrfs_item_key_to_cpu(right, &right_first, 0);
2227        }
2228
2229        if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
2230                btrfs_crit(left->fs_info,
2231"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2232                           left_last.objectid, left_last.type,
2233                           left_last.offset, right_first.objectid,
2234                           right_first.type, right_first.offset);
2235                return true;
2236        }
2237        return false;
2238}
2239
2240/*
2241 * try to push data from one node into the next node left in the
2242 * tree.
2243 *
2244 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2245 * error, and > 0 if there was no room in the left hand block.
2246 */
2247static int push_node_left(struct btrfs_trans_handle *trans,
2248                          struct extent_buffer *dst,
2249                          struct extent_buffer *src, int empty)
2250{
2251        struct btrfs_fs_info *fs_info = trans->fs_info;
2252        int push_items = 0;
2253        int src_nritems;
2254        int dst_nritems;
2255        int ret = 0;
2256
2257        src_nritems = btrfs_header_nritems(src);
2258        dst_nritems = btrfs_header_nritems(dst);
2259        push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2260        WARN_ON(btrfs_header_generation(src) != trans->transid);
2261        WARN_ON(btrfs_header_generation(dst) != trans->transid);
2262
2263        if (!empty && src_nritems <= 8)
2264                return 1;
2265
2266        if (push_items <= 0)
2267                return 1;
2268
2269        if (empty) {
2270                push_items = min(src_nritems, push_items);
2271                if (push_items < src_nritems) {
2272                        /* leave at least 8 pointers in the node if
2273                         * we aren't going to empty it
2274                         */
2275                        if (src_nritems - push_items < 8) {
2276                                if (push_items <= 8)
2277                                        return 1;
2278                                push_items -= 8;
2279                        }
2280                }
2281        } else
2282                push_items = min(src_nritems - 8, push_items);
2283
2284        /* dst is the left eb, src is the middle eb */
2285        if (check_sibling_keys(dst, src)) {
2286                ret = -EUCLEAN;
2287                btrfs_abort_transaction(trans, ret);
2288                return ret;
2289        }
2290        ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
2291        if (ret) {
2292                btrfs_abort_transaction(trans, ret);
2293                return ret;
2294        }
2295        copy_extent_buffer(dst, src,
2296                           btrfs_node_key_ptr_offset(dst_nritems),
2297                           btrfs_node_key_ptr_offset(0),
2298                           push_items * sizeof(struct btrfs_key_ptr));
2299
2300        if (push_items < src_nritems) {
2301                /*
2302                 * Don't call btrfs_tree_mod_log_insert_move() here, key removal
2303                 * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
2304                 */
2305                memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
2306                                      btrfs_node_key_ptr_offset(push_items),
2307                                      (src_nritems - push_items) *
2308                                      sizeof(struct btrfs_key_ptr));
2309        }
2310        btrfs_set_header_nritems(src, src_nritems - push_items);
2311        btrfs_set_header_nritems(dst, dst_nritems + push_items);
2312        btrfs_mark_buffer_dirty(src);
2313        btrfs_mark_buffer_dirty(dst);
2314
2315        return ret;
2316}
2317
2318/*
2319 * try to push data from one node into the next node right in the
2320 * tree.
2321 *
2322 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2323 * error, and > 0 if there was no room in the right hand block.
2324 *
2325 * this will  only push up to 1/2 the contents of the left node over
2326 */
2327static int balance_node_right(struct btrfs_trans_handle *trans,
2328                              struct extent_buffer *dst,
2329                              struct extent_buffer *src)
2330{
2331        struct btrfs_fs_info *fs_info = trans->fs_info;
2332        int push_items = 0;
2333        int max_push;
2334        int src_nritems;
2335        int dst_nritems;
2336        int ret = 0;
2337
2338        WARN_ON(btrfs_header_generation(src) != trans->transid);
2339        WARN_ON(btrfs_header_generation(dst) != trans->transid);
2340
2341        src_nritems = btrfs_header_nritems(src);
2342        dst_nritems = btrfs_header_nritems(dst);
2343        push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2344        if (push_items <= 0)
2345                return 1;
2346
2347        if (src_nritems < 4)
2348                return 1;
2349
2350        max_push = src_nritems / 2 + 1;
2351        /* don't try to empty the node */
2352        if (max_push >= src_nritems)
2353                return 1;
2354
2355        if (max_push < push_items)
2356                push_items = max_push;
2357
2358        /* dst is the right eb, src is the middle eb */
2359        if (check_sibling_keys(src, dst)) {
2360                ret = -EUCLEAN;
2361                btrfs_abort_transaction(trans, ret);
2362                return ret;
2363        }
2364        ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
2365        BUG_ON(ret < 0);
2366        memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
2367                                      btrfs_node_key_ptr_offset(0),
2368                                      (dst_nritems) *
2369                                      sizeof(struct btrfs_key_ptr));
2370
2371        ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
2372                                         push_items);
2373        if (ret) {
2374                btrfs_abort_transaction(trans, ret);
2375                return ret;
2376        }
2377        copy_extent_buffer(dst, src,
2378                           btrfs_node_key_ptr_offset(0),
2379                           btrfs_node_key_ptr_offset(src_nritems - push_items),
2380                           push_items * sizeof(struct btrfs_key_ptr));
2381
2382        btrfs_set_header_nritems(src, src_nritems - push_items);
2383        btrfs_set_header_nritems(dst, dst_nritems + push_items);
2384
2385        btrfs_mark_buffer_dirty(src);
2386        btrfs_mark_buffer_dirty(dst);
2387
2388        return ret;
2389}
2390
2391/*
2392 * helper function to insert a new root level in the tree.
2393 * A new node is allocated, and a single item is inserted to
2394 * point to the existing root
2395 *
2396 * returns zero on success or < 0 on failure.
2397 */
2398static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2399                           struct btrfs_root *root,
2400                           struct btrfs_path *path, int level)
2401{
2402        struct btrfs_fs_info *fs_info = root->fs_info;
2403        u64 lower_gen;
2404        struct extent_buffer *lower;
2405        struct extent_buffer *c;
2406        struct extent_buffer *old;
2407        struct btrfs_disk_key lower_key;
2408        int ret;
2409
2410        BUG_ON(path->nodes[level]);
2411        BUG_ON(path->nodes[level-1] != root->node);
2412
2413        lower = path->nodes[level-1];
2414        if (level == 1)
2415                btrfs_item_key(lower, &lower_key, 0);
2416        else
2417                btrfs_node_key(lower, &lower_key, 0);
2418
2419        c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2420                                   &lower_key, level, root->node->start, 0,
2421                                   BTRFS_NESTING_NEW_ROOT);
2422        if (IS_ERR(c))
2423                return PTR_ERR(c);
2424
2425        root_add_used(root, fs_info->nodesize);
2426
2427        btrfs_set_header_nritems(c, 1);
2428        btrfs_set_node_key(c, &lower_key, 0);
2429        btrfs_set_node_blockptr(c, 0, lower->start);
2430        lower_gen = btrfs_header_generation(lower);
2431        WARN_ON(lower_gen != trans->transid);
2432
2433        btrfs_set_node_ptr_generation(c, 0, lower_gen);
2434
2435        btrfs_mark_buffer_dirty(c);
2436
2437        old = root->node;
2438        ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
2439        BUG_ON(ret < 0);
2440        rcu_assign_pointer(root->node, c);
2441
2442        /* the super has an extra ref to root->node */
2443        free_extent_buffer(old);
2444
2445        add_root_to_dirty_list(root);
2446        atomic_inc(&c->refs);
2447        path->nodes[level] = c;
2448        path->locks[level] = BTRFS_WRITE_LOCK;
2449        path->slots[level] = 0;
2450        return 0;
2451}
2452
2453/*
2454 * worker function to insert a single pointer in a node.
2455 * the node should have enough room for the pointer already
2456 *
2457 * slot and level indicate where you want the key to go, and
2458 * blocknr is the block the key points to.
2459 */
2460static void insert_ptr(struct btrfs_trans_handle *trans,
2461                       struct btrfs_path *path,
2462                       struct btrfs_disk_key *key, u64 bytenr,
2463                       int slot, int level)
2464{
2465        struct extent_buffer *lower;
2466        int nritems;
2467        int ret;
2468
2469        BUG_ON(!path->nodes[level]);
2470        btrfs_assert_tree_locked(path->nodes[level]);
2471        lower = path->nodes[level];
2472        nritems = btrfs_header_nritems(lower);
2473        BUG_ON(slot > nritems);
2474        BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
2475        if (slot != nritems) {
2476                if (level) {
2477                        ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
2478                                        slot, nritems - slot);
2479                        BUG_ON(ret < 0);
2480                }
2481                memmove_extent_buffer(lower,
2482                              btrfs_node_key_ptr_offset(slot + 1),
2483                              btrfs_node_key_ptr_offset(slot),
2484                              (nritems - slot) * sizeof(struct btrfs_key_ptr));
2485        }
2486        if (level) {
2487                ret = btrfs_tree_mod_log_insert_key(lower, slot,
2488                                            BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
2489                BUG_ON(ret < 0);
2490        }
2491        btrfs_set_node_key(lower, key, slot);
2492        btrfs_set_node_blockptr(lower, slot, bytenr);
2493        WARN_ON(trans->transid == 0);
2494        btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2495        btrfs_set_header_nritems(lower, nritems + 1);
2496        btrfs_mark_buffer_dirty(lower);
2497}
2498
2499/*
2500 * split the node at the specified level in path in two.
2501 * The path is corrected to point to the appropriate node after the split
2502 *
2503 * Before splitting this tries to make some room in the node by pushing
2504 * left and right, if either one works, it returns right away.
2505 *
2506 * returns 0 on success and < 0 on failure
2507 */
2508static noinline int split_node(struct btrfs_trans_handle *trans,
2509                               struct btrfs_root *root,
2510                               struct btrfs_path *path, int level)
2511{
2512        struct btrfs_fs_info *fs_info = root->fs_info;
2513        struct extent_buffer *c;
2514        struct extent_buffer *split;
2515        struct btrfs_disk_key disk_key;
2516        int mid;
2517        int ret;
2518        u32 c_nritems;
2519
2520        c = path->nodes[level];
2521        WARN_ON(btrfs_header_generation(c) != trans->transid);
2522        if (c == root->node) {
2523                /*
2524                 * trying to split the root, lets make a new one
2525                 *
2526                 * tree mod log: We don't log_removal old root in
2527                 * insert_new_root, because that root buffer will be kept as a
2528                 * normal node. We are going to log removal of half of the
2529                 * elements below with btrfs_tree_mod_log_eb_copy(). We're
2530                 * holding a tree lock on the buffer, which is why we cannot
2531                 * race with other tree_mod_log users.
2532                 */
2533                ret = insert_new_root(trans, root, path, level + 1);
2534                if (ret)
2535                        return ret;
2536        } else {
2537                ret = push_nodes_for_insert(trans, root, path, level);
2538                c = path->nodes[level];
2539                if (!ret && btrfs_header_nritems(c) <
2540                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
2541                        return 0;
2542                if (ret < 0)
2543                        return ret;
2544        }
2545
2546        c_nritems = btrfs_header_nritems(c);
2547        mid = (c_nritems + 1) / 2;
2548        btrfs_node_key(c, &disk_key, mid);
2549
2550        split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2551                                       &disk_key, level, c->start, 0,
2552                                       BTRFS_NESTING_SPLIT);
2553        if (IS_ERR(split))
2554                return PTR_ERR(split);
2555
2556        root_add_used(root, fs_info->nodesize);
2557        ASSERT(btrfs_header_level(c) == level);
2558
2559        ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
2560        if (ret) {
2561                btrfs_abort_transaction(trans, ret);
2562                return ret;
2563        }
2564        copy_extent_buffer(split, c,
2565                           btrfs_node_key_ptr_offset(0),
2566                           btrfs_node_key_ptr_offset(mid),
2567                           (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
2568        btrfs_set_header_nritems(split, c_nritems - mid);
2569        btrfs_set_header_nritems(c, mid);
2570
2571        btrfs_mark_buffer_dirty(c);
2572        btrfs_mark_buffer_dirty(split);
2573
2574        insert_ptr(trans, path, &disk_key, split->start,
2575                   path->slots[level + 1] + 1, level + 1);
2576
2577        if (path->slots[level] >= mid) {
2578                path->slots[level] -= mid;
2579                btrfs_tree_unlock(c);
2580                free_extent_buffer(c);
2581                path->nodes[level] = split;
2582                path->slots[level + 1] += 1;
2583        } else {
2584                btrfs_tree_unlock(split);
2585                free_extent_buffer(split);
2586        }
2587        return 0;
2588}
2589
2590/*
2591 * how many bytes are required to store the items in a leaf.  start
2592 * and nr indicate which items in the leaf to check.  This totals up the
2593 * space used both by the item structs and the item data
2594 */
2595static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2596{
2597        struct btrfs_item *start_item;
2598        struct btrfs_item *end_item;
2599        int data_len;
2600        int nritems = btrfs_header_nritems(l);
2601        int end = min(nritems, start + nr) - 1;
2602
2603        if (!nr)
2604                return 0;
2605        start_item = btrfs_item_nr(start);
2606        end_item = btrfs_item_nr(end);
2607        data_len = btrfs_item_offset(l, start_item) +
2608                   btrfs_item_size(l, start_item);
2609        data_len = data_len - btrfs_item_offset(l, end_item);
2610        data_len += sizeof(struct btrfs_item) * nr;
2611        WARN_ON(data_len < 0);
2612        return data_len;
2613}
2614
2615/*
2616 * The space between the end of the leaf items and
2617 * the start of the leaf data.  IOW, how much room
2618 * the leaf has left for both items and data
2619 */
2620noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
2621{
2622        struct btrfs_fs_info *fs_info = leaf->fs_info;
2623        int nritems = btrfs_header_nritems(leaf);
2624        int ret;
2625
2626        ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
2627        if (ret < 0) {
2628                btrfs_crit(fs_info,
2629                           "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
2630                           ret,
2631                           (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
2632                           leaf_space_used(leaf, 0, nritems), nritems);
2633        }
2634        return ret;
2635}
2636
2637/*
2638 * min slot controls the lowest index we're willing to push to the
2639 * right.  We'll push up to and including min_slot, but no lower
2640 */
2641static noinline int __push_leaf_right(struct btrfs_path *path,
2642                                      int data_size, int empty,
2643                                      struct extent_buffer *right,
2644                                      int free_space, u32 left_nritems,
2645                                      u32 min_slot)
2646{
2647        struct btrfs_fs_info *fs_info = right->fs_info;
2648        struct extent_buffer *left = path->nodes[0];
2649        struct extent_buffer *upper = path->nodes[1];
2650        struct btrfs_map_token token;
2651        struct btrfs_disk_key disk_key;
2652        int slot;
2653        u32 i;
2654        int push_space = 0;
2655        int push_items = 0;
2656        struct btrfs_item *item;
2657        u32 nr;
2658        u32 right_nritems;
2659        u32 data_end;
2660        u32 this_item_size;
2661
2662        if (empty)
2663                nr = 0;
2664        else
2665                nr = max_t(u32, 1, min_slot);
2666
2667        if (path->slots[0] >= left_nritems)
2668                push_space += data_size;
2669
2670        slot = path->slots[1];
2671        i = left_nritems - 1;
2672        while (i >= nr) {
2673                item = btrfs_item_nr(i);
2674
2675                if (!empty && push_items > 0) {
2676                        if (path->slots[0] > i)
2677                                break;
2678                        if (path->slots[0] == i) {
2679                                int space = btrfs_leaf_free_space(left);
2680
2681                                if (space + push_space * 2 > free_space)
2682                                        break;
2683                        }
2684                }
2685
2686                if (path->slots[0] == i)
2687                        push_space += data_size;
2688
2689                this_item_size = btrfs_item_size(left, item);
2690                if (this_item_size + sizeof(*item) + push_space > free_space)
2691                        break;
2692
2693                push_items++;
2694                push_space += this_item_size + sizeof(*item);
2695                if (i == 0)
2696                        break;
2697                i--;
2698        }
2699
2700        if (push_items == 0)
2701                goto out_unlock;
2702
2703        WARN_ON(!empty && push_items == left_nritems);
2704
2705        /* push left to right */
2706        right_nritems = btrfs_header_nritems(right);
2707
2708        push_space = btrfs_item_end_nr(left, left_nritems - push_items);
2709        push_space -= leaf_data_end(left);
2710
2711        /* make room in the right data area */
2712        data_end = leaf_data_end(right);
2713        memmove_extent_buffer(right,
2714                              BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
2715                              BTRFS_LEAF_DATA_OFFSET + data_end,
2716                              BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
2717
2718        /* copy from the left data area */
2719        copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
2720                     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
2721                     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
2722                     push_space);
2723
2724        memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
2725                              btrfs_item_nr_offset(0),
2726                              right_nritems * sizeof(struct btrfs_item));
2727
2728        /* copy the items from left to right */
2729        copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
2730                   btrfs_item_nr_offset(left_nritems - push_items),
2731                   push_items * sizeof(struct btrfs_item));
2732
2733        /* update the item pointers */
2734        btrfs_init_map_token(&token, right);
2735        right_nritems += push_items;
2736        btrfs_set_header_nritems(right, right_nritems);
2737        push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
2738        for (i = 0; i < right_nritems; i++) {
2739                item = btrfs_item_nr(i);
2740                push_space -= btrfs_token_item_size(&token, item);
2741                btrfs_set_token_item_offset(&token, item, push_space);
2742        }
2743
2744        left_nritems -= push_items;
2745        btrfs_set_header_nritems(left, left_nritems);
2746
2747        if (left_nritems)
2748                btrfs_mark_buffer_dirty(left);
2749        else
2750                btrfs_clean_tree_block(left);
2751
2752        btrfs_mark_buffer_dirty(right);
2753
2754        btrfs_item_key(right, &disk_key, 0);
2755        btrfs_set_node_key(upper, &disk_key, slot + 1);
2756        btrfs_mark_buffer_dirty(upper);
2757
2758        /* then fixup the leaf pointer in the path */
2759        if (path->slots[0] >= left_nritems) {
2760                path->slots[0] -= left_nritems;
2761                if (btrfs_header_nritems(path->nodes[0]) == 0)
2762                        btrfs_clean_tree_block(path->nodes[0]);
2763                btrfs_tree_unlock(path->nodes[0]);
2764                free_extent_buffer(path->nodes[0]);
2765                path->nodes[0] = right;
2766                path->slots[1] += 1;
2767        } else {
2768                btrfs_tree_unlock(right);
2769                free_extent_buffer(right);
2770        }
2771        return 0;
2772
2773out_unlock:
2774        btrfs_tree_unlock(right);
2775        free_extent_buffer(right);
2776        return 1;
2777}
2778
2779/*
2780 * push some data in the path leaf to the right, trying to free up at
2781 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2782 *
2783 * returns 1 if the push failed because the other node didn't have enough
2784 * room, 0 if everything worked out and < 0 if there were major errors.
2785 *
2786 * this will push starting from min_slot to the end of the leaf.  It won't
2787 * push any slot lower than min_slot
2788 */
2789static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2790                           *root, struct btrfs_path *path,
2791                           int min_data_size, int data_size,
2792                           int empty, u32 min_slot)
2793{
2794        struct extent_buffer *left = path->nodes[0];
2795        struct extent_buffer *right;
2796        struct extent_buffer *upper;
2797        int slot;
2798        int free_space;
2799        u32 left_nritems;
2800        int ret;
2801
2802        if (!path->nodes[1])
2803                return 1;
2804
2805        slot = path->slots[1];
2806        upper = path->nodes[1];
2807        if (slot >= btrfs_header_nritems(upper) - 1)
2808                return 1;
2809
2810        btrfs_assert_tree_locked(path->nodes[1]);
2811
2812        right = btrfs_read_node_slot(upper, slot + 1);
2813        /*
2814         * slot + 1 is not valid or we fail to read the right node,
2815         * no big deal, just return.
2816         */
2817        if (IS_ERR(right))
2818                return 1;
2819
2820        __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
2821
2822        free_space = btrfs_leaf_free_space(right);
2823        if (free_space < data_size)
2824                goto out_unlock;
2825
2826        /* cow and double check */
2827        ret = btrfs_cow_block(trans, root, right, upper,
2828                              slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
2829        if (ret)
2830                goto out_unlock;
2831
2832        free_space = btrfs_leaf_free_space(right);
2833        if (free_space < data_size)
2834                goto out_unlock;
2835
2836        left_nritems = btrfs_header_nritems(left);
2837        if (left_nritems == 0)
2838                goto out_unlock;
2839
2840        if (check_sibling_keys(left, right)) {
2841                ret = -EUCLEAN;
2842                btrfs_tree_unlock(right);
2843                free_extent_buffer(right);
2844                return ret;
2845        }
2846        if (path->slots[0] == left_nritems && !empty) {
2847                /* Key greater than all keys in the leaf, right neighbor has
2848                 * enough room for it and we're not emptying our leaf to delete
2849                 * it, therefore use right neighbor to insert the new item and
2850                 * no need to touch/dirty our left leaf. */
2851                btrfs_tree_unlock(left);
2852                free_extent_buffer(left);
2853                path->nodes[0] = right;
2854                path->slots[0] = 0;
2855                path->slots[1]++;
2856                return 0;
2857        }
2858
2859        return __push_leaf_right(path, min_data_size, empty,
2860                                right, free_space, left_nritems, min_slot);
2861out_unlock:
2862        btrfs_tree_unlock(right);
2863        free_extent_buffer(right);
2864        return 1;
2865}
2866
2867/*
2868 * push some data in the path leaf to the left, trying to free up at
2869 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
2870 *
2871 * max_slot can put a limit on how far into the leaf we'll push items.  The
2872 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
2873 * items
2874 */
2875static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
2876                                     int empty, struct extent_buffer *left,
2877                                     int free_space, u32 right_nritems,
2878                                     u32 max_slot)
2879{
2880        struct btrfs_fs_info *fs_info = left->fs_info;
2881        struct btrfs_disk_key disk_key;
2882        struct extent_buffer *right = path->nodes[0];
2883        int i;
2884        int push_space = 0;
2885        int push_items = 0;
2886        struct btrfs_item *item;
2887        u32 old_left_nritems;
2888        u32 nr;
2889        int ret = 0;
2890        u32 this_item_size;
2891        u32 old_left_item_size;
2892        struct btrfs_map_token token;
2893
2894        if (empty)
2895                nr = min(right_nritems, max_slot);
2896        else
2897                nr = min(right_nritems - 1, max_slot);
2898
2899        for (i = 0; i < nr; i++) {
2900                item = btrfs_item_nr(i);
2901
2902                if (!empty && push_items > 0) {
2903                        if (path->slots[0] < i)
2904                                break;
2905                        if (path->slots[0] == i) {
2906                                int space = btrfs_leaf_free_space(right);
2907
2908                                if (space + push_space * 2 > free_space)
2909                                        break;
2910                        }
2911                }
2912
2913                if (path->slots[0] == i)
2914                        push_space += data_size;
2915
2916                this_item_size = btrfs_item_size(right, item);
2917                if (this_item_size + sizeof(*item) + push_space > free_space)
2918                        break;
2919
2920                push_items++;
2921                push_space += this_item_size + sizeof(*item);
2922        }
2923
2924        if (push_items == 0) {
2925                ret = 1;
2926                goto out;
2927        }
2928        WARN_ON(!empty && push_items == btrfs_header_nritems(right));
2929
2930        /* push data from right to left */
2931        copy_extent_buffer(left, right,
2932                           btrfs_item_nr_offset(btrfs_header_nritems(left)),
2933                           btrfs_item_nr_offset(0),
2934                           push_items * sizeof(struct btrfs_item));
2935
2936        push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
2937                     btrfs_item_offset_nr(right, push_items - 1);
2938
2939        copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
2940                     leaf_data_end(left) - push_space,
2941                     BTRFS_LEAF_DATA_OFFSET +
2942                     btrfs_item_offset_nr(right, push_items - 1),
2943                     push_space);
2944        old_left_nritems = btrfs_header_nritems(left);
2945        BUG_ON(old_left_nritems <= 0);
2946
2947        btrfs_init_map_token(&token, left);
2948        old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
2949        for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2950                u32 ioff;
2951
2952                item = btrfs_item_nr(i);
2953
2954                ioff = btrfs_token_item_offset(&token, item);
2955                btrfs_set_token_item_offset(&token, item,
2956                      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
2957        }
2958        btrfs_set_header_nritems(left, old_left_nritems + push_items);
2959
2960        /* fixup right node */
2961        if (push_items > right_nritems)
2962                WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
2963                       right_nritems);
2964
2965        if (push_items < right_nritems) {
2966                push_space = btrfs_item_offset_nr(right, push_items - 1) -
2967                                                  leaf_data_end(right);
2968                memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
2969                                      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
2970                                      BTRFS_LEAF_DATA_OFFSET +
2971                                      leaf_data_end(right), push_space);
2972
2973                memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2974                              btrfs_item_nr_offset(push_items),
2975                             (btrfs_header_nritems(right) - push_items) *
2976                             sizeof(struct btrfs_item));
2977        }
2978
2979        btrfs_init_map_token(&token, right);
2980        right_nritems -= push_items;
2981        btrfs_set_header_nritems(right, right_nritems);
2982        push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
2983        for (i = 0; i < right_nritems; i++) {
2984                item = btrfs_item_nr(i);
2985
2986                push_space = push_space - btrfs_token_item_size(&token, item);
2987                btrfs_set_token_item_offset(&token, item, push_space);
2988        }
2989
2990        btrfs_mark_buffer_dirty(left);
2991        if (right_nritems)
2992                btrfs_mark_buffer_dirty(right);
2993        else
2994                btrfs_clean_tree_block(right);
2995
2996        btrfs_item_key(right, &disk_key, 0);
2997        fixup_low_keys(path, &disk_key, 1);
2998
2999        /* then fixup the leaf pointer in the path */
3000        if (path->slots[0] < push_items) {
3001                path->slots[0] += old_left_nritems;
3002                btrfs_tree_unlock(path->nodes[0]);
3003                free_extent_buffer(path->nodes[0]);
3004                path->nodes[0] = left;
3005                path->slots[1] -= 1;
3006        } else {
3007                btrfs_tree_unlock(left);
3008                free_extent_buffer(left);
3009                path->slots[0] -= push_items;
3010        }
3011        BUG_ON(path->slots[0] < 0);
3012        return ret;
3013out:
3014        btrfs_tree_unlock(left);
3015        free_extent_buffer(left);
3016        return ret;
3017}
3018
3019/*
3020 * push some data in the path leaf to the left, trying to free up at
3021 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3022 *
3023 * max_slot can put a limit on how far into the leaf we'll push items.  The
3024 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3025 * items
3026 */
3027static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3028                          *root, struct btrfs_path *path, int min_data_size,
3029                          int data_size, int empty, u32 max_slot)
3030{
3031        struct extent_buffer *right = path->nodes[0];
3032        struct extent_buffer *left;
3033        int slot;
3034        int free_space;
3035        u32 right_nritems;
3036        int ret = 0;
3037
3038        slot = path->slots[1];
3039        if (slot == 0)
3040                return 1;
3041        if (!path->nodes[1])
3042                return 1;
3043
3044        right_nritems = btrfs_header_nritems(right);
3045        if (right_nritems == 0)
3046                return 1;
3047
3048        btrfs_assert_tree_locked(path->nodes[1]);
3049
3050        left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3051        /*
3052         * slot - 1 is not valid or we fail to read the left node,
3053         * no big deal, just return.
3054         */
3055        if (IS_ERR(left))
3056                return 1;
3057
3058        __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
3059
3060        free_space = btrfs_leaf_free_space(left);
3061        if (free_space < data_size) {
3062                ret = 1;
3063                goto out;
3064        }
3065
3066        /* cow and double check */
3067        ret = btrfs_cow_block(trans, root, left,
3068                              path->nodes[1], slot - 1, &left,
3069                              BTRFS_NESTING_LEFT_COW);
3070        if (ret) {
3071                /* we hit -ENOSPC, but it isn't fatal here */
3072                if (ret == -ENOSPC)
3073                        ret = 1;
3074                goto out;
3075        }
3076
3077        free_space = btrfs_leaf_free_space(left);
3078        if (free_space < data_size) {
3079                ret = 1;
3080                goto out;
3081        }
3082
3083        if (check_sibling_keys(left, right)) {
3084                ret = -EUCLEAN;
3085                goto out;
3086        }
3087        return __push_leaf_left(path, min_data_size,
3088                               empty, left, free_space, right_nritems,
3089                               max_slot);
3090out:
3091        btrfs_tree_unlock(left);
3092        free_extent_buffer(left);
3093        return ret;
3094}
3095
3096/*
3097 * split the path's leaf in two, making sure there is at least data_size
3098 * available for the resulting leaf level of the path.
3099 */
3100static noinline void copy_for_split(struct btrfs_trans_handle *trans,
3101                                    struct btrfs_path *path,
3102                                    struct extent_buffer *l,
3103                                    struct extent_buffer *right,
3104                                    int slot, int mid, int nritems)
3105{
3106        struct btrfs_fs_info *fs_info = trans->fs_info;
3107        int data_copy_size;
3108        int rt_data_off;
3109        int i;
3110        struct btrfs_disk_key disk_key;
3111        struct btrfs_map_token token;
3112
3113        nritems = nritems - mid;
3114        btrfs_set_header_nritems(right, nritems);
3115        data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
3116
3117        copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
3118                           btrfs_item_nr_offset(mid),
3119                           nritems * sizeof(struct btrfs_item));
3120
3121        copy_extent_buffer(right, l,
3122                     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
3123                     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
3124                     leaf_data_end(l), data_copy_size);
3125
3126        rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
3127
3128        btrfs_init_map_token(&token, right);
3129        for (i = 0; i < nritems; i++) {
3130                struct btrfs_item *item = btrfs_item_nr(i);
3131                u32 ioff;
3132
3133                ioff = btrfs_token_item_offset(&token, item);
3134                btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
3135        }
3136
3137        btrfs_set_header_nritems(l, mid);
3138        btrfs_item_key(right, &disk_key, 0);
3139        insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
3140
3141        btrfs_mark_buffer_dirty(right);
3142        btrfs_mark_buffer_dirty(l);
3143        BUG_ON(path->slots[0] != slot);
3144
3145        if (mid <= slot) {
3146                btrfs_tree_unlock(path->nodes[0]);
3147                free_extent_buffer(path->nodes[0]);
3148                path->nodes[0] = right;
3149                path->slots[0] -= mid;
3150                path->slots[1] += 1;
3151        } else {
3152                btrfs_tree_unlock(right);
3153                free_extent_buffer(right);
3154        }
3155
3156        BUG_ON(path->slots[0] < 0);
3157}
3158
3159/*
3160 * double splits happen when we need to insert a big item in the middle
3161 * of a leaf.  A double split can leave us with 3 mostly empty leaves:
3162 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3163 *          A                 B                 C
3164 *
3165 * We avoid this by trying to push the items on either side of our target
3166 * into the adjacent leaves.  If all goes well we can avoid the double split
3167 * completely.
3168 */
3169static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3170                                          struct btrfs_root *root,
3171                                          struct btrfs_path *path,
3172                                          int data_size)
3173{
3174        int ret;
3175        int progress = 0;
3176        int slot;
3177        u32 nritems;
3178        int space_needed = data_size;
3179
3180        slot = path->slots[0];
3181        if (slot < btrfs_header_nritems(path->nodes[0]))
3182                space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3183
3184        /*
3185         * try to push all the items after our slot into the
3186         * right leaf
3187         */
3188        ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
3189        if (ret < 0)
3190                return ret;
3191
3192        if (ret == 0)
3193                progress++;
3194
3195        nritems = btrfs_header_nritems(path->nodes[0]);
3196        /*
3197         * our goal is to get our slot at the start or end of a leaf.  If
3198         * we've done so we're done
3199         */
3200        if (path->slots[0] == 0 || path->slots[0] == nritems)
3201                return 0;
3202
3203        if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3204                return 0;
3205
3206        /* try to push all the items before our slot into the next leaf */
3207        slot = path->slots[0];
3208        space_needed = data_size;
3209        if (slot > 0)
3210                space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3211        ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
3212        if (ret < 0)
3213                return ret;
3214
3215        if (ret == 0)
3216                progress++;
3217
3218        if (progress)
3219                return 0;
3220        return 1;
3221}
3222
3223/*
3224 * split the path's leaf in two, making sure there is at least data_size
3225 * available for the resulting leaf level of the path.
3226 *
3227 * returns 0 if all went well and < 0 on failure.
3228 */
3229static noinline int split_leaf(struct btrfs_trans_handle *trans,
3230                               struct btrfs_root *root,
3231                               const struct btrfs_key *ins_key,
3232                               struct btrfs_path *path, int data_size,
3233                               int extend)
3234{
3235        struct btrfs_disk_key disk_key;
3236        struct extent_buffer *l;
3237        u32 nritems;
3238        int mid;
3239        int slot;
3240        struct extent_buffer *right;
3241        struct btrfs_fs_info *fs_info = root->fs_info;
3242        int ret = 0;
3243        int wret;
3244        int split;
3245        int num_doubles = 0;
3246        int tried_avoid_double = 0;
3247
3248        l = path->nodes[0];
3249        slot = path->slots[0];
3250        if (extend && data_size + btrfs_item_size_nr(l, slot) +
3251            sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
3252                return -EOVERFLOW;
3253
3254        /* first try to make some room by pushing left and right */
3255        if (data_size && path->nodes[1]) {
3256                int space_needed = data_size;
3257
3258                if (slot < btrfs_header_nritems(l))
3259                        space_needed -= btrfs_leaf_free_space(l);
3260
3261                wret = push_leaf_right(trans, root, path, space_needed,
3262                                       space_needed, 0, 0);
3263                if (wret < 0)
3264                        return wret;
3265                if (wret) {
3266                        space_needed = data_size;
3267                        if (slot > 0)
3268                                space_needed -= btrfs_leaf_free_space(l);
3269                        wret = push_leaf_left(trans, root, path, space_needed,
3270                                              space_needed, 0, (u32)-1);
3271                        if (wret < 0)
3272                                return wret;
3273                }
3274                l = path->nodes[0];
3275
3276                /* did the pushes work? */
3277                if (btrfs_leaf_free_space(l) >= data_size)
3278                        return 0;
3279        }
3280
3281        if (!path->nodes[1]) {
3282                ret = insert_new_root(trans, root, path, 1);
3283                if (ret)
3284                        return ret;
3285        }
3286again:
3287        split = 1;
3288        l = path->nodes[0];
3289        slot = path->slots[0];
3290        nritems = btrfs_header_nritems(l);
3291        mid = (nritems + 1) / 2;
3292
3293        if (mid <= slot) {
3294                if (nritems == 1 ||
3295                    leaf_space_used(l, mid, nritems - mid) + data_size >
3296                        BTRFS_LEAF_DATA_SIZE(fs_info)) {
3297                        if (slot >= nritems) {
3298                                split = 0;
3299                        } else {
3300                                mid = slot;
3301                                if (mid != nritems &&
3302                                    leaf_space_used(l, mid, nritems - mid) +
3303                                    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3304                                        if (data_size && !tried_avoid_double)
3305                                                goto push_for_double;
3306                                        split = 2;
3307                                }
3308                        }
3309                }
3310        } else {
3311                if (leaf_space_used(l, 0, mid) + data_size >
3312                        BTRFS_LEAF_DATA_SIZE(fs_info)) {
3313                        if (!extend && data_size && slot == 0) {
3314                                split = 0;
3315                        } else if ((extend || !data_size) && slot == 0) {
3316                                mid = 1;
3317                        } else {
3318                                mid = slot;
3319                                if (mid != nritems &&
3320                                    leaf_space_used(l, mid, nritems - mid) +
3321                                    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3322                                        if (data_size && !tried_avoid_double)
3323                                                goto push_for_double;
3324                                        split = 2;
3325                                }
3326                        }
3327                }
3328        }
3329
3330        if (split == 0)
3331                btrfs_cpu_key_to_disk(&disk_key, ins_key);
3332        else
3333                btrfs_item_key(l, &disk_key, mid);
3334
3335        /*
3336         * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
3337         * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3338         * subclasses, which is 8 at the time of this patch, and we've maxed it
3339         * out.  In the future we could add a
3340         * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3341         * use BTRFS_NESTING_NEW_ROOT.
3342         */
3343        right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3344                                       &disk_key, 0, l->start, 0,
3345                                       num_doubles ? BTRFS_NESTING_NEW_ROOT :
3346                                       BTRFS_NESTING_SPLIT);
3347        if (IS_ERR(right))
3348                return PTR_ERR(right);
3349
3350        root_add_used(root, fs_info->nodesize);
3351
3352        if (split == 0) {
3353                if (mid <= slot) {
3354                        btrfs_set_header_nritems(right, 0);
3355                        insert_ptr(trans, path, &disk_key,
3356                                   right->start, path->slots[1] + 1, 1);
3357                        btrfs_tree_unlock(path->nodes[0]);
3358                        free_extent_buffer(path->nodes[0]);
3359                        path->nodes[0] = right;
3360                        path->slots[0] = 0;
3361                        path->slots[1] += 1;
3362                } else {
3363                        btrfs_set_header_nritems(right, 0);
3364                        insert_ptr(trans, path, &disk_key,
3365                                   right->start, path->slots[1], 1);
3366                        btrfs_tree_unlock(path->nodes[0]);
3367                        free_extent_buffer(path->nodes[0]);
3368                        path->nodes[0] = right;
3369                        path->slots[0] = 0;
3370                        if (path->slots[1] == 0)
3371                                fixup_low_keys(path, &disk_key, 1);
3372                }
3373                /*
3374                 * We create a new leaf 'right' for the required ins_len and
3375                 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
3376                 * the content of ins_len to 'right'.
3377                 */
3378                return ret;
3379        }
3380
3381        copy_for_split(trans, path, l, right, slot, mid, nritems);
3382
3383        if (split == 2) {
3384                BUG_ON(num_doubles != 0);
3385                num_doubles++;
3386                goto again;
3387        }
3388
3389        return 0;
3390
3391push_for_double:
3392        push_for_double_split(trans, root, path, data_size);
3393        tried_avoid_double = 1;
3394        if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3395                return 0;
3396        goto again;
3397}
3398
3399static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3400                                         struct btrfs_root *root,
3401                                         struct btrfs_path *path, int ins_len)
3402{
3403        struct btrfs_key key;
3404        struct extent_buffer *leaf;
3405        struct btrfs_file_extent_item *fi;
3406        u64 extent_len = 0;
3407        u32 item_size;
3408        int ret;
3409
3410        leaf = path->nodes[0];
3411        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3412
3413        BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3414               key.type != BTRFS_EXTENT_CSUM_KEY);
3415
3416        if (btrfs_leaf_free_space(leaf) >= ins_len)
3417                return 0;
3418
3419        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3420        if (key.type == BTRFS_EXTENT_DATA_KEY) {
3421                fi = btrfs_item_ptr(leaf, path->slots[0],
3422                                    struct btrfs_file_extent_item);
3423                extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3424        }
3425        btrfs_release_path(path);
3426
3427        path->keep_locks = 1;
3428        path->search_for_split = 1;
3429        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3430        path->search_for_split = 0;
3431        if (ret > 0)
3432                ret = -EAGAIN;
3433        if (ret < 0)
3434                goto err;
3435
3436        ret = -EAGAIN;
3437        leaf = path->nodes[0];
3438        /* if our item isn't there, return now */
3439        if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
3440                goto err;
3441
3442        /* the leaf has  changed, it now has room.  return now */
3443        if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
3444                goto err;
3445
3446        if (key.type == BTRFS_EXTENT_DATA_KEY) {
3447                fi = btrfs_item_ptr(leaf, path->slots[0],
3448                                    struct btrfs_file_extent_item);
3449                if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3450                        goto err;
3451        }
3452
3453        ret = split_leaf(trans, root, &key, path, ins_len, 1);
3454        if (ret)
3455                goto err;
3456
3457        path->keep_locks = 0;
3458        btrfs_unlock_up_safe(path, 1);
3459        return 0;
3460err:
3461        path->keep_locks = 0;
3462        return ret;
3463}
3464
3465static noinline int split_item(struct btrfs_path *path,
3466                               const struct btrfs_key *new_key,
3467                               unsigned long split_offset)
3468{
3469        struct extent_buffer *leaf;
3470        struct btrfs_item *item;
3471        struct btrfs_item *new_item;
3472        int slot;
3473        char *buf;
3474        u32 nritems;
3475        u32 item_size;
3476        u32 orig_offset;
3477        struct btrfs_disk_key disk_key;
3478
3479        leaf = path->nodes[0];
3480        BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
3481
3482        item = btrfs_item_nr(path->slots[0]);
3483        orig_offset = btrfs_item_offset(leaf, item);
3484        item_size = btrfs_item_size(leaf, item);
3485
3486        buf = kmalloc(item_size, GFP_NOFS);
3487        if (!buf)
3488                return -ENOMEM;
3489
3490        read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3491                            path->slots[0]), item_size);
3492
3493        slot = path->slots[0] + 1;
3494        nritems = btrfs_header_nritems(leaf);
3495        if (slot != nritems) {
3496                /* shift the items */
3497                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
3498                                btrfs_item_nr_offset(slot),
3499                                (nritems - slot) * sizeof(struct btrfs_item));
3500        }
3501
3502        btrfs_cpu_key_to_disk(&disk_key, new_key);
3503        btrfs_set_item_key(leaf, &disk_key, slot);
3504
3505        new_item = btrfs_item_nr(slot);
3506
3507        btrfs_set_item_offset(leaf, new_item, orig_offset);
3508        btrfs_set_item_size(leaf, new_item, item_size - split_offset);
3509
3510        btrfs_set_item_offset(leaf, item,
3511                              orig_offset + item_size - split_offset);
3512        btrfs_set_item_size(leaf, item, split_offset);
3513
3514        btrfs_set_header_nritems(leaf, nritems + 1);
3515
3516        /* write the data for the start of the original item */
3517        write_extent_buffer(leaf, buf,
3518                            btrfs_item_ptr_offset(leaf, path->slots[0]),
3519                            split_offset);
3520
3521        /* write the data for the new item */
3522        write_extent_buffer(leaf, buf + split_offset,
3523                            btrfs_item_ptr_offset(leaf, slot),
3524                            item_size - split_offset);
3525        btrfs_mark_buffer_dirty(leaf);
3526
3527        BUG_ON(btrfs_leaf_free_space(leaf) < 0);
3528        kfree(buf);
3529        return 0;
3530}
3531
3532/*
3533 * This function splits a single item into two items,
3534 * giving 'new_key' to the new item and splitting the
3535 * old one at split_offset (from the start of the item).
3536 *
3537 * The path may be released by this operation.  After
3538 * the split, the path is pointing to the old item.  The
3539 * new item is going to be in the same node as the old one.
3540 *
3541 * Note, the item being split must be smaller enough to live alone on
3542 * a tree block with room for one extra struct btrfs_item
3543 *
3544 * This allows us to split the item in place, keeping a lock on the
3545 * leaf the entire time.
3546 */
3547int btrfs_split_item(struct btrfs_trans_handle *trans,
3548                     struct btrfs_root *root,
3549                     struct btrfs_path *path,
3550                     const struct btrfs_key *new_key,
3551                     unsigned long split_offset)
3552{
3553        int ret;
3554        ret = setup_leaf_for_split(trans, root, path,
3555                                   sizeof(struct btrfs_item));
3556        if (ret)
3557                return ret;
3558
3559        ret = split_item(path, new_key, split_offset);
3560        return ret;
3561}
3562
3563/*
3564 * This function duplicate a item, giving 'new_key' to the new item.
3565 * It guarantees both items live in the same tree leaf and the new item
3566 * is contiguous with the original item.
3567 *
3568 * This allows us to split file extent in place, keeping a lock on the
3569 * leaf the entire time.
3570 */
3571int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
3572                         struct btrfs_root *root,
3573                         struct btrfs_path *path,
3574                         const struct btrfs_key *new_key)
3575{
3576        struct extent_buffer *leaf;
3577        int ret;
3578        u32 item_size;
3579
3580        leaf = path->nodes[0];
3581        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3582        ret = setup_leaf_for_split(trans, root, path,
3583                                   item_size + sizeof(struct btrfs_item));
3584        if (ret)
3585                return ret;
3586
3587        path->slots[0]++;
3588        setup_items_for_insert(root, path, new_key, &item_size, 1);
3589        leaf = path->nodes[0];
3590        memcpy_extent_buffer(leaf,
3591                             btrfs_item_ptr_offset(leaf, path->slots[0]),
3592                             btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
3593                             item_size);
3594        return 0;
3595}
3596
3597/*
3598 * make the item pointed to by the path smaller.  new_size indicates
3599 * how small to make it, and from_end tells us if we just chop bytes
3600 * off the end of the item or if we shift the item to chop bytes off
3601 * the front.
3602 */
3603void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
3604{
3605        int slot;
3606        struct extent_buffer *leaf;
3607        struct btrfs_item *item;
3608        u32 nritems;
3609        unsigned int data_end;
3610        unsigned int old_data_start;
3611        unsigned int old_size;
3612        unsigned int size_diff;
3613        int i;
3614        struct btrfs_map_token token;
3615
3616        leaf = path->nodes[0];
3617        slot = path->slots[0];
3618
3619        old_size = btrfs_item_size_nr(leaf, slot);
3620        if (old_size == new_size)
3621                return;
3622
3623        nritems = btrfs_header_nritems(leaf);
3624        data_end = leaf_data_end(leaf);
3625
3626        old_data_start = btrfs_item_offset_nr(leaf, slot);
3627
3628        size_diff = old_size - new_size;
3629
3630        BUG_ON(slot < 0);
3631        BUG_ON(slot >= nritems);
3632
3633        /*
3634         * item0..itemN ... dataN.offset..dataN.size .. data0.size
3635         */
3636        /* first correct the data pointers */
3637        btrfs_init_map_token(&token, leaf);
3638        for (i = slot; i < nritems; i++) {
3639                u32 ioff;
3640                item = btrfs_item_nr(i);
3641
3642                ioff = btrfs_token_item_offset(&token, item);
3643                btrfs_set_token_item_offset(&token, item, ioff + size_diff);
3644        }
3645
3646        /* shift the data */
3647        if (from_end) {
3648                memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3649                              data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
3650                              data_end, old_data_start + new_size - data_end);
3651        } else {
3652                struct btrfs_disk_key disk_key;
3653                u64 offset;
3654
3655                btrfs_item_key(leaf, &disk_key, slot);
3656
3657                if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
3658                        unsigned long ptr;
3659                        struct btrfs_file_extent_item *fi;
3660
3661                        fi = btrfs_item_ptr(leaf, slot,
3662                                            struct btrfs_file_extent_item);
3663                        fi = (struct btrfs_file_extent_item *)(
3664                             (unsigned long)fi - size_diff);
3665
3666                        if (btrfs_file_extent_type(leaf, fi) ==
3667                            BTRFS_FILE_EXTENT_INLINE) {
3668                                ptr = btrfs_item_ptr_offset(leaf, slot);
3669                                memmove_extent_buffer(leaf, ptr,
3670                                      (unsigned long)fi,
3671                                      BTRFS_FILE_EXTENT_INLINE_DATA_START);
3672                        }
3673                }
3674
3675                memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3676                              data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
3677                              data_end, old_data_start - data_end);
3678
3679                offset = btrfs_disk_key_offset(&disk_key);
3680                btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
3681                btrfs_set_item_key(leaf, &disk_key, slot);
3682                if (slot == 0)
3683                        fixup_low_keys(path, &disk_key, 1);
3684        }
3685
3686        item = btrfs_item_nr(slot);
3687        btrfs_set_item_size(leaf, item, new_size);
3688        btrfs_mark_buffer_dirty(leaf);
3689
3690        if (btrfs_leaf_free_space(leaf) < 0) {
3691                btrfs_print_leaf(leaf);
3692                BUG();
3693        }
3694}
3695
3696/*
3697 * make the item pointed to by the path bigger, data_size is the added size.
3698 */
3699void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
3700{
3701        int slot;
3702        struct extent_buffer *leaf;
3703        struct btrfs_item *item;
3704        u32 nritems;
3705        unsigned int data_end;
3706        unsigned int old_data;
3707        unsigned int old_size;
3708        int i;
3709        struct btrfs_map_token token;
3710
3711        leaf = path->nodes[0];
3712
3713        nritems = btrfs_header_nritems(leaf);
3714        data_end = leaf_data_end(leaf);
3715
3716        if (btrfs_leaf_free_space(leaf) < data_size) {
3717                btrfs_print_leaf(leaf);
3718                BUG();
3719        }
3720        slot = path->slots[0];
3721        old_data = btrfs_item_end_nr(leaf, slot);
3722
3723        BUG_ON(slot < 0);
3724        if (slot >= nritems) {
3725                btrfs_print_leaf(leaf);
3726                btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
3727                           slot, nritems);
3728                BUG();
3729        }
3730
3731        /*
3732         * item0..itemN ... dataN.offset..dataN.size .. data0.size
3733         */
3734        /* first correct the data pointers */
3735        btrfs_init_map_token(&token, leaf);
3736        for (i = slot; i < nritems; i++) {
3737                u32 ioff;
3738                item = btrfs_item_nr(i);
3739
3740                ioff = btrfs_token_item_offset(&token, item);
3741                btrfs_set_token_item_offset(&token, item, ioff - data_size);
3742        }
3743
3744        /* shift the data */
3745        memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3746                      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
3747                      data_end, old_data - data_end);
3748
3749        data_end = old_data;
3750        old_size = btrfs_item_size_nr(leaf, slot);
3751        item = btrfs_item_nr(slot);
3752        btrfs_set_item_size(leaf, item, old_size + data_size);
3753        btrfs_mark_buffer_dirty(leaf);
3754
3755        if (btrfs_leaf_free_space(leaf) < 0) {
3756                btrfs_print_leaf(leaf);
3757                BUG();
3758        }
3759}
3760
3761/**
3762 * setup_items_for_insert - Helper called before inserting one or more items
3763 * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
3764 * in a function that doesn't call btrfs_search_slot
3765 *
3766 * @root:       root we are inserting items to
3767 * @path:       points to the leaf/slot where we are going to insert new items
3768 * @cpu_key:    array of keys for items to be inserted
3769 * @data_size:  size of the body of each item we are going to insert
3770 * @nr:         size of @cpu_key/@data_size arrays
3771 */
3772void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
3773                            const struct btrfs_key *cpu_key, u32 *data_size,
3774                            int nr)
3775{
3776        struct btrfs_fs_info *fs_info = root->fs_info;
3777        struct btrfs_item *item;
3778        int i;
3779        u32 nritems;
3780        unsigned int data_end;
3781        struct btrfs_disk_key disk_key;
3782        struct extent_buffer *leaf;
3783        int slot;
3784        struct btrfs_map_token token;
3785        u32 total_size;
3786        u32 total_data = 0;
3787
3788        for (i = 0; i < nr; i++)
3789                total_data += data_size[i];
3790        total_size = total_data + (nr * sizeof(struct btrfs_item));
3791
3792        if (path->slots[0] == 0) {
3793                btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3794                fixup_low_keys(path, &disk_key, 1);
3795        }
3796        btrfs_unlock_up_safe(path, 1);
3797
3798        leaf = path->nodes[0];
3799        slot = path->slots[0];
3800
3801        nritems = btrfs_header_nritems(leaf);
3802        data_end = leaf_data_end(leaf);
3803
3804        if (btrfs_leaf_free_space(leaf) < total_size) {
3805                btrfs_print_leaf(leaf);
3806                btrfs_crit(fs_info, "not enough freespace need %u have %d",
3807                           total_size, btrfs_leaf_free_space(leaf));
3808                BUG();
3809        }
3810
3811        btrfs_init_map_token(&token, leaf);
3812        if (slot != nritems) {
3813                unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3814
3815                if (old_data < data_end) {
3816                        btrfs_print_leaf(leaf);
3817                        btrfs_crit(fs_info,
3818                "item at slot %d with data offset %u beyond data end of leaf %u",
3819                                   slot, old_data, data_end);
3820                        BUG();
3821                }
3822                /*
3823                 * item0..itemN ... dataN.offset..dataN.size .. data0.size
3824                 */
3825                /* first correct the data pointers */
3826                for (i = slot; i < nritems; i++) {
3827                        u32 ioff;
3828
3829                        item = btrfs_item_nr(i);
3830                        ioff = btrfs_token_item_offset(&token, item);
3831                        btrfs_set_token_item_offset(&token, item,
3832                                                    ioff - total_data);
3833                }
3834                /* shift the items */
3835                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3836                              btrfs_item_nr_offset(slot),
3837                              (nritems - slot) * sizeof(struct btrfs_item));
3838
3839                /* shift the data */
3840                memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
3841                              data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
3842                              data_end, old_data - data_end);
3843                data_end = old_data;
3844        }
3845
3846        /* setup the item for the new data */
3847        for (i = 0; i < nr; i++) {
3848                btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
3849                btrfs_set_item_key(leaf, &disk_key, slot + i);
3850                item = btrfs_item_nr(slot + i);
3851                data_end -= data_size[i];
3852                btrfs_set_token_item_offset(&token, item, data_end);
3853                btrfs_set_token_item_size(&token, item, data_size[i]);
3854        }
3855
3856        btrfs_set_header_nritems(leaf, nritems + nr);
3857        btrfs_mark_buffer_dirty(leaf);
3858
3859        if (btrfs_leaf_free_space(leaf) < 0) {
3860                btrfs_print_leaf(leaf);
3861                BUG();
3862        }
3863}
3864
3865/*
3866 * Given a key and some data, insert items into the tree.
3867 * This does all the path init required, making room in the tree if needed.
3868 */
3869int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3870                            struct btrfs_root *root,
3871                            struct btrfs_path *path,
3872                            const struct btrfs_key *cpu_key, u32 *data_size,
3873                            int nr)
3874{
3875        int ret = 0;
3876        int slot;
3877        int i;
3878        u32 total_size = 0;
3879        u32 total_data = 0;
3880
3881        for (i = 0; i < nr; i++)
3882                total_data += data_size[i];
3883
3884        total_size = total_data + (nr * sizeof(struct btrfs_item));
3885        ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
3886        if (ret == 0)
3887                return -EEXIST;
3888        if (ret < 0)
3889                return ret;
3890
3891        slot = path->slots[0];
3892        BUG_ON(slot < 0);
3893
3894        setup_items_for_insert(root, path, cpu_key, data_size, nr);
3895        return 0;
3896}
3897
3898/*
3899 * Given a key and some data, insert an item into the tree.
3900 * This does all the path init required, making room in the tree if needed.
3901 */
3902int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3903                      const struct btrfs_key *cpu_key, void *data,
3904                      u32 data_size)
3905{
3906        int ret = 0;
3907        struct btrfs_path *path;
3908        struct extent_buffer *leaf;
3909        unsigned long ptr;
3910
3911        path = btrfs_alloc_path();
3912        if (!path)
3913                return -ENOMEM;
3914        ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3915        if (!ret) {
3916                leaf = path->nodes[0];
3917                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3918                write_extent_buffer(leaf, data, ptr, data_size);
3919                btrfs_mark_buffer_dirty(leaf);
3920        }
3921        btrfs_free_path(path);
3922        return ret;
3923}
3924
3925/*
3926 * delete the pointer from a given node.
3927 *
3928 * the tree should have been previously balanced so the deletion does not
3929 * empty a node.
3930 */
3931static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
3932                    int level, int slot)
3933{
3934        struct extent_buffer *parent = path->nodes[level];
3935        u32 nritems;
3936        int ret;
3937
3938        nritems = btrfs_header_nritems(parent);
3939        if (slot != nritems - 1) {
3940                if (level) {
3941                        ret = btrfs_tree_mod_log_insert_move(parent, slot,
3942                                        slot + 1, nritems - slot - 1);
3943                        BUG_ON(ret < 0);
3944                }
3945                memmove_extent_buffer(parent,
3946                              btrfs_node_key_ptr_offset(slot),
3947                              btrfs_node_key_ptr_offset(slot + 1),
3948                              sizeof(struct btrfs_key_ptr) *
3949                              (nritems - slot - 1));
3950        } else if (level) {
3951                ret = btrfs_tree_mod_log_insert_key(parent, slot,
3952                                BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
3953                BUG_ON(ret < 0);
3954        }
3955
3956        nritems--;
3957        btrfs_set_header_nritems(parent, nritems);
3958        if (nritems == 0 && parent == root->node) {
3959                BUG_ON(btrfs_header_level(root->node) != 1);
3960                /* just turn the root into a leaf and break */
3961                btrfs_set_header_level(root->node, 0);
3962        } else if (slot == 0) {
3963                struct btrfs_disk_key disk_key;
3964
3965                btrfs_node_key(parent, &disk_key, 0);
3966                fixup_low_keys(path, &disk_key, level + 1);
3967        }
3968        btrfs_mark_buffer_dirty(parent);
3969}
3970
3971/*
3972 * a helper function to delete the leaf pointed to by path->slots[1] and
3973 * path->nodes[1].
3974 *
3975 * This deletes the pointer in path->nodes[1] and frees the leaf
3976 * block extent.  zero is returned if it all worked out, < 0 otherwise.
3977 *
3978 * The path must have already been setup for deleting the leaf, including
3979 * all the proper balancing.  path->nodes[1] must be locked.
3980 */
3981static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
3982                                    struct btrfs_root *root,
3983                                    struct btrfs_path *path,
3984                                    struct extent_buffer *leaf)
3985{
3986        WARN_ON(btrfs_header_generation(leaf) != trans->transid);
3987        del_ptr(root, path, 1, path->slots[1]);
3988
3989        /*
3990         * btrfs_free_extent is expensive, we want to make sure we
3991         * aren't holding any locks when we call it
3992         */
3993        btrfs_unlock_up_safe(path, 0);
3994
3995        root_sub_used(root, leaf->len);
3996
3997        atomic_inc(&leaf->refs);
3998        btrfs_free_tree_block(trans, root, leaf, 0, 1);
3999        free_extent_buffer_stale(leaf);
4000}
4001/*
4002 * delete the item at the leaf level in path.  If that empties
4003 * the leaf, remove it from the tree
4004 */
4005int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4006                    struct btrfs_path *path, int slot, int nr)
4007{
4008        struct btrfs_fs_info *fs_info = root->fs_info;
4009        struct extent_buffer *leaf;
4010        struct btrfs_item *item;
4011        u32 last_off;
4012        u32 dsize = 0;
4013        int ret = 0;
4014        int wret;
4015        int i;
4016        u32 nritems;
4017
4018        leaf = path->nodes[0];
4019        last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
4020
4021        for (i = 0; i < nr; i++)
4022                dsize += btrfs_item_size_nr(leaf, slot + i);
4023
4024        nritems = btrfs_header_nritems(leaf);
4025
4026        if (slot + nr != nritems) {
4027                int data_end = leaf_data_end(leaf);
4028                struct btrfs_map_token token;
4029
4030                memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4031                              data_end + dsize,
4032                              BTRFS_LEAF_DATA_OFFSET + data_end,
4033                              last_off - data_end);
4034
4035                btrfs_init_map_token(&token, leaf);
4036                for (i = slot + nr; i < nritems; i++) {
4037                        u32 ioff;
4038
4039                        item = btrfs_item_nr(i);
4040                        ioff = btrfs_token_item_offset(&token, item);
4041                        btrfs_set_token_item_offset(&token, item, ioff + dsize);
4042                }
4043
4044                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
4045                              btrfs_item_nr_offset(slot + nr),
4046                              sizeof(struct btrfs_item) *
4047                              (nritems - slot - nr));
4048        }
4049        btrfs_set_header_nritems(leaf, nritems - nr);
4050        nritems -= nr;
4051
4052        /* delete the leaf if we've emptied it */
4053        if (nritems == 0) {
4054                if (leaf == root->node) {
4055                        btrfs_set_header_level(leaf, 0);
4056                } else {
4057                        btrfs_clean_tree_block(leaf);
4058                        btrfs_del_leaf(trans, root, path, leaf);
4059                }
4060        } else {
4061                int used = leaf_space_used(leaf, 0, nritems);
4062                if (slot == 0) {
4063                        struct btrfs_disk_key disk_key;
4064
4065                        btrfs_item_key(leaf, &disk_key, 0);
4066                        fixup_low_keys(path, &disk_key, 1);
4067                }
4068
4069                /* delete the leaf if it is mostly empty */
4070                if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
4071                        /* push_leaf_left fixes the path.
4072                         * make sure the path still points to our leaf
4073                         * for possible call to del_ptr below
4074                         */
4075                        slot = path->slots[1];
4076                        atomic_inc(&leaf->refs);
4077
4078                        wret = push_leaf_left(trans, root, path, 1, 1,
4079                                              1, (u32)-1);
4080                        if (wret < 0 && wret != -ENOSPC)
4081                                ret = wret;
4082
4083                        if (path->nodes[0] == leaf &&
4084                            btrfs_header_nritems(leaf)) {
4085                                wret = push_leaf_right(trans, root, path, 1,
4086                                                       1, 1, 0);
4087                                if (wret < 0 && wret != -ENOSPC)
4088                                        ret = wret;
4089                        }
4090
4091                        if (btrfs_header_nritems(leaf) == 0) {
4092                                path->slots[1] = slot;
4093                                btrfs_del_leaf(trans, root, path, leaf);
4094                                free_extent_buffer(leaf);
4095                                ret = 0;
4096                        } else {
4097                                /* if we're still in the path, make sure
4098                                 * we're dirty.  Otherwise, one of the
4099                                 * push_leaf functions must have already
4100                                 * dirtied this buffer
4101                                 */
4102                                if (path->nodes[0] == leaf)
4103                                        btrfs_mark_buffer_dirty(leaf);
4104                                free_extent_buffer(leaf);
4105                        }
4106                } else {
4107                        btrfs_mark_buffer_dirty(leaf);
4108                }
4109        }
4110        return ret;
4111}
4112
4113/*
4114 * search the tree again to find a leaf with lesser keys
4115 * returns 0 if it found something or 1 if there are no lesser leaves.
4116 * returns < 0 on io errors.
4117 *
4118 * This may release the path, and so you may lose any locks held at the
4119 * time you call it.
4120 */
4121int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4122{
4123        struct btrfs_key key;
4124        struct btrfs_disk_key found_key;
4125        int ret;
4126
4127        btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
4128
4129        if (key.offset > 0) {
4130                key.offset--;
4131        } else if (key.type > 0) {
4132                key.type--;
4133                key.offset = (u64)-1;
4134        } else if (key.objectid > 0) {
4135                key.objectid--;
4136                key.type = (u8)-1;
4137                key.offset = (u64)-1;
4138        } else {
4139                return 1;
4140        }
4141
4142        btrfs_release_path(path);
4143        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4144        if (ret < 0)
4145                return ret;
4146        btrfs_item_key(path->nodes[0], &found_key, 0);
4147        ret = comp_keys(&found_key, &key);
4148        /*
4149         * We might have had an item with the previous key in the tree right
4150         * before we released our path. And after we released our path, that
4151         * item might have been pushed to the first slot (0) of the leaf we
4152         * were holding due to a tree balance. Alternatively, an item with the
4153         * previous key can exist as the only element of a leaf (big fat item).
4154         * Therefore account for these 2 cases, so that our callers (like
4155         * btrfs_previous_item) don't miss an existing item with a key matching
4156         * the previous key we computed above.
4157         */
4158        if (ret <= 0)
4159                return 0;
4160        return 1;
4161}
4162
4163/*
4164 * A helper function to walk down the tree starting at min_key, and looking
4165 * for nodes or leaves that are have a minimum transaction id.
4166 * This is used by the btree defrag code, and tree logging
4167 *
4168 * This does not cow, but it does stuff the starting key it finds back
4169 * into min_key, so you can call btrfs_search_slot with cow=1 on the
4170 * key and get a writable path.
4171 *
4172 * This honors path->lowest_level to prevent descent past a given level
4173 * of the tree.
4174 *
4175 * min_trans indicates the oldest transaction that you are interested
4176 * in walking through.  Any nodes or leaves older than min_trans are
4177 * skipped over (without reading them).
4178 *
4179 * returns zero if something useful was found, < 0 on error and 1 if there
4180 * was nothing in the tree that matched the search criteria.
4181 */
4182int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4183                         struct btrfs_path *path,
4184                         u64 min_trans)
4185{
4186        struct extent_buffer *cur;
4187        struct btrfs_key found_key;
4188        int slot;
4189        int sret;
4190        u32 nritems;
4191        int level;
4192        int ret = 1;
4193        int keep_locks = path->keep_locks;
4194
4195        path->keep_locks = 1;
4196again:
4197        cur = btrfs_read_lock_root_node(root);
4198        level = btrfs_header_level(cur);
4199        WARN_ON(path->nodes[level]);
4200        path->nodes[level] = cur;
4201        path->locks[level] = BTRFS_READ_LOCK;
4202
4203        if (btrfs_header_generation(cur) < min_trans) {
4204                ret = 1;
4205                goto out;
4206        }
4207        while (1) {
4208                nritems = btrfs_header_nritems(cur);
4209                level = btrfs_header_level(cur);
4210                sret = btrfs_bin_search(cur, min_key, &slot);
4211                if (sret < 0) {
4212                        ret = sret;
4213                        goto out;
4214                }
4215
4216                /* at the lowest level, we're done, setup the path and exit */
4217                if (level == path->lowest_level) {
4218                        if (slot >= nritems)
4219                                goto find_next_key;
4220                        ret = 0;
4221                        path->slots[level] = slot;
4222                        btrfs_item_key_to_cpu(cur, &found_key, slot);
4223                        goto out;
4224                }
4225                if (sret && slot > 0)
4226                        slot--;
4227                /*
4228                 * check this node pointer against the min_trans parameters.
4229                 * If it is too old, skip to the next one.
4230                 */
4231                while (slot < nritems) {
4232                        u64 gen;
4233
4234                        gen = btrfs_node_ptr_generation(cur, slot);
4235                        if (gen < min_trans) {
4236                                slot++;
4237                                continue;
4238                        }
4239                        break;
4240                }
4241find_next_key:
4242                /*
4243                 * we didn't find a candidate key in this node, walk forward
4244                 * and find another one
4245                 */
4246                if (slot >= nritems) {
4247                        path->slots[level] = slot;
4248                        sret = btrfs_find_next_key(root, path, min_key, level,
4249                                                  min_trans);
4250                        if (sret == 0) {
4251                                btrfs_release_path(path);
4252                                goto again;
4253                        } else {
4254                                goto out;
4255                        }
4256                }
4257                /* save our key for returning back */
4258                btrfs_node_key_to_cpu(cur, &found_key, slot);
4259                path->slots[level] = slot;
4260                if (level == path->lowest_level) {
4261                        ret = 0;
4262                        goto out;
4263                }
4264                cur = btrfs_read_node_slot(cur, slot);
4265                if (IS_ERR(cur)) {
4266                        ret = PTR_ERR(cur);
4267                        goto out;
4268                }
4269
4270                btrfs_tree_read_lock(cur);
4271
4272                path->locks[level - 1] = BTRFS_READ_LOCK;
4273                path->nodes[level - 1] = cur;
4274                unlock_up(path, level, 1, 0, NULL);
4275        }
4276out:
4277        path->keep_locks = keep_locks;
4278        if (ret == 0) {
4279                btrfs_unlock_up_safe(path, path->lowest_level + 1);
4280                memcpy(min_key, &found_key, sizeof(found_key));
4281        }
4282        return ret;
4283}
4284
4285/*
4286 * this is similar to btrfs_next_leaf, but does not try to preserve
4287 * and fixup the path.  It looks for and returns the next key in the
4288 * tree based on the current path and the min_trans parameters.
4289 *
4290 * 0 is returned if another key is found, < 0 if there are any errors
4291 * and 1 is returned if there are no higher keys in the tree
4292 *
4293 * path->keep_locks should be set to 1 on the search made before
4294 * calling this function.
4295 */
4296int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4297                        struct btrfs_key *key, int level, u64 min_trans)
4298{
4299        int slot;
4300        struct extent_buffer *c;
4301
4302        WARN_ON(!path->keep_locks && !path->skip_locking);
4303        while (level < BTRFS_MAX_LEVEL) {
4304                if (!path->nodes[level])
4305                        return 1;
4306
4307                slot = path->slots[level] + 1;
4308                c = path->nodes[level];
4309next:
4310                if (slot >= btrfs_header_nritems(c)) {
4311                        int ret;
4312                        int orig_lowest;
4313                        struct btrfs_key cur_key;
4314                        if (level + 1 >= BTRFS_MAX_LEVEL ||
4315                            !path->nodes[level + 1])
4316                                return 1;
4317
4318                        if (path->locks[level + 1] || path->skip_locking) {
4319                                level++;
4320                                continue;
4321                        }
4322
4323                        slot = btrfs_header_nritems(c) - 1;
4324                        if (level == 0)
4325                                btrfs_item_key_to_cpu(c, &cur_key, slot);
4326                        else
4327                                btrfs_node_key_to_cpu(c, &cur_key, slot);
4328
4329                        orig_lowest = path->lowest_level;
4330                        btrfs_release_path(path);
4331                        path->lowest_level = level;
4332                        ret = btrfs_search_slot(NULL, root, &cur_key, path,
4333                                                0, 0);
4334                        path->lowest_level = orig_lowest;
4335                        if (ret < 0)
4336                                return ret;
4337
4338                        c = path->nodes[level];
4339                        slot = path->slots[level];
4340                        if (ret == 0)
4341                                slot++;
4342                        goto next;
4343                }
4344
4345                if (level == 0)
4346                        btrfs_item_key_to_cpu(c, key, slot);
4347                else {
4348                        u64 gen = btrfs_node_ptr_generation(c, slot);
4349
4350                        if (gen < min_trans) {
4351                                slot++;
4352                                goto next;
4353                        }
4354                        btrfs_node_key_to_cpu(c, key, slot);
4355                }
4356                return 0;
4357        }
4358        return 1;
4359}
4360
4361/*
4362 * search the tree again to find a leaf with greater keys
4363 * returns 0 if it found something or 1 if there are no greater leaves.
4364 * returns < 0 on io errors.
4365 */
4366int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4367{
4368        return btrfs_next_old_leaf(root, path, 0);
4369}
4370
4371int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
4372                        u64 time_seq)
4373{
4374        int slot;
4375        int level;
4376        struct extent_buffer *c;
4377        struct extent_buffer *next;
4378        struct btrfs_key key;
4379        u32 nritems;
4380        int ret;
4381        int i;
4382
4383        nritems = btrfs_header_nritems(path->nodes[0]);
4384        if (nritems == 0)
4385                return 1;
4386
4387        btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4388again:
4389        level = 1;
4390        next = NULL;
4391        btrfs_release_path(path);
4392
4393        path->keep_locks = 1;
4394
4395        if (time_seq)
4396                ret = btrfs_search_old_slot(root, &key, path, time_seq);
4397        else
4398                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4399        path->keep_locks = 0;
4400
4401        if (ret < 0)
4402                return ret;
4403
4404        nritems = btrfs_header_nritems(path->nodes[0]);
4405        /*
4406         * by releasing the path above we dropped all our locks.  A balance
4407         * could have added more items next to the key that used to be
4408         * at the very end of the block.  So, check again here and
4409         * advance the path if there are now more items available.
4410         */
4411        if (nritems > 0 && path->slots[0] < nritems - 1) {
4412                if (ret == 0)
4413                        path->slots[0]++;
4414                ret = 0;
4415                goto done;
4416        }
4417        /*
4418         * So the above check misses one case:
4419         * - after releasing the path above, someone has removed the item that
4420         *   used to be at the very end of the block, and balance between leafs
4421         *   gets another one with bigger key.offset to replace it.
4422         *
4423         * This one should be returned as well, or we can get leaf corruption
4424         * later(esp. in __btrfs_drop_extents()).
4425         *
4426         * And a bit more explanation about this check,
4427         * with ret > 0, the key isn't found, the path points to the slot
4428         * where it should be inserted, so the path->slots[0] item must be the
4429         * bigger one.
4430         */
4431        if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
4432                ret = 0;
4433                goto done;
4434        }
4435
4436        while (level < BTRFS_MAX_LEVEL) {
4437                if (!path->nodes[level]) {
4438                        ret = 1;
4439                        goto done;
4440                }
4441
4442                slot = path->slots[level] + 1;
4443                c = path->nodes[level];
4444                if (slot >= btrfs_header_nritems(c)) {
4445                        level++;
4446                        if (level == BTRFS_MAX_LEVEL) {
4447                                ret = 1;
4448                                goto done;
4449                        }
4450                        continue;
4451                }
4452
4453
4454                /*
4455                 * Our current level is where we're going to start from, and to
4456                 * make sure lockdep doesn't complain we need to drop our locks
4457                 * and nodes from 0 to our current level.
4458                 */
4459                for (i = 0; i < level; i++) {
4460                        if (path->locks[level]) {
4461                                btrfs_tree_read_unlock(path->nodes[i]);
4462                                path->locks[i] = 0;
4463                        }
4464                        free_extent_buffer(path->nodes[i]);
4465                        path->nodes[i] = NULL;
4466                }
4467
4468                next = c;
4469                ret = read_block_for_search(root, path, &next, level,
4470                                            slot, &key);
4471                if (ret == -EAGAIN)
4472                        goto again;
4473
4474                if (ret < 0) {
4475                        btrfs_release_path(path);
4476                        goto done;
4477                }
4478
4479                if (!path->skip_locking) {
4480                        ret = btrfs_try_tree_read_lock(next);
4481                        if (!ret && time_seq) {
4482                                /*
4483                                 * If we don't get the lock, we may be racing
4484                                 * with push_leaf_left, holding that lock while
4485                                 * itself waiting for the leaf we've currently
4486                                 * locked. To solve this situation, we give up
4487                                 * on our lock and cycle.
4488                                 */
4489                                free_extent_buffer(next);
4490                                btrfs_release_path(path);
4491                                cond_resched();
4492                                goto again;
4493                        }
4494                        if (!ret)
4495                                btrfs_tree_read_lock(next);
4496                }
4497                break;
4498        }
4499        path->slots[level] = slot;
4500        while (1) {
4501                level--;
4502                path->nodes[level] = next;
4503                path->slots[level] = 0;
4504                if (!path->skip_locking)
4505                        path->locks[level] = BTRFS_READ_LOCK;
4506                if (!level)
4507                        break;
4508
4509                ret = read_block_for_search(root, path, &next, level,
4510                                            0, &key);
4511                if (ret == -EAGAIN)
4512                        goto again;
4513
4514                if (ret < 0) {
4515                        btrfs_release_path(path);
4516                        goto done;
4517                }
4518
4519                if (!path->skip_locking)
4520                        btrfs_tree_read_lock(next);
4521        }
4522        ret = 0;
4523done:
4524        unlock_up(path, 0, 1, 0, NULL);
4525
4526        return ret;
4527}
4528
4529/*
4530 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
4531 * searching until it gets past min_objectid or finds an item of 'type'
4532 *
4533 * returns 0 if something is found, 1 if nothing was found and < 0 on error
4534 */
4535int btrfs_previous_item(struct btrfs_root *root,
4536                        struct btrfs_path *path, u64 min_objectid,
4537                        int type)
4538{
4539        struct btrfs_key found_key;
4540        struct extent_buffer *leaf;
4541        u32 nritems;
4542        int ret;
4543
4544        while (1) {
4545                if (path->slots[0] == 0) {
4546                        ret = btrfs_prev_leaf(root, path);
4547                        if (ret != 0)
4548                                return ret;
4549                } else {
4550                        path->slots[0]--;
4551                }
4552                leaf = path->nodes[0];
4553                nritems = btrfs_header_nritems(leaf);
4554                if (nritems == 0)
4555                        return 1;
4556                if (path->slots[0] == nritems)
4557                        path->slots[0]--;
4558
4559                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4560                if (found_key.objectid < min_objectid)
4561                        break;
4562                if (found_key.type == type)
4563                        return 0;
4564                if (found_key.objectid == min_objectid &&
4565                    found_key.type < type)
4566                        break;
4567        }
4568        return 1;
4569}
4570
4571/*
4572 * search in extent tree to find a previous Metadata/Data extent item with
4573 * min objecitd.
4574 *
4575 * returns 0 if something is found, 1 if nothing was found and < 0 on error
4576 */
4577int btrfs_previous_extent_item(struct btrfs_root *root,
4578                        struct btrfs_path *path, u64 min_objectid)
4579{
4580        struct btrfs_key found_key;
4581        struct extent_buffer *leaf;
4582        u32 nritems;
4583        int ret;
4584
4585        while (1) {
4586                if (path->slots[0] == 0) {
4587                        ret = btrfs_prev_leaf(root, path);
4588                        if (ret != 0)
4589                                return ret;
4590                } else {
4591                        path->slots[0]--;
4592                }
4593                leaf = path->nodes[0];
4594                nritems = btrfs_header_nritems(leaf);
4595                if (nritems == 0)
4596                        return 1;
4597                if (path->slots[0] == nritems)
4598                        path->slots[0]--;
4599
4600                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4601                if (found_key.objectid < min_objectid)
4602                        break;
4603                if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
4604                    found_key.type == BTRFS_METADATA_ITEM_KEY)
4605                        return 0;
4606                if (found_key.objectid == min_objectid &&
4607                    found_key.type < BTRFS_EXTENT_ITEM_KEY)
4608                        break;
4609        }
4610        return 1;
4611}
4612