/* fs/btrfs/ctree.c */
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"

/*
 * Forward declarations for tree balancing helpers defined later in this
 * file.
 */
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);

/*
 * Table of supported checksum algorithms, indexed by the on-disk
 * BTRFS_CSUM_TYPE_* value.  @size is the digest length in bytes, @name
 * the user-visible algorithm name, and @driver the crypto driver name
 * when it differs from @name (empty string otherwise; see
 * btrfs_super_csum_driver()).
 */
static const struct btrfs_csums {
	u16		size;
	const char	name[10];
	const char	driver[12];
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};
  43
  44int btrfs_super_csum_size(const struct btrfs_super_block *s)
  45{
  46        u16 t = btrfs_super_csum_type(s);
  47        /*
  48         * csum type is validated at mount time
  49         */
  50        return btrfs_csums[t].size;
  51}
  52
  53const char *btrfs_super_csum_name(u16 csum_type)
  54{
  55        /* csum type is validated at mount time */
  56        return btrfs_csums[csum_type].name;
  57}
  58
  59/*
  60 * Return driver name if defined, otherwise the name that's also a valid driver
  61 * name
  62 */
  63const char *btrfs_super_csum_driver(u16 csum_type)
  64{
  65        /* csum type is validated at mount time */
  66        return btrfs_csums[csum_type].driver[0] ?
  67                btrfs_csums[csum_type].driver :
  68                btrfs_csums[csum_type].name;
  69}
  70
/* Return the number of entries in the checksum algorithm table. */
size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}
  75
  76struct btrfs_path *btrfs_alloc_path(void)
  77{
  78        return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
  79}
  80
  81/* this also releases the path */
  82void btrfs_free_path(struct btrfs_path *p)
  83{
  84        if (!p)
  85                return;
  86        btrfs_release_path(p);
  87        kmem_cache_free(btrfs_path_cachep, p);
  88}
  89
  90/*
  91 * path release drops references on the extent buffers in the path
  92 * and it drops any locks held by this path
  93 *
  94 * It is safe to call this on paths that no locks or extent buffers held.
  95 */
  96noinline void btrfs_release_path(struct btrfs_path *p)
  97{
  98        int i;
  99
 100        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
 101                p->slots[i] = 0;
 102                if (!p->nodes[i])
 103                        continue;
 104                if (p->locks[i]) {
 105                        btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
 106                        p->locks[i] = 0;
 107                }
 108                free_extent_buffer(p->nodes[i]);
 109                p->nodes[i] = NULL;
 110        }
 111}
 112
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		/* Wait for the pending free to finish, then retry. */
		synchronize_rcu();
	}
	return eb;
}
 146
/*
 * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
 * just get put onto a simple dirty list.  Transaction walks this list to make
 * sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	/* Fast path: already dirty, or this root is not tracked at all. */
	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	/* Recheck under the lock; only the winner moves the root. */
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}
 172
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* Shareable roots may only be copied within their own transaction. */
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	/* The first key lives in an item for leaves, a pointer for nodes. */
	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* Copy the whole block, then fix up the header for its new home. */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* Take references on everything the copied root points to. */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	/*
	 * NOTE(review): on this error path 'cow' is neither unlocked nor
	 * freed — looks like a leak of the freshly allocated block; verify.
	 */
	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
 232
/*
 * Kinds of node modifications recorded in the tree mod log.  The
 * REMOVE_WHILE_* variants mark removals that happen as part of moving
 * keys within a node (tree_mod_log_insert_move) or freeing a whole node
 * (tree_mod_log_free_eb / tree_mod_log_insert_root), as opposed to a
 * plain key removal.
 */
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};
 242
/* Snapshot of the old root recorded by MOD_LOG_ROOT_REPLACE entries. */
struct tree_mod_root {
	u64 logical;
	u8 level;
};
 247
/*
 * One entry in the tree mod log rb-tree, keyed by (logical, seq).
 * Which of the remaining fields are meaningful depends on @op, as noted
 * on the fields below.
 */
struct tree_mod_elem {
	struct rb_node node;
	/* block start; for MOD_LOG_ROOT_REPLACE this is the *new* root */
	u64 logical;
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
 273
/*
 * Pull a new tree mod seq number for our operation.  Atomically
 * increments fs_info->tree_mod_seq and returns the new value.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}
 281
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 *
 * NOTE(review): when elem->seq is already non-zero the existing sequence
 * number is returned unchanged, so the "fresh, unused" claim above only
 * holds for the first call per element — the comment may be stale.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		/* First use of @elem: register it as a blocker. */
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}
 302
/*
 * Drop the blocker @elem taken with btrfs_get_tree_mod_seq() and garbage
 * collect log entries that are no longer needed by any remaining blocker,
 * i.e. those with a sequence number below the new minimum.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	/* A zero seq means @elem was never registered as a blocker. */
	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		struct seq_list *first;

		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * Blocker with lower sequence number exists, we
			 * cannot remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		/* Save the successor before possibly erasing this node. */
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
 351
/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Assigns @tm a fresh sequence number and links it into the log rb-tree.
 * Caller must hold tree_mod_log_lock for writing.  Returns -EEXIST if an
 * entry with the same (logical, seq) key is already present.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	/* Pull the seq under the lock so entries are inserted in order. */
	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
 393
 394/*
 395 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 396 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 397 * this until all tree mod log insertions are recorded in the rb tree and then
 398 * write unlock fs_info::tree_mod_log_lock.
 399 */
 400static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
 401                                    struct extent_buffer *eb) {
 402        smp_mb();
 403        if (list_empty(&(fs_info)->tree_mod_seq_list))
 404                return 1;
 405        if (eb && btrfs_header_level(eb) == 0)
 406                return 1;
 407
 408        write_lock(&fs_info->tree_mod_log_lock);
 409        if (list_empty(&(fs_info)->tree_mod_seq_list)) {
 410                write_unlock(&fs_info->tree_mod_log_lock);
 411                return 1;
 412        }
 413
 414        return 0;
 415}
 416
 417/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
 418static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
 419                                    struct extent_buffer *eb)
 420{
 421        smp_mb();
 422        if (list_empty(&(fs_info)->tree_mod_seq_list))
 423                return 0;
 424        if (eb && btrfs_header_level(eb) == 0)
 425                return 0;
 426
 427        return 1;
 428}
 429
/*
 * Allocate and fill a tree_mod_elem describing operation @op on @slot of
 * node @eb.  The element is not yet linked into the log; the caller does
 * that via __tree_mod_log_insert().  Returns NULL on allocation failure.
 */
static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	/* Additions have no old key/blockptr to remember. */
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}
 452
/*
 * Record a single-key operation (@op at @slot of @eb) in the tree mod
 * log.  Returns 0 when logging is disabled or on success, -ENOMEM on
 * allocation failure, or the error from __tree_mod_log_insert().
 */
static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
		enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	/* Cheap lockless check first to avoid the allocation if possible. */
	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	/* Recheck under the lock; logging may have stopped meanwhile. */
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	/* tree_mod_dont_log() returned 0 with tree_mod_log_lock held. */
	ret = __tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}
 478
/*
 * Log a move of @nr_items keys from @src_slot to @dst_slot within node
 * @eb.  Keys overwritten by a move towards the front of the buffer are
 * additionally logged as individual removals.  Returns 0 on success or
 * when logging is not needed, negative errno on failure.
 */
static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
		int dst_slot, int src_slot, int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;	/* set once tree_mod_log_lock is write-held */

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	/* Pre-allocate removal records for the slots the move overwrites. */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On a zero return this leaves tree_mod_log_lock write-locked. */
	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;
free_tms:
	/* Unwind: unlink anything already inserted, then free everything. */
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
 551
/*
 * Insert @nritems pre-allocated removal records into the log, walking
 * the slots in reverse order.  On failure, records already inserted are
 * unlinked from the rb tree again; freeing the elements themselves is
 * the caller's job.  Caller must hold tree_mod_log_lock for writing.
 */
static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			/* Unlink everything inserted so far (slots > i). */
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
 572
/*
 * Log the replacement of root node @old_root by @new_root.  With
 * @log_removal set, all key pointers of a non-leaf @old_root are logged
 * as removals first.  Returns 0 on success or when logging is not
 * needed, negative errno on failure.
 */
static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
			 struct extent_buffer *new_root, int log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* Pre-allocate one removal record per pointer in the old root. */
	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	/* The log is keyed by the new root; remember where the old one was. */
	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	/* On a zero return this leaves tree_mod_log_lock write-locked. */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
 641
/*
 * Search the log for entries of block @start with seq >= @min_seq.
 * With @smallest set, the qualifying entry with the lowest sequence
 * number is returned, otherwise the one with the highest.  Returns
 * NULL when nothing matches.
 */
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* Exact hit on min_seq; no better match possible. */
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}
 683
/*
 * Return the log element for block @start with the smallest sequence
 * number in the log (the oldest entry).  Any element with a sequence
 * number lower than @min_seq is ignored.  Returns NULL when no entry
 * qualifies.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}
 695
/*
 * Return the log element for block @start with the largest sequence
 * number in the log (the most recent entry).  Any element with a
 * sequence number lower than @min_seq is ignored.  Returns NULL when
 * no entry qualifies.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
 706
/*
 * Log the copy of @nr_items key pointers from node @src (at @src_offset)
 * into node @dst (at @dst_offset).  Each copied pointer is recorded as a
 * REMOVE on the source and an ADD on the destination.  Returns 0 on
 * success or when logging is not needed, negative errno on failure.
 */
static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;	/* set once tree_mod_log_lock is write-held */

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* A copy between two leaves carries no key pointers to log. */
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	/* One array: first half ADD records, second half REMOVE records. */
	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On a zero return this leaves tree_mod_log_lock write-locked. */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/* Unwind: unlink anything already inserted, then free everything. */
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
 777
/*
 * Log the freeing of node @eb by recording a REMOVE_WHILE_FREEING entry
 * for each of its key pointers.  Leaves are not logged.  Returns 0 on
 * success or when logging is not needed, negative errno on failure.
 */
static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(eb->fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On a zero return this leaves tree_mod_log_lock write-locked. */
	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	/* __tree_mod_log_free_eb() already unlinked any inserted entries. */
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}
 823
 824/*
 825 * check if the tree block can be shared by multiple trees
 826 */
 827int btrfs_block_can_be_shared(struct btrfs_root *root,
 828                              struct extent_buffer *buf)
 829{
 830        /*
 831         * Tree blocks not in shareable trees and tree roots are never shared.
 832         * If a block was allocated after the last snapshot and the block was
 833         * not allocated by tree relocation, we know the block is not shared.
 834         */
 835        if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
 836            buf != root->node && buf != root->commit_root &&
 837            (btrfs_header_generation(buf) <=
 838             btrfs_root_last_snapshot(&root->root_item) ||
 839             btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
 840                return 1;
 841
 842        return 0;
 843}
 844
/*
 * Update reference counts and backref flags for the COW of tree block @buf
 * into @cow.
 *
 * @trans:    the running transaction handle
 * @root:     root that @buf belongs to
 * @buf:      the original tree block being COWed
 * @cow:      the freshly allocated copy of @buf
 * @last_ref: output; set to 1 when @buf dropped its last reference and the
 *            caller must free/clean the old block
 *
 * Returns 0 on success or a negative errno (lookup failure, or -EROFS when
 * a supposedly shared block has a refcount of 0, which indicates fs
 * corruption).
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is been relocating
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		/* Possibly shared: read the real refcount and flags. */
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			/* A shareable block with no refs is corruption. */
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		/* Not shareable: synthesize refcount/flags locally. */
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	/* Relocation-owned blocks must always carry full backrefs. */
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		/* The old block stays referenced by other trees. */
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			/* Convert the old block's backrefs to full backrefs. */
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			/* Persist the FULL_BACKREF flag on the extent item. */
			ret = btrfs_set_disk_extent_flags(trans, buf,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		/* Last reference: move refs from @buf to @cow, drop @buf. */
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clean_tree_block(buf);
		*last_ref = 1;
	}
	return 0;
}
 953
 954static struct extent_buffer *alloc_tree_block_no_bg_flush(
 955                                          struct btrfs_trans_handle *trans,
 956                                          struct btrfs_root *root,
 957                                          u64 parent_start,
 958                                          const struct btrfs_disk_key *disk_key,
 959                                          int level,
 960                                          u64 hint,
 961                                          u64 empty_size,
 962                                          enum btrfs_lock_nesting nest)
 963{
 964        struct btrfs_fs_info *fs_info = root->fs_info;
 965        struct extent_buffer *ret;
 966
 967        /*
 968         * If we are COWing a node/leaf from the extent, chunk, device or free
 969         * space trees, make sure that we do not finish block group creation of
 970         * pending block groups. We do this to avoid a deadlock.
 971         * COWing can result in allocation of a new chunk, and flushing pending
 972         * block groups (btrfs_create_pending_block_groups()) can be triggered
 973         * when finishing allocation of a new chunk. Creation of a pending block
 974         * group modifies the extent, chunk, device and free space trees,
 975         * therefore we could deadlock with ourselves since we are holding a
 976         * lock on an extent buffer that btrfs_create_pending_block_groups() may
 977         * try to COW later.
 978         * For similar reasons, we also need to delay flushing pending block
 979         * groups when splitting a leaf or node, from one of those trees, since
 980         * we are holding a write lock on it and its parent or when inserting a
 981         * new root node for one of those trees.
 982         */
 983        if (root == fs_info->extent_root ||
 984            root == fs_info->chunk_root ||
 985            root == fs_info->dev_root ||
 986            root == fs_info->free_space_root)
 987                trans->can_flush_pending_bgs = false;
 988
 989        ret = btrfs_alloc_tree_block(trans, root, parent_start,
 990                                     root->root_key.objectid, disk_key, level,
 991                                     hint, empty_size, nest);
 992        trans->can_flush_pending_bgs = true;
 993
 994        return ret;
 995}
 996
 997/*
 998 * does the dirty work in cow of a single block.  The parent block (if
 999 * supplied) is updated to point to the new cow copy.  The new buffer is marked
1000 * dirty and returned locked.  If you modify the block it needs to be marked
1001 * dirty again.
1002 *
1003 * search_start -- an allocation hint for the new block
1004 *
1005 * empty_size -- a hint that you plan on doing more cow.  This is the size in
1006 * bytes the allocator should try to find free next to the block it returns.
1007 * This is just a hint and may be ignored by the allocator.
1008 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size,
			     enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	/*
	 * If the caller passed @buf itself through *cow_ret, it expects us
	 * to unlock the original buffer before returning the new copy.
	 */
	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	/* Shareable roots may only be COWed inside the running transaction. */
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	/* First key of the block, used as a hint/key for the new allocation. */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
					   level, search_start, empty_size, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	/* Initialize the copy's header from the original, then fix it up. */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	/* Move/convert backrefs from @buf to @cow; may mark @buf last_ref. */
	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_tree_unlock(cow);
			free_extent_buffer(cow);
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		/* COWing the root: swap root->node over to the copy. */
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		/* Extra ref for root->node's pointer to @cow. */
		atomic_inc(&cow->refs);
		ret = tree_mod_log_insert_root(root->node, cow, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		/* Non-root: repoint the parent slot at the copy. */
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_tree_unlock(cow);
				free_extent_buffer(cow);
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	/* Return the new buffer, locked and dirty. */
	*cow_ret = cow;
	return 0;
}
1126
1127/*
1128 * returns the logical address of the oldest predecessor of the given root.
1129 * entries older than time_seq are ignored.
1130 */
static struct tree_mod_elem *__tree_mod_log_oldest_root(
		struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	/* Set once we have followed at least one root replacement back. */
	int looped = 0;

	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the
	 * replacement operation (if it is replaced at all). this has
	 * the logical address of the *new* root, making it the very
	 * first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		/* No log entry for the current root at all: nothing to do. */
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operation for the oldest root, we simply
		 * return it. this should only happen if that (old) root is at
		 * level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root. normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		/* Follow the replacement chain one root further back. */
		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}
1180
1181/*
1182 * tm is a pointer to the first operation to rewind within eb. then, all
1183 * previous operations will be rewound (until we reach something older than
1184 * time_seq).
1185 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	/* n tracks the item count while undoing; written back at the end. */
	n = btrfs_header_nritems(eb);
	read_lock(&fs_info->tree_mod_log_lock);
	/* Walk log entries newest-first until we pass time_seq. */
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			fallthrough;
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			/* Undo a removal: restore key/ptr/gen and grow count. */
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			/* Undo a replace: write back the recorded old value. */
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			/* Undo a move by moving the key range back. */
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is no root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		/* Stop once the log entries refer to a different block. */
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}
1256
1257/*
1258 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
1259 * is returned. If rewind operations happen, a fresh buffer is returned. The
1260 * returned buffer is always read-locked. If the returned buffer is not the
1261 * input buffer, the lock on the input buffer is released and the input buffer
1262 * is freed (its refcount is decremented).
1263 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	/* time_seq == 0 means "current state": nothing to rewind. */
	if (!time_seq)
		return eb;

	/* Leaves are not tracked in the mod log; return as-is. */
	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_read(eb);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/* The block was freed; rebuild it on a dummy buffer. */
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		/* Block still exists; rewind on a private clone. */
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	/* Hand over from the input buffer to the rewound copy. */
	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}
1318
1319/*
1320 * get_old_root() rewinds the state of @root's root node to the given @time_seq
1321 * value. If there are no changes, the current root->root_node is returned. If
1322 * anything changed in between, there's a fresh buffer allocated on which the
1323 * rewind operations are done. In any case, the returned buffer is read locked.
1324 * Returns NULL on error (with no locks held).
1325 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	/* No logged changes: the current root node is already correct. */
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		/* The root was replaced; rewind from the recorded old root. */
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/* Old root block may still be on disk: read and clone it. */
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(fs_info, logical, 0, level, NULL);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		/* Old root was freed; rebuild it on a dummy buffer. */
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		/* Same root block; rewind on a private clone of it. */
		btrfs_set_lock_blocking_read(eb_root);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		/* Re-create the old root's header on the fresh buffer. */
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}
1402
1403int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1404{
1405        struct tree_mod_elem *tm;
1406        int level;
1407        struct extent_buffer *eb_root = btrfs_root_node(root);
1408
1409        tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1410        if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1411                level = tm->old_root.level;
1412        } else {
1413                level = btrfs_header_level(eb_root);
1414        }
1415        free_extent_buffer(eb_root);
1416
1417        return level;
1418}
1419
1420static inline int should_cow_block(struct btrfs_trans_handle *trans,
1421                                   struct btrfs_root *root,
1422                                   struct extent_buffer *buf)
1423{
1424        if (btrfs_is_testing(root->fs_info))
1425                return 0;
1426
1427        /* Ensure we can see the FORCE_COW bit */
1428        smp_mb__before_atomic();
1429
1430        /*
1431         * We do not need to cow a block if
1432         * 1) this block is not created or changed in this transaction;
1433         * 2) this block does not belong to TREE_RELOC tree;
1434         * 3) the root is not forced COW.
1435         *
1436         * What is forced COW:
1437         *    when we create snapshot during committing the transaction,
1438         *    after we've finished copying src root, we must COW the shared
1439         *    block to ensure the metadata consistency.
1440         */
1441        if (btrfs_header_generation(buf) == trans->transid &&
1442            !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1443            !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1444              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1445            !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1446                return 0;
1447        return 1;
1448}
1449
1450/*
1451 * cows a single block, see __btrfs_cow_block for the real work.
1452 * This version of it has extra checks so that a block isn't COWed more than
1453 * once per transaction, as long as it hasn't been written yet
1454 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret,
		    enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
		btrfs_err(fs_info,
			"COW'ing blocks on a fs root that's being dropped");

	/* Sanity: the handle must belong to the running transaction. */
	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, fs_info->generation);

	/* Already COWed in this transaction: hand back @buf unchanged. */
	if (!should_cow_block(trans, root, buf)) {
		trans->dirty = true;
		*cow_ret = buf;
		return 0;
	}

	/* Allocation hint: round down to the containing 1GiB boundary. */
	search_start = buf->start & ~((u64)SZ_1G - 1);

	if (parent)
		btrfs_set_lock_blocking_write(parent);
	btrfs_set_lock_blocking_write(buf);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also We don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0, nest);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
1504
1505/*
1506 * helper function for defrag to decide if two blocks pointed to by a
1507 * node are actually close by
1508 */
1509static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1510{
1511        if (blocknr < other && other - (blocknr + blocksize) < 32768)
1512                return 1;
1513        if (blocknr > other && blocknr - (other + blocksize) < 32768)
1514                return 1;
1515        return 0;
1516}
1517
#ifdef __LITTLE_ENDIAN

/*
 * Compare two keys, on little-endian the disk order is same as CPU order and
 * we can avoid the conversion.
 *
 * NOTE(review): the direct cast relies on struct btrfs_disk_key and struct
 * btrfs_key having identical field layout on little-endian hosts — confirm
 * against the struct definitions in ctree.h if either ever changes.
 */
static int comp_keys(const struct btrfs_disk_key *disk_key,
		     const struct btrfs_key *k2)
{
	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;

	return btrfs_comp_cpu_keys(k1, k2);
}

#else

/*
 * Compare a disk-order key with a CPU-order key in a memcmp fashion.
 * On big-endian hosts the disk key must be byte-swapped first.
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}
#endif
1547
1548/*
1549 * same as comp_keys only with two btrfs_key's
1550 */
1551int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1552{
1553        if (k1->objectid > k2->objectid)
1554                return 1;
1555        if (k1->objectid < k2->objectid)
1556                return -1;
1557        if (k1->type > k2->type)
1558                return 1;
1559        if (k1->type < k2->type)
1560                return -1;
1561        if (k1->offset > k2->offset)
1562                return 1;
1563        if (k1->offset < k2->offset)
1564                return -1;
1565        return 0;
1566}
1567
1568/*
1569 * this is used by the defrag code to go through all the
1570 * leaves pointed to by a node and reallocate them so that
1571 * disk order is close to key order
1572 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;	/* disk hint for COW allocations */
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	/* defrag must run inside the currently running transaction */
	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	/* a single child can't be out of order relative to siblings */
	if (parent_nritems <= 1)
		return 0;

	btrfs_set_lock_blocking_write(parent);

	for (i = start_slot; i <= end_slot; i++) {
		struct btrfs_key first_key;
		int close = 1;

		/* skip keys below 'progress' so defrag can be resumed */
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		btrfs_node_key_to_cpu(parent, &first_key, i);
		if (last_block == 0)
			last_block = blocknr;

		/*
		 * a child that is already close on disk to either neighbor
		 * does not need to be reallocated
		 */
		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		/* make sure the child is in memory and up to date */
		cur = find_extent_buffer(fs_info, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(fs_info, blocknr, gen,
						      parent_level - 1,
						      &first_key);
				if (IS_ERR(cur)) {
					return PTR_ERR(cur);
				} else if (!extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen,
						parent_level - 1,&first_key);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		/*
		 * COW forces a reallocation; search_start steers the new
		 * location so siblings end up near each other on disk
		 */
		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking_write(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize),
					BTRFS_NESTING_COW);
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		/* remember where we got to so the caller can resume */
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}
1685
1686/*
1687 * search for key in the extent_buffer.  The items start at offset p,
1688 * and they are item_size apart.  There are 'max' items in p.
1689 *
1690 * the slot in the array is returned via slot, and it points to
1691 * the place where you would insert key if it is not found in
1692 * the array.
1693 *
1694 * slot may point to max if the key is bigger than all of the keys
1695 */
1696static noinline int generic_bin_search(struct extent_buffer *eb,
1697                                       unsigned long p, int item_size,
1698                                       const struct btrfs_key *key,
1699                                       int max, int *slot)
1700{
1701        int low = 0;
1702        int high = max;
1703        int ret;
1704        const int key_size = sizeof(struct btrfs_disk_key);
1705
1706        if (low > high) {
1707                btrfs_err(eb->fs_info,
1708                 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1709                          __func__, low, high, eb->start,
1710                          btrfs_header_owner(eb), btrfs_header_level(eb));
1711                return -EINVAL;
1712        }
1713
1714        while (low < high) {
1715                unsigned long oip;
1716                unsigned long offset;
1717                struct btrfs_disk_key *tmp;
1718                struct btrfs_disk_key unaligned;
1719                int mid;
1720
1721                mid = (low + high) / 2;
1722                offset = p + mid * item_size;
1723                oip = offset_in_page(offset);
1724
1725                if (oip + key_size <= PAGE_SIZE) {
1726                        const unsigned long idx = offset >> PAGE_SHIFT;
1727                        char *kaddr = page_address(eb->pages[idx]);
1728
1729                        tmp = (struct btrfs_disk_key *)(kaddr + oip);
1730                } else {
1731                        read_extent_buffer(eb, &unaligned, offset, key_size);
1732                        tmp = &unaligned;
1733                }
1734
1735                ret = comp_keys(tmp, key);
1736
1737                if (ret < 0)
1738                        low = mid + 1;
1739                else if (ret > 0)
1740                        high = mid;
1741                else {
1742                        *slot = mid;
1743                        return 0;
1744                }
1745        }
1746        *slot = low;
1747        return 1;
1748}
1749
1750/*
1751 * simple bin_search frontend that does the right thing for
1752 * leaves vs nodes
1753 */
1754int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1755                     int *slot)
1756{
1757        if (btrfs_header_level(eb) == 0)
1758                return generic_bin_search(eb,
1759                                          offsetof(struct btrfs_leaf, items),
1760                                          sizeof(struct btrfs_item),
1761                                          key, btrfs_header_nritems(eb),
1762                                          slot);
1763        else
1764                return generic_bin_search(eb,
1765                                          offsetof(struct btrfs_node, ptrs),
1766                                          sizeof(struct btrfs_key_ptr),
1767                                          key, btrfs_header_nritems(eb),
1768                                          slot);
1769}
1770
1771static void root_add_used(struct btrfs_root *root, u32 size)
1772{
1773        spin_lock(&root->accounting_lock);
1774        btrfs_set_root_used(&root->root_item,
1775                            btrfs_root_used(&root->root_item) + size);
1776        spin_unlock(&root->accounting_lock);
1777}
1778
1779static void root_sub_used(struct btrfs_root *root, u32 size)
1780{
1781        spin_lock(&root->accounting_lock);
1782        btrfs_set_root_used(&root->root_item,
1783                            btrfs_root_used(&root->root_item) - size);
1784        spin_unlock(&root->accounting_lock);
1785}
1786
1787/* given a node and slot number, this reads the blocks it points to.  The
1788 * extent buffer is returned with a reference taken (but unlocked).
1789 */
1790struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
1791                                           int slot)
1792{
1793        int level = btrfs_header_level(parent);
1794        struct extent_buffer *eb;
1795        struct btrfs_key first_key;
1796
1797        if (slot < 0 || slot >= btrfs_header_nritems(parent))
1798                return ERR_PTR(-ENOENT);
1799
1800        BUG_ON(level == 0);
1801
1802        btrfs_node_key_to_cpu(parent, &first_key, slot);
1803        eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
1804                             btrfs_node_ptr_generation(parent, slot),
1805                             level - 1, &first_key);
1806        if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1807                free_extent_buffer(eb);
1808                eb = ERR_PTR(-EIO);
1809        }
1810
1811        return eb;
1812}
1813
1814/*
1815 * node level balancing, used to make sure nodes are in proper order for
1816 * item deletion.  We balance from the top down, so we have to make sure
1817 * that a deletion won't leave an node completely empty later on.
1818 */
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	/* caller must hold a write lock on mid and have COWed it already */
	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	/* remembered so we can verify the path still points at it at the end */
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking_write(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		/* record the root swap for tree mod log replay, then publish */
		ret = tree_mod_log_insert_root(root->node, child, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clean_tree_block(mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		/* mid is still pinned here by the old root pointer's ref */
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	/* more than 1/4 full: balanced enough, nothing to do */
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	/* a read failure just means there is no left sibling to borrow from */
	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	if (left) {
		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
		btrfs_set_lock_blocking_write(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	if (right) {
		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
		btrfs_set_lock_blocking_write(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		/* orig_slot is now relative to left + mid combined */
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			/* right is now empty: drop it from the tree */
			btrfs_clean_tree_block(right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			/* right's first key changed; fix it up in the parent */
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			/* corrupt tree; -EROFS forces the fs read-only */
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		/* mid was emptied into its siblings: remove it entirely */
		btrfs_clean_tree_block(mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		ret = tree_mod_log_insert_key(parent, pslot,
				MOD_LOG_KEY_REPLACE, GFP_NOFS);
		BUG_ON(ret < 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			/* the slot we came through migrated into left */
			atomic_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
/* NOTE(review): label name is historical -- this is the common error exit */
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}
2045
2046/* Node balancing for insertion.  Here we only split or push nodes around
2047 * when they are completely full.  This is also done top down, so we
2048 * have to be pessimistic.
2049 */
/* returns 0 when the path was updated to make room, 1 when nothing could be
 * pushed, and < 0 on error
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	/* mid must already be COWed in this transaction */
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/* no parent means mid is the root; nothing to push into */
	if (!parent)
		return 1;

	/* a read failure just means there is no left sibling to push into */
	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
		btrfs_set_lock_blocking_write(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* left is itself full; can't take anything */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* items moved: mid's first key changed, update parent */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				/* the target slot migrated into left */
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
		btrfs_set_lock_blocking_write(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* right is itself full; can't take anything */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* items moved: right's first key changed, update parent */
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				/* the target slot migrated into right */
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}
2187
2188/*
2189 * readahead one full node of leaves, finding things that are close
2190 * to the block in 'slot', and triggering ra on them.
2191 */
2192static void reada_for_search(struct btrfs_fs_info *fs_info,
2193                             struct btrfs_path *path,
2194                             int level, int slot, u64 objectid)
2195{
2196        struct extent_buffer *node;
2197        struct btrfs_disk_key disk_key;
2198        u32 nritems;
2199        u64 search;
2200        u64 target;
2201        u64 nread = 0;
2202        struct extent_buffer *eb;
2203        u32 nr;
2204        u32 blocksize;
2205        u32 nscan = 0;
2206
2207        if (level != 1)
2208                return;
2209
2210        if (!path->nodes[level])
2211                return;
2212
2213        node = path->nodes[level];
2214
2215        search = btrfs_node_blockptr(node, slot);
2216        blocksize = fs_info->nodesize;
2217        eb = find_extent_buffer(fs_info, search);
2218        if (eb) {
2219                free_extent_buffer(eb);
2220                return;
2221        }
2222
2223        target = search;
2224
2225        nritems = btrfs_header_nritems(node);
2226        nr = slot;
2227
2228        while (1) {
2229                if (path->reada == READA_BACK) {
2230                        if (nr == 0)
2231                                break;
2232                        nr--;
2233                } else if (path->reada == READA_FORWARD) {
2234                        nr++;
2235                        if (nr >= nritems)
2236                                break;
2237                }
2238                if (path->reada == READA_BACK && objectid) {
2239                        btrfs_node_key(node, &disk_key, nr);
2240                        if (btrfs_disk_key_objectid(&disk_key) != objectid)
2241                                break;
2242                }
2243                search = btrfs_node_blockptr(node, nr);
2244                if ((search <= target && target - search <= 65536) ||
2245                    (search > target && search - target <= 65536)) {
2246                        readahead_tree_block(fs_info, search);
2247                        nread += blocksize;
2248                }
2249                nscan++;
2250                if ((nread > 65536 || nscan > 32))
2251                        break;
2252        }
2253}
2254
2255static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2256                                       struct btrfs_path *path, int level)
2257{
2258        int slot;
2259        int nritems;
2260        struct extent_buffer *parent;
2261        struct extent_buffer *eb;
2262        u64 gen;
2263        u64 block1 = 0;
2264        u64 block2 = 0;
2265
2266        parent = path->nodes[level + 1];
2267        if (!parent)
2268                return;
2269
2270        nritems = btrfs_header_nritems(parent);
2271        slot = path->slots[level + 1];
2272
2273        if (slot > 0) {
2274                block1 = btrfs_node_blockptr(parent, slot - 1);
2275                gen = btrfs_node_ptr_generation(parent, slot - 1);
2276                eb = find_extent_buffer(fs_info, block1);
2277                /*
2278                 * if we get -eagain from btrfs_buffer_uptodate, we
2279                 * don't want to return eagain here.  That will loop
2280                 * forever
2281                 */
2282                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2283                        block1 = 0;
2284                free_extent_buffer(eb);
2285        }
2286        if (slot + 1 < nritems) {
2287                block2 = btrfs_node_blockptr(parent, slot + 1);
2288                gen = btrfs_node_ptr_generation(parent, slot + 1);
2289                eb = find_extent_buffer(fs_info, block2);
2290                if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2291                        block2 = 0;
2292                free_extent_buffer(eb);
2293        }
2294
2295        if (block1)
2296                readahead_tree_block(fs_info, block1);
2297        if (block2)
2298                readahead_tree_block(fs_info, block2);
2299}
2300
2301
2302/*
2303 * when we walk down the tree, it is usually safe to unlock the higher layers
2304 * in the tree.  The exceptions are when our path goes through slot 0, because
2305 * operations on the tree might require changing key pointers higher up in the
2306 * tree.
2307 *
2308 * callers might also have set path->keep_locks, which tells this code to keep
2309 * the lock if the path points to the last slot in the block.  This is part of
2310 * walking through the tree, and selecting the next slot in the higher block.
2311 *
2312 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2313 * if lowest_unlock is 1, level 0 won't be unlocked
2314 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;		/* levels <= this keep their locks */
	int no_skips = 0;		/* once set, stop raising skip_level */
	struct extent_buffer *t;

	/* walk from 'level' up toward the root, unlocking where it is safe */
	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		/* slot 0: a key change below may propagate up, keep the lock */
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		/* keep_locks: also hold locks on last-slot blocks (see header
		 * comment above) */
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			/* lower the recorded write-lock watermark as we go */
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}
2357
2358/*
2359 * helper function for btrfs_search_slot.  The goal is to find a block
2360 * in cache without setting the path to blocking.  If we find the block
2361 * we return zero and the path is unchanged.
2362 *
2363 * If we can't find the block, we set the path blocking and do some
2364 * reada.  -EAGAIN is returned and the search must be repeated.
2365 */
2366static int
2367read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2368                      struct extent_buffer **eb_ret, int level, int slot,
2369                      const struct btrfs_key *key)
2370{
2371        struct btrfs_fs_info *fs_info = root->fs_info;
2372        u64 blocknr;
2373        u64 gen;
2374        struct extent_buffer *tmp;
2375        struct btrfs_key first_key;
2376        int ret;
2377        int parent_level;
2378
2379        blocknr = btrfs_node_blockptr(*eb_ret, slot);
2380        gen = btrfs_node_ptr_generation(*eb_ret, slot);
2381        parent_level = btrfs_header_level(*eb_ret);
2382        btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
2383
2384        tmp = find_extent_buffer(fs_info, blocknr);
2385        if (tmp) {
2386                /* first we do an atomic uptodate check */
2387                if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2388                        /*
2389                         * Do extra check for first_key, eb can be stale due to
2390                         * being cached, read from scrub, or have multiple
2391                         * parents (shared tree blocks).
2392                         */
2393                        if (btrfs_verify_level_key(tmp,
2394                                        parent_level - 1, &first_key, gen)) {
2395                                free_extent_buffer(tmp);
2396                                return -EUCLEAN;
2397                        }
2398                        *eb_ret = tmp;
2399                        return 0;
2400                }
2401
2402                /* the pages were up to date, but we failed
2403                 * the generation number check.  Do a full
2404                 * read for the generation number that is correct.
2405                 * We must do this without dropping locks so
2406                 * we can trust our generation number
2407                 */
2408                btrfs_set_path_blocking(p);
2409
2410                /* now we're allowed to do a blocking uptodate check */
2411                ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
2412                if (!ret) {
2413                        *eb_ret = tmp;
2414                        return 0;
2415                }
2416                free_extent_buffer(tmp);
2417                btrfs_release_path(p);
2418                return -EIO;
2419        }
2420
2421        /*
2422         * reduce lock contention at high levels
2423         * of the btree by dropping locks before
2424         * we read.  Don't release the lock on the current
2425         * level because we need to walk this node to figure
2426         * out which blocks to read.
2427         */
2428        btrfs_unlock_up_safe(p, level + 1);
2429        btrfs_set_path_blocking(p);
2430
2431        if (p->reada != READA_NONE)
2432                reada_for_search(fs_info, p, level, slot, key->objectid);
2433
2434        ret = -EAGAIN;
2435        tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
2436                              &first_key);
2437        if (!IS_ERR(tmp)) {
2438                /*
2439                 * If the read above didn't mark this buffer up to date,
2440                 * it will never end up being up to date.  Set ret to EIO now
2441                 * and give up so that our caller doesn't loop forever
2442                 * on our EAGAINs.
2443                 */
2444                if (!extent_buffer_uptodate(tmp))
2445                        ret = -EIO;
2446                free_extent_buffer(tmp);
2447        } else {
2448                ret = PTR_ERR(tmp);
2449        }
2450
2451        btrfs_release_path(p);
2452        return ret;
2453}
2454
2455/*
2456 * helper function for btrfs_search_slot.  This does all of the checks
2457 * for node-level blocks and does any balancing required based on
2458 * the ins_len.
2459 *
2460 * If no extra work was required, zero is returned.  If we had to
2461 * drop the path, -EAGAIN is returned and btrfs_search_slot must
2462 * start over
2463 */
2464static int
2465setup_nodes_for_search(struct btrfs_trans_handle *trans,
2466                       struct btrfs_root *root, struct btrfs_path *p,
2467                       struct extent_buffer *b, int level, int ins_len,
2468                       int *write_lock_level)
2469{
2470        struct btrfs_fs_info *fs_info = root->fs_info;
2471        int ret;
2472
2473        if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2474            BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2475                int sret;
2476
2477                if (*write_lock_level < level + 1) {
2478                        *write_lock_level = level + 1;
2479                        btrfs_release_path(p);
2480                        goto again;
2481                }
2482
2483                btrfs_set_path_blocking(p);
2484                reada_for_balance(fs_info, p, level);
2485                sret = split_node(trans, root, p, level);
2486
2487                BUG_ON(sret > 0);
2488                if (sret) {
2489                        ret = sret;
2490                        goto done;
2491                }
2492                b = p->nodes[level];
2493        } else if (ins_len < 0 && btrfs_header_nritems(b) <
2494                   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2495                int sret;
2496
2497                if (*write_lock_level < level + 1) {
2498                        *write_lock_level = level + 1;
2499                        btrfs_release_path(p);
2500                        goto again;
2501                }
2502
2503                btrfs_set_path_blocking(p);
2504                reada_for_balance(fs_info, p, level);
2505                sret = balance_level(trans, root, p, level);
2506
2507                if (sret) {
2508                        ret = sret;
2509                        goto done;
2510                }
2511                b = p->nodes[level];
2512                if (!b) {
2513                        btrfs_release_path(p);
2514                        goto again;
2515                }
2516                BUG_ON(btrfs_header_nritems(b) == 1);
2517        }
2518        return 0;
2519
2520again:
2521        ret = -EAGAIN;
2522done:
2523        return ret;
2524}
2525
2526int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2527                u64 iobjectid, u64 ioff, u8 key_type,
2528                struct btrfs_key *found_key)
2529{
2530        int ret;
2531        struct btrfs_key key;
2532        struct extent_buffer *eb;
2533
2534        ASSERT(path);
2535        ASSERT(found_key);
2536
2537        key.type = key_type;
2538        key.objectid = iobjectid;
2539        key.offset = ioff;
2540
2541        ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2542        if (ret < 0)
2543                return ret;
2544
2545        eb = path->nodes[0];
2546        if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2547                ret = btrfs_next_leaf(fs_root, path);
2548                if (ret)
2549                        return ret;
2550                eb = path->nodes[0];
2551        }
2552
2553        btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2554        if (found_key->type != key.type ||
2555                        found_key->objectid != key.objectid)
2556                return 1;
2557
2558        return 0;
2559}
2560
/*
 * Look up and (unless locking is skipped) lock the root node of @root for
 * a search described by @p.  The node is stored in the path at its level
 * together with the lock type taken.  A read lock is preferred; a write
 * lock is taken when @write_lock_level says the root's level needs one.
 *
 * Returns the root extent buffer with a reference held, or an ERR_PTR.
 */
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int root_lock;
	int level = 0;

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	if (p->search_commit_root) {
		/*
		 * The commit roots are read only so we always do read locks,
		 * and we always must hold the commit_root_sem when doing
		 * searches on them, the only exception is send where we don't
		 * want to block transaction commits for a long time, so
		 * we need to clone the commit root in order to avoid races
		 * with transaction commits that create a snapshot of one of
		 * the roots used by a send operation.
		 */
		if (p->need_commit_sem) {
			down_read(&fs_info->commit_root_sem);
			b = btrfs_clone_extent_buffer(root->commit_root);
			up_read(&fs_info->commit_root_sem);
			if (!b)
				return ERR_PTR(-ENOMEM);

		} else {
			b = root->commit_root;
			atomic_inc(&b->refs);
		}
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		b = __btrfs_read_lock_root_node(root, p->recurse);
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}
2644
2645
2646/*
2647 * btrfs_search_slot - look for a key in a tree and perform necessary
2648 * modifications to preserve tree invariants.
2649 *
2650 * @trans:      Handle of transaction, used when modifying the tree
2651 * @p:          Holds all btree nodes along the search path
2652 * @root:       The root node of the tree
2653 * @key:        The key we are looking for
2654 * @ins_len:    Indicates purpose of search, for inserts it is 1, for
2655 *              deletions it's -1. 0 for plain searches
2656 * @cow:        boolean should CoW operations be performed. Must always be 1
2657 *              when modifying the tree.
2658 *
2659 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2660 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2661 *
2662 * If @key is found, 0 is returned and you can find the item in the leaf level
2663 * of the path (level 0)
2664 *
2665 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2666 * points to the slot where it should be inserted
2667 *
2668 * If an error is encountered while searching the tree a negative error number
2669 * is returned
2670 */
2671int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2672                      const struct btrfs_key *key, struct btrfs_path *p,
2673                      int ins_len, int cow)
2674{
2675        struct extent_buffer *b;
2676        int slot;
2677        int ret;
2678        int err;
2679        int level;
2680        int lowest_unlock = 1;
2681        /* everything at write_lock_level or lower must be write locked */
2682        int write_lock_level = 0;
2683        u8 lowest_level = 0;
2684        int min_write_lock_level;
2685        int prev_cmp;
2686
2687        lowest_level = p->lowest_level;
2688        WARN_ON(lowest_level && ins_len > 0);
2689        WARN_ON(p->nodes[0] != NULL);
2690        BUG_ON(!cow && ins_len);
2691
2692        if (ins_len < 0) {
2693                lowest_unlock = 2;
2694
2695                /* when we are removing items, we might have to go up to level
2696                 * two as we update tree pointers  Make sure we keep write
2697                 * for those levels as well
2698                 */
2699                write_lock_level = 2;
2700        } else if (ins_len > 0) {
2701                /*
2702                 * for inserting items, make sure we have a write lock on
2703                 * level 1 so we can update keys
2704                 */
2705                write_lock_level = 1;
2706        }
2707
2708        if (!cow)
2709                write_lock_level = -1;
2710
2711        if (cow && (p->keep_locks || p->lowest_level))
2712                write_lock_level = BTRFS_MAX_LEVEL;
2713
2714        min_write_lock_level = write_lock_level;
2715
2716again:
2717        prev_cmp = -1;
2718        b = btrfs_search_slot_get_root(root, p, write_lock_level);
2719        if (IS_ERR(b)) {
2720                ret = PTR_ERR(b);
2721                goto done;
2722        }
2723
2724        while (b) {
2725                int dec = 0;
2726
2727                level = btrfs_header_level(b);
2728
2729                if (cow) {
2730                        bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2731
2732                        /*
2733                         * if we don't really need to cow this block
2734                         * then we don't want to set the path blocking,
2735                         * so we test it here
2736                         */
2737                        if (!should_cow_block(trans, root, b)) {
2738                                trans->dirty = true;
2739                                goto cow_done;
2740                        }
2741
2742                        /*
2743                         * must have write locks on this node and the
2744                         * parent
2745                         */
2746                        if (level > write_lock_level ||
2747                            (level + 1 > write_lock_level &&
2748                            level + 1 < BTRFS_MAX_LEVEL &&
2749                            p->nodes[level + 1])) {
2750                                write_lock_level = level + 1;
2751                                btrfs_release_path(p);
2752                                goto again;
2753                        }
2754
2755                        btrfs_set_path_blocking(p);
2756                        if (last_level)
2757                                err = btrfs_cow_block(trans, root, b, NULL, 0,
2758                                                      &b,
2759                                                      BTRFS_NESTING_COW);
2760                        else
2761                                err = btrfs_cow_block(trans, root, b,
2762                                                      p->nodes[level + 1],
2763                                                      p->slots[level + 1], &b,
2764                                                      BTRFS_NESTING_COW);
2765                        if (err) {
2766                                ret = err;
2767                                goto done;
2768                        }
2769                }
2770cow_done:
2771                p->nodes[level] = b;
2772                /*
2773                 * Leave path with blocking locks to avoid massive
2774                 * lock context switch, this is made on purpose.
2775                 */
2776
2777                /*
2778                 * we have a lock on b and as long as we aren't changing
2779                 * the tree, there is no way to for the items in b to change.
2780                 * It is safe to drop the lock on our parent before we
2781                 * go through the expensive btree search on b.
2782                 *
2783                 * If we're inserting or deleting (ins_len != 0), then we might
2784                 * be changing slot zero, which may require changing the parent.
2785                 * So, we can't drop the lock until after we know which slot
2786                 * we're operating on.
2787                 */
2788                if (!ins_len && !p->keep_locks) {
2789                        int u = level + 1;
2790
2791                        if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2792                                btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2793                                p->locks[u] = 0;
2794                        }
2795                }
2796
2797                /*
2798                 * If btrfs_bin_search returns an exact match (prev_cmp == 0)
2799                 * we can safely assume the target key will always be in slot 0
2800                 * on lower levels due to the invariants BTRFS' btree provides,
2801                 * namely that a btrfs_key_ptr entry always points to the
2802                 * lowest key in the child node, thus we can skip searching
2803                 * lower levels
2804                 */
2805                if (prev_cmp == 0) {
2806                        slot = 0;
2807                        ret = 0;
2808                } else {
2809                        ret = btrfs_bin_search(b, key, &slot);
2810                        prev_cmp = ret;
2811                        if (ret < 0)
2812                                goto done;
2813                }
2814
2815                if (level == 0) {
2816                        p->slots[level] = slot;
2817                        if (ins_len > 0 &&
2818                            btrfs_leaf_free_space(b) < ins_len) {
2819                                if (write_lock_level < 1) {
2820                                        write_lock_level = 1;
2821                                        btrfs_release_path(p);
2822                                        goto again;
2823                                }
2824
2825                                btrfs_set_path_blocking(p);
2826                                err = split_leaf(trans, root, key,
2827                                                 p, ins_len, ret == 0);
2828
2829                                BUG_ON(err > 0);
2830                                if (err) {
2831                                        ret = err;
2832                                        goto done;
2833                                }
2834                        }
2835                        if (!p->search_for_split)
2836                                unlock_up(p, level, lowest_unlock,
2837                                          min_write_lock_level, NULL);
2838                        goto done;
2839                }
2840                if (ret && slot > 0) {
2841                        dec = 1;
2842                        slot--;
2843                }
2844                p->slots[level] = slot;
2845                err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2846                                             &write_lock_level);
2847                if (err == -EAGAIN)
2848                        goto again;
2849                if (err) {
2850                        ret = err;
2851                        goto done;
2852                }
2853                b = p->nodes[level];
2854                slot = p->slots[level];
2855
2856                /*
2857                 * Slot 0 is special, if we change the key we have to update
2858                 * the parent pointer which means we must have a write lock on
2859                 * the parent
2860                 */
2861                if (slot == 0 && ins_len && write_lock_level < level + 1) {
2862                        write_lock_level = level + 1;
2863                        btrfs_release_path(p);
2864                        goto again;
2865                }
2866
2867                unlock_up(p, level, lowest_unlock, min_write_lock_level,
2868                          &write_lock_level);
2869
2870                if (level == lowest_level) {
2871                        if (dec)
2872                                p->slots[level]++;
2873                        goto done;
2874                }
2875
2876                err = read_block_for_search(root, p, &b, level, slot, key);
2877                if (err == -EAGAIN)
2878                        goto again;
2879                if (err) {
2880                        ret = err;
2881                        goto done;
2882                }
2883
2884                if (!p->skip_locking) {
2885                        level = btrfs_header_level(b);
2886                        if (level <= write_lock_level) {
2887                                if (!btrfs_try_tree_write_lock(b)) {
2888                                        btrfs_set_path_blocking(p);
2889                                        btrfs_tree_lock(b);
2890                                }
2891                                p->locks[level] = BTRFS_WRITE_LOCK;
2892                        } else {
2893                                if (!btrfs_tree_read_lock_atomic(b)) {
2894                                        btrfs_set_path_blocking(p);
2895                                        __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL,
2896                                                               p->recurse);
2897                                }
2898                                p->locks[level] = BTRFS_READ_LOCK;
2899                        }
2900                        p->nodes[level] = b;
2901                }
2902        }
2903        ret = 1;
2904done:
2905        /*
2906         * we don't really know what they plan on doing with the path
2907         * from here on, so for now just mark it as blocking
2908         */
2909        if (!p->leave_spinning)
2910                btrfs_set_path_blocking(p);
2911        if (ret < 0 && !p->skip_release_on_error)
2912                btrfs_release_path(p);
2913        return ret;
2914}
2915
2916/*
2917 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2918 * current state of the tree together with the operations recorded in the tree
2919 * modification log to search for the key in a previous version of this tree, as
2920 * denoted by the time_seq parameter.
2921 *
2922 * Naturally, there is no support for insert, delete or cow operations.
2923 *
2924 * The resulting path and return value will be set up as if we called
2925 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2926 */
2927int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2928                          struct btrfs_path *p, u64 time_seq)
2929{
2930        struct btrfs_fs_info *fs_info = root->fs_info;
2931        struct extent_buffer *b;
2932        int slot;
2933        int ret;
2934        int err;
2935        int level;
2936        int lowest_unlock = 1;
2937        u8 lowest_level = 0;
2938
2939        lowest_level = p->lowest_level;
2940        WARN_ON(p->nodes[0] != NULL);
2941
2942        if (p->search_commit_root) {
2943                BUG_ON(time_seq);
2944                return btrfs_search_slot(NULL, root, key, p, 0, 0);
2945        }
2946
2947again:
2948        b = get_old_root(root, time_seq);
2949        if (!b) {
2950                ret = -EIO;
2951                goto done;
2952        }
2953        level = btrfs_header_level(b);
2954        p->locks[level] = BTRFS_READ_LOCK;
2955
2956        while (b) {
2957                int dec = 0;
2958
2959                level = btrfs_header_level(b);
2960                p->nodes[level] = b;
2961
2962                /*
2963                 * we have a lock on b and as long as we aren't changing
2964                 * the tree, there is no way to for the items in b to change.
2965                 * It is safe to drop the lock on our parent before we
2966                 * go through the expensive btree search on b.
2967                 */
2968                btrfs_unlock_up_safe(p, level + 1);
2969
2970                ret = btrfs_bin_search(b, key, &slot);
2971                if (ret < 0)
2972                        goto done;
2973
2974                if (level == 0) {
2975                        p->slots[level] = slot;
2976                        unlock_up(p, level, lowest_unlock, 0, NULL);
2977                        goto done;
2978                }
2979
2980                if (ret && slot > 0) {
2981                        dec = 1;
2982                        slot--;
2983                }
2984                p->slots[level] = slot;
2985                unlock_up(p, level, lowest_unlock, 0, NULL);
2986
2987                if (level == lowest_level) {
2988                        if (dec)
2989                                p->slots[level]++;
2990                        goto done;
2991                }
2992
2993                err = read_block_for_search(root, p, &b, level, slot, key);
2994                if (err == -EAGAIN)
2995                        goto again;
2996                if (err) {
2997                        ret = err;
2998                        goto done;
2999                }
3000
3001                level = btrfs_header_level(b);
3002                if (!btrfs_tree_read_lock_atomic(b)) {
3003                        btrfs_set_path_blocking(p);
3004                        btrfs_tree_read_lock(b);
3005                }
3006                b = tree_mod_log_rewind(fs_info, p, b, time_seq);
3007                if (!b) {
3008                        ret = -ENOMEM;
3009                        goto done;
3010                }
3011                p->locks[level] = BTRFS_READ_LOCK;
3012                p->nodes[level] = b;
3013        }
3014        ret = 1;
3015done:
3016        if (!p->leave_spinning)
3017                btrfs_set_path_blocking(p);
3018        if (ret < 0)
3019                btrfs_release_path(p);
3020
3021        return ret;
3022}
3023
3024/*
3025 * helper to use instead of search slot if no exact match is needed but
3026 * instead the next or previous item should be returned.
3027 * When find_higher is true, the next higher item is returned, the next lower
3028 * otherwise.
3029 * When return_any and find_higher are both true, and no higher item is found,
3030 * return the next lower instead.
3031 * When return_any is true and find_higher is false, and no lower item is found,
3032 * return the next higher instead.
3033 * It returns 0 if any item is found, 1 if none is found (tree empty), and
3034 * < 0 on error
3035 */
3036int btrfs_search_slot_for_read(struct btrfs_root *root,
3037                               const struct btrfs_key *key,
3038                               struct btrfs_path *p, int find_higher,
3039                               int return_any)
3040{
3041        int ret;
3042        struct extent_buffer *leaf;
3043
3044again:
3045        ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3046        if (ret <= 0)
3047                return ret;
3048        /*
3049         * a return value of 1 means the path is at the position where the
3050         * item should be inserted. Normally this is the next bigger item,
3051         * but in case the previous item is the last in a leaf, path points
3052         * to the first free slot in the previous leaf, i.e. at an invalid
3053         * item.
3054         */
3055        leaf = p->nodes[0];
3056
3057        if (find_higher) {
3058                if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3059                        ret = btrfs_next_leaf(root, p);
3060                        if (ret <= 0)
3061                                return ret;
3062                        if (!return_any)
3063                                return 1;
3064                        /*
3065                         * no higher item found, return the next
3066                         * lower instead
3067                         */
3068                        return_any = 0;
3069                        find_higher = 0;
3070                        btrfs_release_path(p);
3071                        goto again;
3072                }
3073        } else {
3074                if (p->slots[0] == 0) {
3075                        ret = btrfs_prev_leaf(root, p);
3076                        if (ret < 0)
3077                                return ret;
3078                        if (!ret) {
3079                                leaf = p->nodes[0];
3080                                if (p->slots[0] == btrfs_header_nritems(leaf))
3081                                        p->slots[0]--;
3082                                return 0;
3083                        }
3084                        if (!return_any)
3085                                return 1;
3086                        /*
3087                         * no lower item found, return the next
3088                         * higher instead
3089                         */
3090                        return_any = 0;
3091                        find_higher = 1;
3092                        btrfs_release_path(p);
3093                        goto again;
3094                } else {
3095                        --p->slots[0];
3096                }
3097        }
3098        return 0;
3099}
3100
3101/*
3102 * adjust the pointers going up the tree, starting at level
3103 * making sure the right key of each node is points to 'key'.
3104 * This is used after shifting pointers to the left, so it stops
3105 * fixing up pointers when a given leaf/node is not in slot 0 of the
3106 * higher levels
3107 *
3108 */
3109static void fixup_low_keys(struct btrfs_path *path,
3110                           struct btrfs_disk_key *key, int level)
3111{
3112        int i;
3113        struct extent_buffer *t;
3114        int ret;
3115
3116        for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3117                int tslot = path->slots[i];
3118
3119                if (!path->nodes[i])
3120                        break;
3121                t = path->nodes[i];
3122                ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3123                                GFP_ATOMIC);
3124                BUG_ON(ret < 0);
3125                btrfs_set_node_key(t, key, tslot);
3126                btrfs_mark_buffer_dirty(path->nodes[i]);
3127                if (tslot != 0)
3128                        break;
3129        }
3130}
3131
3132/*
3133 * update item key.
3134 *
3135 * This function isn't completely safe. It's the caller's responsibility
3136 * that the new key won't break the order
3137 */
3138void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3139                             struct btrfs_path *path,
3140                             const struct btrfs_key *new_key)
3141{
3142        struct btrfs_disk_key disk_key;
3143        struct extent_buffer *eb;
3144        int slot;
3145
3146        eb = path->nodes[0];
3147        slot = path->slots[0];
3148        if (slot > 0) {
3149                btrfs_item_key(eb, &disk_key, slot - 1);
3150                if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
3151                        btrfs_crit(fs_info,
3152                "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
3153                                   slot, btrfs_disk_key_objectid(&disk_key),
3154                                   btrfs_disk_key_type(&disk_key),
3155                                   btrfs_disk_key_offset(&disk_key),
3156                                   new_key->objectid, new_key->type,
3157                                   new_key->offset);
3158                        btrfs_print_leaf(eb);
3159                        BUG();
3160                }
3161        }
3162        if (slot < btrfs_header_nritems(eb) - 1) {
3163                btrfs_item_key(eb, &disk_key, slot + 1);
3164                if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
3165                        btrfs_crit(fs_info,
3166                "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
3167                                   slot, btrfs_disk_key_objectid(&disk_key),
3168                                   btrfs_disk_key_type(&disk_key),
3169                                   btrfs_disk_key_offset(&disk_key),
3170                                   new_key->objectid, new_key->type,
3171                                   new_key->offset);
3172                        btrfs_print_leaf(eb);
3173                        BUG();
3174                }
3175        }
3176
3177        btrfs_cpu_key_to_disk(&disk_key, new_key);
3178        btrfs_set_item_key(eb, &disk_key, slot);
3179        btrfs_mark_buffer_dirty(eb);
3180        if (slot == 0)
3181                fixup_low_keys(path, &disk_key, 1);
3182}
3183
3184/*
3185 * Check key order of two sibling extent buffers.
3186 *
3187 * Return true if something is wrong.
3188 * Return false if everything is fine.
3189 *
3190 * Tree-checker only works inside one tree block, thus the following
3191 * corruption can not be detected by tree-checker:
3192 *
3193 * Leaf @left                   | Leaf @right
3194 * --------------------------------------------------------------
3195 * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
3196 *
3197 * Key f6 in leaf @left itself is valid, but not valid when the next
3198 * key in leaf @right is 7.
3199 * This can only be checked at tree block merge time.
3200 * And since tree checker has ensured all key order in each tree block
3201 * is correct, we only need to bother the last key of @left and the first
3202 * key of @right.
3203 */
3204static bool check_sibling_keys(struct extent_buffer *left,
3205                               struct extent_buffer *right)
3206{
3207        struct btrfs_key left_last;
3208        struct btrfs_key right_first;
3209        int level = btrfs_header_level(left);
3210        int nr_left = btrfs_header_nritems(left);
3211        int nr_right = btrfs_header_nritems(right);
3212
3213        /* No key to check in one of the tree blocks */
3214        if (!nr_left || !nr_right)
3215                return false;
3216
3217        if (level) {
3218                btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
3219                btrfs_node_key_to_cpu(right, &right_first, 0);
3220        } else {
3221                btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
3222                btrfs_item_key_to_cpu(right, &right_first, 0);
3223        }
3224
3225        if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
3226                btrfs_crit(left->fs_info,
3227"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
3228                           left_last.objectid, left_last.type,
3229                           left_last.offset, right_first.objectid,
3230                           right_first.type, right_first.offset);
3231                return true;
3232        }
3233        return false;
3234}
3235
3236/*
3237 * try to push data from one node into the next node left in the
3238 * tree.
3239 *
3240 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3241 * error, and > 0 if there was no room in the left hand block.
3242 */
3243static int push_node_left(struct btrfs_trans_handle *trans,
3244                          struct extent_buffer *dst,
3245                          struct extent_buffer *src, int empty)
3246{
3247        struct btrfs_fs_info *fs_info = trans->fs_info;
3248        int push_items = 0;
3249        int src_nritems;
3250        int dst_nritems;
3251        int ret = 0;
3252
3253        src_nritems = btrfs_header_nritems(src);
3254        dst_nritems = btrfs_header_nritems(dst);
3255        push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3256        WARN_ON(btrfs_header_generation(src) != trans->transid);
3257        WARN_ON(btrfs_header_generation(dst) != trans->transid);
3258
3259        if (!empty && src_nritems <= 8)
3260                return 1;
3261
3262        if (push_items <= 0)
3263                return 1;
3264
3265        if (empty) {
3266                push_items = min(src_nritems, push_items);
3267                if (push_items < src_nritems) {
3268                        /* leave at least 8 pointers in the node if
3269                         * we aren't going to empty it
3270                         */
3271                        if (src_nritems - push_items < 8) {
3272                                if (push_items <= 8)
3273                                        return 1;
3274                                push_items -= 8;
3275                        }
3276                }
3277        } else
3278                push_items = min(src_nritems - 8, push_items);
3279
3280        /* dst is the left eb, src is the middle eb */
3281        if (check_sibling_keys(dst, src)) {
3282                ret = -EUCLEAN;
3283                btrfs_abort_transaction(trans, ret);
3284                return ret;
3285        }
3286        ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
3287        if (ret) {
3288                btrfs_abort_transaction(trans, ret);
3289                return ret;
3290        }
3291        copy_extent_buffer(dst, src,
3292                           btrfs_node_key_ptr_offset(dst_nritems),
3293                           btrfs_node_key_ptr_offset(0),
3294                           push_items * sizeof(struct btrfs_key_ptr));
3295
3296        if (push_items < src_nritems) {
3297                /*
3298                 * Don't call tree_mod_log_insert_move here, key removal was
3299                 * already fully logged by tree_mod_log_eb_copy above.
3300                 */
3301                memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3302                                      btrfs_node_key_ptr_offset(push_items),
3303                                      (src_nritems - push_items) *
3304                                      sizeof(struct btrfs_key_ptr));
3305        }
3306        btrfs_set_header_nritems(src, src_nritems - push_items);
3307        btrfs_set_header_nritems(dst, dst_nritems + push_items);
3308        btrfs_mark_buffer_dirty(src);
3309        btrfs_mark_buffer_dirty(dst);
3310
3311        return ret;
3312}
3313
3314/*
3315 * try to push data from one node into the next node right in the
3316 * tree.
3317 *
3318 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3319 * error, and > 0 if there was no room in the right hand block.
3320 *
3321 * this will  only push up to 1/2 the contents of the left node over
3322 */
3323static int balance_node_right(struct btrfs_trans_handle *trans,
3324                              struct extent_buffer *dst,
3325                              struct extent_buffer *src)
3326{
3327        struct btrfs_fs_info *fs_info = trans->fs_info;
3328        int push_items = 0;
3329        int max_push;
3330        int src_nritems;
3331        int dst_nritems;
3332        int ret = 0;
3333
3334        WARN_ON(btrfs_header_generation(src) != trans->transid);
3335        WARN_ON(btrfs_header_generation(dst) != trans->transid);
3336
3337        src_nritems = btrfs_header_nritems(src);
3338        dst_nritems = btrfs_header_nritems(dst);
3339        push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3340        if (push_items <= 0)
3341                return 1;
3342
3343        if (src_nritems < 4)
3344                return 1;
3345
3346        max_push = src_nritems / 2 + 1;
3347        /* don't try to empty the node */
3348        if (max_push >= src_nritems)
3349                return 1;
3350
3351        if (max_push < push_items)
3352                push_items = max_push;
3353
3354        /* dst is the right eb, src is the middle eb */
3355        if (check_sibling_keys(src, dst)) {
3356                ret = -EUCLEAN;
3357                btrfs_abort_transaction(trans, ret);
3358                return ret;
3359        }
3360        ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
3361        BUG_ON(ret < 0);
3362        memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3363                                      btrfs_node_key_ptr_offset(0),
3364                                      (dst_nritems) *
3365                                      sizeof(struct btrfs_key_ptr));
3366
3367        ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
3368                                   push_items);
3369        if (ret) {
3370                btrfs_abort_transaction(trans, ret);
3371                return ret;
3372        }
3373        copy_extent_buffer(dst, src,
3374                           btrfs_node_key_ptr_offset(0),
3375                           btrfs_node_key_ptr_offset(src_nritems - push_items),
3376                           push_items * sizeof(struct btrfs_key_ptr));
3377
3378        btrfs_set_header_nritems(src, src_nritems - push_items);
3379        btrfs_set_header_nritems(dst, dst_nritems + push_items);
3380
3381        btrfs_mark_buffer_dirty(src);
3382        btrfs_mark_buffer_dirty(dst);
3383
3384        return ret;
3385}
3386
3387/*
3388 * helper function to insert a new root level in the tree.
3389 * A new node is allocated, and a single item is inserted to
3390 * point to the existing root
3391 *
3392 * returns zero on success or < 0 on failure.
3393 */
3394static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3395                           struct btrfs_root *root,
3396                           struct btrfs_path *path, int level)
3397{
3398        struct btrfs_fs_info *fs_info = root->fs_info;
3399        u64 lower_gen;
3400        struct extent_buffer *lower;
3401        struct extent_buffer *c;
3402        struct extent_buffer *old;
3403        struct btrfs_disk_key lower_key;
3404        int ret;
3405
3406        BUG_ON(path->nodes[level]);
3407        BUG_ON(path->nodes[level-1] != root->node);
3408
3409        lower = path->nodes[level-1];
3410        if (level == 1)
3411                btrfs_item_key(lower, &lower_key, 0);
3412        else
3413                btrfs_node_key(lower, &lower_key, 0);
3414
3415        c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
3416                                         root->node->start, 0,
3417                                         BTRFS_NESTING_NEW_ROOT);
3418        if (IS_ERR(c))
3419                return PTR_ERR(c);
3420
3421        root_add_used(root, fs_info->nodesize);
3422
3423        btrfs_set_header_nritems(c, 1);
3424        btrfs_set_node_key(c, &lower_key, 0);
3425        btrfs_set_node_blockptr(c, 0, lower->start);
3426        lower_gen = btrfs_header_generation(lower);
3427        WARN_ON(lower_gen != trans->transid);
3428
3429        btrfs_set_node_ptr_generation(c, 0, lower_gen);
3430
3431        btrfs_mark_buffer_dirty(c);
3432
3433        old = root->node;
3434        ret = tree_mod_log_insert_root(root->node, c, 0);
3435        BUG_ON(ret < 0);
3436        rcu_assign_pointer(root->node, c);
3437
3438        /* the super has an extra ref to root->node */
3439        free_extent_buffer(old);
3440
3441        add_root_to_dirty_list(root);
3442        atomic_inc(&c->refs);
3443        path->nodes[level] = c;
3444        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3445        path->slots[level] = 0;
3446        return 0;
3447}
3448
3449/*
3450 * worker function to insert a single pointer in a node.
3451 * the node should have enough room for the pointer already
3452 *
3453 * slot and level indicate where you want the key to go, and
3454 * blocknr is the block the key points to.
3455 */
3456static void insert_ptr(struct btrfs_trans_handle *trans,
3457                       struct btrfs_path *path,
3458                       struct btrfs_disk_key *key, u64 bytenr,
3459                       int slot, int level)
3460{
3461        struct extent_buffer *lower;
3462        int nritems;
3463        int ret;
3464
3465        BUG_ON(!path->nodes[level]);
3466        btrfs_assert_tree_locked(path->nodes[level]);
3467        lower = path->nodes[level];
3468        nritems = btrfs_header_nritems(lower);
3469        BUG_ON(slot > nritems);
3470        BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
3471        if (slot != nritems) {
3472                if (level) {
3473                        ret = tree_mod_log_insert_move(lower, slot + 1, slot,
3474                                        nritems - slot);
3475                        BUG_ON(ret < 0);
3476                }
3477                memmove_extent_buffer(lower,
3478                              btrfs_node_key_ptr_offset(slot + 1),
3479                              btrfs_node_key_ptr_offset(slot),
3480                              (nritems - slot) * sizeof(struct btrfs_key_ptr));
3481        }
3482        if (level) {
3483                ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
3484                                GFP_NOFS);
3485                BUG_ON(ret < 0);
3486        }
3487        btrfs_set_node_key(lower, key, slot);
3488        btrfs_set_node_blockptr(lower, slot, bytenr);
3489        WARN_ON(trans->transid == 0);
3490        btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3491        btrfs_set_header_nritems(lower, nritems + 1);
3492        btrfs_mark_buffer_dirty(lower);
3493}
3494
3495/*
3496 * split the node at the specified level in path in two.
3497 * The path is corrected to point to the appropriate node after the split
3498 *
3499 * Before splitting this tries to make some room in the node by pushing
3500 * left and right, if either one works, it returns right away.
3501 *
3502 * returns 0 on success and < 0 on failure
3503 */
3504static noinline int split_node(struct btrfs_trans_handle *trans,
3505                               struct btrfs_root *root,
3506                               struct btrfs_path *path, int level)
3507{
3508        struct btrfs_fs_info *fs_info = root->fs_info;
3509        struct extent_buffer *c;
3510        struct extent_buffer *split;
3511        struct btrfs_disk_key disk_key;
3512        int mid;
3513        int ret;
3514        u32 c_nritems;
3515
3516        c = path->nodes[level];
3517        WARN_ON(btrfs_header_generation(c) != trans->transid);
3518        if (c == root->node) {
3519                /*
3520                 * trying to split the root, lets make a new one
3521                 *
3522                 * tree mod log: We don't log_removal old root in
3523                 * insert_new_root, because that root buffer will be kept as a
3524                 * normal node. We are going to log removal of half of the
3525                 * elements below with tree_mod_log_eb_copy. We're holding a
3526                 * tree lock on the buffer, which is why we cannot race with
3527                 * other tree_mod_log users.
3528                 */
3529                ret = insert_new_root(trans, root, path, level + 1);
3530                if (ret)
3531                        return ret;
3532        } else {
3533                ret = push_nodes_for_insert(trans, root, path, level);
3534                c = path->nodes[level];
3535                if (!ret && btrfs_header_nritems(c) <
3536                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3537                        return 0;
3538                if (ret < 0)
3539                        return ret;
3540        }
3541
3542        c_nritems = btrfs_header_nritems(c);
3543        mid = (c_nritems + 1) / 2;
3544        btrfs_node_key(c, &disk_key, mid);
3545
3546        split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
3547                                             c->start, 0, BTRFS_NESTING_SPLIT);
3548        if (IS_ERR(split))
3549                return PTR_ERR(split);
3550
3551        root_add_used(root, fs_info->nodesize);
3552        ASSERT(btrfs_header_level(c) == level);
3553
3554        ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
3555        if (ret) {
3556                btrfs_abort_transaction(trans, ret);
3557                return ret;
3558        }
3559        copy_extent_buffer(split, c,
3560                           btrfs_node_key_ptr_offset(0),
3561                           btrfs_node_key_ptr_offset(mid),
3562                           (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3563        btrfs_set_header_nritems(split, c_nritems - mid);
3564        btrfs_set_header_nritems(c, mid);
3565        ret = 0;
3566
3567        btrfs_mark_buffer_dirty(c);
3568        btrfs_mark_buffer_dirty(split);
3569
3570        insert_ptr(trans, path, &disk_key, split->start,
3571                   path->slots[level + 1] + 1, level + 1);
3572
3573        if (path->slots[level] >= mid) {
3574                path->slots[level] -= mid;
3575                btrfs_tree_unlock(c);
3576                free_extent_buffer(c);
3577                path->nodes[level] = split;
3578                path->slots[level + 1] += 1;
3579        } else {
3580                btrfs_tree_unlock(split);
3581                free_extent_buffer(split);
3582        }
3583        return ret;
3584}
3585
3586/*
3587 * how many bytes are required to store the items in a leaf.  start
3588 * and nr indicate which items in the leaf to check.  This totals up the
3589 * space used both by the item structs and the item data
3590 */
3591static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3592{
3593        struct btrfs_item *start_item;
3594        struct btrfs_item *end_item;
3595        int data_len;
3596        int nritems = btrfs_header_nritems(l);
3597        int end = min(nritems, start + nr) - 1;
3598
3599        if (!nr)
3600                return 0;
3601        start_item = btrfs_item_nr(start);
3602        end_item = btrfs_item_nr(end);
3603        data_len = btrfs_item_offset(l, start_item) +
3604                   btrfs_item_size(l, start_item);
3605        data_len = data_len - btrfs_item_offset(l, end_item);
3606        data_len += sizeof(struct btrfs_item) * nr;
3607        WARN_ON(data_len < 0);
3608        return data_len;
3609}
3610
3611/*
3612 * The space between the end of the leaf items and
3613 * the start of the leaf data.  IOW, how much room
3614 * the leaf has left for both items and data
3615 */
3616noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
3617{
3618        struct btrfs_fs_info *fs_info = leaf->fs_info;
3619        int nritems = btrfs_header_nritems(leaf);
3620        int ret;
3621
3622        ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3623        if (ret < 0) {
3624                btrfs_crit(fs_info,
3625                           "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3626                           ret,
3627                           (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3628                           leaf_space_used(leaf, 0, nritems), nritems);
3629        }
3630        return ret;
3631}
3632
3633/*
3634 * min slot controls the lowest index we're willing to push to the
3635 * right.  We'll push up to and including min_slot, but no lower
3636 */
3637static noinline int __push_leaf_right(struct btrfs_path *path,
3638                                      int data_size, int empty,
3639                                      struct extent_buffer *right,
3640                                      int free_space, u32 left_nritems,
3641                                      u32 min_slot)
3642{
3643        struct btrfs_fs_info *fs_info = right->fs_info;
3644        struct extent_buffer *left = path->nodes[0];
3645        struct extent_buffer *upper = path->nodes[1];
3646        struct btrfs_map_token token;
3647        struct btrfs_disk_key disk_key;
3648        int slot;
3649        u32 i;
3650        int push_space = 0;
3651        int push_items = 0;
3652        struct btrfs_item *item;
3653        u32 nr;
3654        u32 right_nritems;
3655        u32 data_end;
3656        u32 this_item_size;
3657
3658        if (empty)
3659                nr = 0;
3660        else
3661                nr = max_t(u32, 1, min_slot);
3662
3663        if (path->slots[0] >= left_nritems)
3664                push_space += data_size;
3665
3666        slot = path->slots[1];
3667        i = left_nritems - 1;
3668        while (i >= nr) {
3669                item = btrfs_item_nr(i);
3670
3671                if (!empty && push_items > 0) {
3672                        if (path->slots[0] > i)
3673                                break;
3674                        if (path->slots[0] == i) {
3675                                int space = btrfs_leaf_free_space(left);
3676
3677                                if (space + push_space * 2 > free_space)
3678                                        break;
3679                        }
3680                }
3681
3682                if (path->slots[0] == i)
3683                        push_space += data_size;
3684
3685                this_item_size = btrfs_item_size(left, item);
3686                if (this_item_size + sizeof(*item) + push_space > free_space)
3687                        break;
3688
3689                push_items++;
3690                push_space += this_item_size + sizeof(*item);
3691                if (i == 0)
3692                        break;
3693                i--;
3694        }
3695
3696        if (push_items == 0)
3697                goto out_unlock;
3698
3699        WARN_ON(!empty && push_items == left_nritems);
3700
3701        /* push left to right */
3702        right_nritems = btrfs_header_nritems(right);
3703
3704        push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3705        push_space -= leaf_data_end(left);
3706
3707        /* make room in the right data area */
3708        data_end = leaf_data_end(right);
3709        memmove_extent_buffer(right,
3710                              BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
3711                              BTRFS_LEAF_DATA_OFFSET + data_end,
3712                              BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3713
3714        /* copy from the left data area */
3715        copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
3716                     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3717                     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
3718                     push_space);
3719
3720        memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3721                              btrfs_item_nr_offset(0),
3722                              right_nritems * sizeof(struct btrfs_item));
3723
3724        /* copy the items from left to right */
3725        copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3726                   btrfs_item_nr_offset(left_nritems - push_items),
3727                   push_items * sizeof(struct btrfs_item));
3728
3729        /* update the item pointers */
3730        btrfs_init_map_token(&token, right);
3731        right_nritems += push_items;
3732        btrfs_set_header_nritems(right, right_nritems);
3733        push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3734        for (i = 0; i < right_nritems; i++) {
3735                item = btrfs_item_nr(i);
3736                push_space -= btrfs_token_item_size(&token, item);
3737                btrfs_set_token_item_offset(&token, item, push_space);
3738        }
3739
3740        left_nritems -= push_items;
3741        btrfs_set_header_nritems(left, left_nritems);
3742
3743        if (left_nritems)
3744                btrfs_mark_buffer_dirty(left);
3745        else
3746                btrfs_clean_tree_block(left);
3747
3748        btrfs_mark_buffer_dirty(right);
3749
3750        btrfs_item_key(right, &disk_key, 0);
3751        btrfs_set_node_key(upper, &disk_key, slot + 1);
3752        btrfs_mark_buffer_dirty(upper);
3753
3754        /* then fixup the leaf pointer in the path */
3755        if (path->slots[0] >= left_nritems) {
3756                path->slots[0] -= left_nritems;
3757                if (btrfs_header_nritems(path->nodes[0]) == 0)
3758                        btrfs_clean_tree_block(path->nodes[0]);
3759                btrfs_tree_unlock(path->nodes[0]);
3760                free_extent_buffer(path->nodes[0]);
3761                path->nodes[0] = right;
3762                path->slots[1] += 1;
3763        } else {
3764                btrfs_tree_unlock(right);
3765                free_extent_buffer(right);
3766        }
3767        return 0;
3768
3769out_unlock:
3770        btrfs_tree_unlock(right);
3771        free_extent_buffer(right);
3772        return 1;
3773}
3774
3775/*
3776 * push some data in the path leaf to the right, trying to free up at
3777 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3778 *
3779 * returns 1 if the push failed because the other node didn't have enough
3780 * room, 0 if everything worked out and < 0 if there were major errors.
3781 *
3782 * this will push starting from min_slot to the end of the leaf.  It won't
3783 * push any slot lower than min_slot
3784 */
3785static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3786                           *root, struct btrfs_path *path,
3787                           int min_data_size, int data_size,
3788                           int empty, u32 min_slot)
3789{
3790        struct extent_buffer *left = path->nodes[0];
3791        struct extent_buffer *right;
3792        struct extent_buffer *upper;
3793        int slot;
3794        int free_space;
3795        u32 left_nritems;
3796        int ret;
3797
3798        if (!path->nodes[1])
3799                return 1;
3800
3801        slot = path->slots[1];
3802        upper = path->nodes[1];
3803        if (slot >= btrfs_header_nritems(upper) - 1)
3804                return 1;
3805
3806        btrfs_assert_tree_locked(path->nodes[1]);
3807
3808        right = btrfs_read_node_slot(upper, slot + 1);
3809        /*
3810         * slot + 1 is not valid or we fail to read the right node,
3811         * no big deal, just return.
3812         */
3813        if (IS_ERR(right))
3814                return 1;
3815
3816        __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
3817        btrfs_set_lock_blocking_write(right);
3818
3819        free_space = btrfs_leaf_free_space(right);
3820        if (free_space < data_size)
3821                goto out_unlock;
3822
3823        /* cow and double check */
3824        ret = btrfs_cow_block(trans, root, right, upper,
3825                              slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
3826        if (ret)
3827                goto out_unlock;
3828
3829        free_space = btrfs_leaf_free_space(right);
3830        if (free_space < data_size)
3831                goto out_unlock;
3832
3833        left_nritems = btrfs_header_nritems(left);
3834        if (left_nritems == 0)
3835                goto out_unlock;
3836
3837        if (check_sibling_keys(left, right)) {
3838                ret = -EUCLEAN;
3839                btrfs_tree_unlock(right);
3840                free_extent_buffer(right);
3841                return ret;
3842        }
3843        if (path->slots[0] == left_nritems && !empty) {
3844                /* Key greater than all keys in the leaf, right neighbor has
3845                 * enough room for it and we're not emptying our leaf to delete
3846                 * it, therefore use right neighbor to insert the new item and
3847                 * no need to touch/dirty our left leaf. */
3848                btrfs_tree_unlock(left);
3849                free_extent_buffer(left);
3850                path->nodes[0] = right;
3851                path->slots[0] = 0;
3852                path->slots[1]++;
3853                return 0;
3854        }
3855
3856        return __push_leaf_right(path, min_data_size, empty,
3857                                right, free_space, left_nritems, min_slot);
3858out_unlock:
3859        btrfs_tree_unlock(right);
3860        free_extent_buffer(right);
3861        return 1;
3862}
3863
3864/*
3865 * push some data in the path leaf to the left, trying to free up at
3866 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3867 *
3868 * max_slot can put a limit on how far into the leaf we'll push items.  The
3869 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3870 * items
3871 */
3872static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
3873                                     int empty, struct extent_buffer *left,
3874                                     int free_space, u32 right_nritems,
3875                                     u32 max_slot)
3876{
3877        struct btrfs_fs_info *fs_info = left->fs_info;
3878        struct btrfs_disk_key disk_key;
3879        struct extent_buffer *right = path->nodes[0];
3880        int i;
3881        int push_space = 0;
3882        int push_items = 0;
3883        struct btrfs_item *item;
3884        u32 old_left_nritems;
3885        u32 nr;
3886        int ret = 0;
3887        u32 this_item_size;
3888        u32 old_left_item_size;
3889        struct btrfs_map_token token;
3890
3891        if (empty)
3892                nr = min(right_nritems, max_slot);
3893        else
3894                nr = min(right_nritems - 1, max_slot);
3895
3896        for (i = 0; i < nr; i++) {
3897                item = btrfs_item_nr(i);
3898
3899                if (!empty && push_items > 0) {
3900                        if (path->slots[0] < i)
3901                                break;
3902                        if (path->slots[0] == i) {
3903                                int space = btrfs_leaf_free_space(right);
3904
3905                                if (space + push_space * 2 > free_space)
3906                                        break;
3907                        }
3908                }
3909
3910                if (path->slots[0] == i)
3911                        push_space += data_size;
3912
3913                this_item_size = btrfs_item_size(right, item);
3914                if (this_item_size + sizeof(*item) + push_space > free_space)
3915                        break;
3916
3917                push_items++;
3918                push_space += this_item_size + sizeof(*item);
3919        }
3920
3921        if (push_items == 0) {
3922                ret = 1;
3923                goto out;
3924        }
3925        WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3926
3927        /* push data from right to left */
3928        copy_extent_buffer(left, right,
3929                           btrfs_item_nr_offset(btrfs_header_nritems(left)),
3930                           btrfs_item_nr_offset(0),
3931                           push_items * sizeof(struct btrfs_item));
3932
3933        push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3934                     btrfs_item_offset_nr(right, push_items - 1);
3935
3936        copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
3937                     leaf_data_end(left) - push_space,
3938                     BTRFS_LEAF_DATA_OFFSET +
3939                     btrfs_item_offset_nr(right, push_items - 1),
3940                     push_space);
3941        old_left_nritems = btrfs_header_nritems(left);
3942        BUG_ON(old_left_nritems <= 0);
3943
3944        btrfs_init_map_token(&token, left);
3945        old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3946        for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3947                u32 ioff;
3948
3949                item = btrfs_item_nr(i);
3950
3951                ioff = btrfs_token_item_offset(&token, item);
3952                btrfs_set_token_item_offset(&token, item,
3953                      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
3954        }
3955        btrfs_set_header_nritems(left, old_left_nritems + push_items);
3956
3957        /* fixup right node */
3958        if (push_items > right_nritems)
3959                WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3960                       right_nritems);
3961
3962        if (push_items < right_nritems) {
3963                push_space = btrfs_item_offset_nr(right, push_items - 1) -
3964                                                  leaf_data_end(right);
3965                memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
3966                                      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3967                                      BTRFS_LEAF_DATA_OFFSET +
3968                                      leaf_data_end(right), push_space);
3969
3970                memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3971                              btrfs_item_nr_offset(push_items),
3972                             (btrfs_header_nritems(right) - push_items) *
3973                             sizeof(struct btrfs_item));
3974        }
3975
3976        btrfs_init_map_token(&token, right);
3977        right_nritems -= push_items;
3978        btrfs_set_header_nritems(right, right_nritems);
3979        push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3980        for (i = 0; i < right_nritems; i++) {
3981                item = btrfs_item_nr(i);
3982
3983                push_space = push_space - btrfs_token_item_size(&token, item);
3984                btrfs_set_token_item_offset(&token, item, push_space);
3985        }
3986
3987        btrfs_mark_buffer_dirty(left);
3988        if (right_nritems)
3989                btrfs_mark_buffer_dirty(right);
3990        else
3991                btrfs_clean_tree_block(right);
3992
3993        btrfs_item_key(right, &disk_key, 0);
3994        fixup_low_keys(path, &disk_key, 1);
3995
3996        /* then fixup the leaf pointer in the path */
3997        if (path->slots[0] < push_items) {
3998                path->slots[0] += old_left_nritems;
3999                btrfs_tree_unlock(path->nodes[0]);
4000                free_extent_buffer(path->nodes[0]);
4001                path->nodes[0] = left;
4002                path->slots[1] -= 1;
4003        } else {
4004                btrfs_tree_unlock(left);
4005                free_extent_buffer(left);
4006                path->slots[0] -= push_items;
4007        }
4008        BUG_ON(path->slots[0] < 0);
4009        return ret;
4010out:
4011        btrfs_tree_unlock(left);
4012        free_extent_buffer(left);
4013        return ret;
4014}
4015
4016/*
4017 * push some data in the path leaf to the left, trying to free up at
4018 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
4019 *
4020 * max_slot can put a limit on how far into the leaf we'll push items.  The
4021 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
4022 * items
4023 */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
			  *root, struct btrfs_path *path, int min_data_size,
			  int data_size, int empty, u32 max_slot)
{
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
	int slot;
	int free_space;
	u32 right_nritems;
	int ret = 0;

	slot = path->slots[1];
	/* No left sibling exists if we are in slot 0 of the parent. */
	if (slot == 0)
		return 1;
	/* The leaf is the tree root, there is nothing to push into. */
	if (!path->nodes[1])
		return 1;

	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
	/*
	 * slot - 1 is not valid or we fail to read the left node,
	 * no big deal, just return.
	 */
	if (IS_ERR(left))
		return 1;

	__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
	btrfs_set_lock_blocking_write(left);

	/* Cheap check before COWing: bail out if the sibling is full. */
	free_space = btrfs_leaf_free_space(left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, left,
			      path->nodes[1], slot - 1, &left,
			      BTRFS_NESTING_LEFT_COW);
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
		if (ret == -ENOSPC)
			ret = 1;
		goto out;
	}

	/* COW may have changed the leaf, recheck the free space. */
	free_space = btrfs_leaf_free_space(left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	if (check_sibling_keys(left, right)) {
		ret = -EUCLEAN;
		goto out;
	}
	/* __push_leaf_left() drops the lock and ref on 'left' itself. */
	return __push_leaf_left(path, min_data_size,
			       empty, left, free_space, right_nritems,
			       max_slot);
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
4093
/*
 * Copy the items from slot 'mid' onward out of leaf 'l' into the new leaf
 * 'right', link 'right' into the parent at path->slots[1] + 1 and fix up
 * the path so it points at the leaf that holds the original slot.
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	/* Number of items that move to the new right leaf. */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);

	/* Copy the item headers for slots [mid, end) into the new leaf. */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	copy_extent_buffer(right, l,
		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(l), data_copy_size);

	/* How far each copied item's data moved towards the leaf's end. */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);

	/* Rewrite the copied item offsets to match their new location. */
	btrfs_init_map_token(&token, right);
	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(i);
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, item);
		btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
	}

	/* Shrink the original leaf and link the new one into the parent. */
	btrfs_set_header_nritems(l, mid);
	btrfs_item_key(right, &disk_key, 0);
	insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/*
	 * If our slot moved to the new leaf, swap it into path->nodes[0];
	 * otherwise release our lock and reference on the right leaf.
	 */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}
4156
4157/*
4158 * double splits happen when we need to insert a big item in the middle
4159 * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4160 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4161 *          A                 B                 C
4162 *
4163 * We avoid this by trying to push the items on either side of our target
4164 * into the adjacent leaves.  If all goes well we can avoid the double split
4165 * completely.
4166 */
4167static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4168                                          struct btrfs_root *root,
4169                                          struct btrfs_path *path,
4170                                          int data_size)
4171{
4172        int ret;
4173        int progress = 0;
4174        int slot;
4175        u32 nritems;
4176        int space_needed = data_size;
4177
4178        slot = path->slots[0];
4179        if (slot < btrfs_header_nritems(path->nodes[0]))
4180                space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4181
4182        /*
4183         * try to push all the items after our slot into the
4184         * right leaf
4185         */
4186        ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4187        if (ret < 0)
4188                return ret;
4189
4190        if (ret == 0)
4191                progress++;
4192
4193        nritems = btrfs_header_nritems(path->nodes[0]);
4194        /*
4195         * our goal is to get our slot at the start or end of a leaf.  If
4196         * we've done so we're done
4197         */
4198        if (path->slots[0] == 0 || path->slots[0] == nritems)
4199                return 0;
4200
4201        if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4202                return 0;
4203
4204        /* try to push all the items before our slot into the next leaf */
4205        slot = path->slots[0];
4206        space_needed = data_size;
4207        if (slot > 0)
4208                space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4209        ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4210        if (ret < 0)
4211                return ret;
4212
4213        if (ret == 0)
4214                progress++;
4215
4216        if (progress)
4217                return 0;
4218        return 1;
4219}
4220
4221/*
4222 * split the path's leaf in two, making sure there is at least data_size
4223 * available for the resulting leaf level of the path.
4224 *
4225 * returns 0 if all went well and < 0 on failure.
4226 */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       const struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int wret;
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	/* An extended item can never exceed the capacity of an empty leaf. */
	if (extend && data_size + btrfs_item_size_nr(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size && path->nodes[1]) {
		int space_needed = data_size;

		if (slot < btrfs_header_nritems(l))
			space_needed -= btrfs_leaf_free_space(l);

		wret = push_leaf_right(trans, root, path, space_needed,
				       space_needed, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			space_needed = data_size;
			if (slot > 0)
				space_needed -= btrfs_leaf_free_space(l);
			wret = push_leaf_left(trans, root, path, space_needed,
					      space_needed, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(l) >= data_size)
			return 0;
	}

	/* The leaf is the root; grow the tree one level before splitting. */
	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	/*
	 * Choose the split point 'mid' so the half holding our slot has
	 * room for data_size.  split == 0 means the new item goes alone
	 * into a fresh empty leaf; split == 2 means a double split is
	 * required (see push_for_double_split(), which we try first).
	 */
	if (mid <= slot) {
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (slot >= nritems) {
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (!extend && data_size && slot == 0) {
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	}

	/* The new leaf's first key: the insertion key or the key at 'mid'. */
	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	/*
	 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
	 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
	 * subclasses, which is 8 at the time of this patch, and we've maxed it
	 * out.  In the future we could add a
	 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
	 * use BTRFS_NESTING_NEW_ROOT.
	 */
	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
					     l->start, 0, num_doubles ?
					     BTRFS_NESTING_NEW_ROOT :
					     BTRFS_NESTING_SPLIT);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, fs_info->nodesize);

	if (split == 0) {
		/* The new item will be the sole occupant of the new leaf. */
		if (mid <= slot) {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, path, &disk_key,
				   right->start, path->slots[1] + 1, 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, path, &disk_key,
				   right->start, path->slots[1], 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			if (path->slots[1] == 0)
				fixup_low_keys(path, &disk_key, 1);
		}
		/*
		 * We create a new leaf 'right' for the required ins_len and
		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
		 * the content of ins_len to 'right'.
		 */
		return ret;
	}

	copy_for_split(trans, path, l, right, slot, mid, nritems);

	if (split == 2) {
		/* A double split is never needed more than once. */
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return 0;

push_for_double:
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
		return 0;
	goto again;
}
4396
/*
 * Make sure the leaf at path->slots[0] has at least @ins_len bytes of
 * free space, splitting it if necessary.  The path may be released and
 * re-searched.  Returns 0 on success, -EAGAIN if the target item changed
 * while the path was dropped (the caller should retry), or a negative
 * errno on error.
 */
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path, int ins_len)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	u64 extent_len = 0;
	u32 item_size;
	int ret;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	/* Only extent data and csum items are split this way. */
	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
	       key.type != BTRFS_EXTENT_CSUM_KEY);

	if (btrfs_leaf_free_space(leaf) >= ins_len)
		return 0;

	/*
	 * Remember the item size (and extent length for file extents) so
	 * we can detect if the item changed while the path was released.
	 */
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
	}
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->search_for_split = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	path->search_for_split = 0;
	/* ret > 0 means the exact key is gone; treat it as "retry". */
	if (ret > 0)
		ret = -EAGAIN;
	if (ret < 0)
		goto err;

	ret = -EAGAIN;
	leaf = path->nodes[0];
	/* if our item isn't there, return now */
	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
		goto err;

	/* the leaf has changed, it now has room.  return now */
	if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	btrfs_set_path_blocking(path);
	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}
4463
/*
 * Split the item at path->slots[0] in two at @split_offset bytes into
 * its data.  The tail becomes a new item carrying @new_key, inserted
 * right after the (shrunk) original.  The leaf must already have room
 * for one extra struct btrfs_item header.
 */
static noinline int split_item(struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* Stash a copy of the item's data; it is rewritten in place below. */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	/* Open a gap in the item headers for the new item, if needed. */
	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(slot);

	/*
	 * The data area itself does not move: the new item points at the
	 * start of the old data and holds the tail, while the original
	 * item keeps the first split_offset bytes at a higher offset.
	 */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
	kfree(buf);
	return 0;
}
4532
4533/*
4534 * This function splits a single item into two items,
4535 * giving 'new_key' to the new item and splitting the
4536 * old one at split_offset (from the start of the item).
4537 *
4538 * The path may be released by this operation.  After
4539 * the split, the path is pointing to the old item.  The
4540 * new item is going to be in the same node as the old one.
4541 *
 * Note, the item being split must be small enough to live alone on
4543 * a tree block with room for one extra struct btrfs_item
4544 *
4545 * This allows us to split the item in place, keeping a lock on the
4546 * leaf the entire time.
4547 */
4548int btrfs_split_item(struct btrfs_trans_handle *trans,
4549                     struct btrfs_root *root,
4550                     struct btrfs_path *path,
4551                     const struct btrfs_key *new_key,
4552                     unsigned long split_offset)
4553{
4554        int ret;
4555        ret = setup_leaf_for_split(trans, root, path,
4556                                   sizeof(struct btrfs_item));
4557        if (ret)
4558                return ret;
4559
4560        ret = split_item(path, new_key, split_offset);
4561        return ret;
4562}
4563
4564/*
 * This function duplicates an item, giving 'new_key' to the new item.
4566 * It guarantees both items live in the same tree leaf and the new item
4567 * is contiguous with the original item.
4568 *
4569 * This allows us to split file extent in place, keeping a lock on the
4570 * leaf the entire time.
4571 */
4572int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4573                         struct btrfs_root *root,
4574                         struct btrfs_path *path,
4575                         const struct btrfs_key *new_key)
4576{
4577        struct extent_buffer *leaf;
4578        int ret;
4579        u32 item_size;
4580
4581        leaf = path->nodes[0];
4582        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4583        ret = setup_leaf_for_split(trans, root, path,
4584                                   item_size + sizeof(struct btrfs_item));
4585        if (ret)
4586                return ret;
4587
4588        path->slots[0]++;
4589        setup_items_for_insert(root, path, new_key, &item_size, 1);
4590        leaf = path->nodes[0];
4591        memcpy_extent_buffer(leaf,
4592                             btrfs_item_ptr_offset(leaf, path->slots[0]),
4593                             btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4594                             item_size);
4595        return 0;
4596}
4597
4598/*
4599 * make the item pointed to by the path smaller.  new_size indicates
4600 * how small to make it, and from_end tells us if we just chop bytes
4601 * off the end of the item or if we shift the item to chop bytes off
4602 * the front.
4603 */
void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	/* Nothing to do if the item already has the requested size. */
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	old_data_start = btrfs_item_offset_nr(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(&token, item);
		btrfs_set_token_item_offset(&token, item, ioff + size_diff);
	}

	/* shift the data */
	if (from_end) {
		/* Chop from the end: slide the data below the cut upward. */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			/*
			 * An inline extent's header must stay at the front
			 * of the item, so move it before shifting the data.
			 */
			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		/* Chop from the front: slide this item's whole data upward. */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start - data_end);

		/*
		 * Removing bytes from the front advances the key offset by
		 * the same amount.
		 */
		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(path, &disk_key, 1);
	}

	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);

	/* A truncate must never leave the leaf over-full. */
	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4696
4697/*
4698 * make the item pointed to by the path bigger, data_size is the added size.
4699 */
void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	/* The caller must have made room in the leaf beforehand. */
	if (btrfs_leaf_free_space(leaf) < data_size) {
		btrfs_print_leaf(leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_end_nr(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(leaf);
		btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
			   slot, nritems);
		BUG();
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(&token, item);
		btrfs_set_token_item_offset(&token, item, ioff - data_size);
	}

	/* shift the data */
	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
		      data_end, old_data - data_end);

	/* The gap we opened sits at the end of our item's data. */
	data_end = old_data;
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
4761
4762/**
4763 * setup_items_for_insert - Helper called before inserting one or more items
4764 * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
4765 * in a function that doesn't call btrfs_search_slot
4766 *
4767 * @root:       root we are inserting items to
4768 * @path:       points to the leaf/slot where we are going to insert new items
4769 * @cpu_key:    array of keys for items to be inserted
4770 * @data_size:  size of the body of each item we are going to insert
4771 * @nr:         size of @cpu_key/@data_size arrays
4772 */
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
                            const struct btrfs_key *cpu_key, u32 *data_size,
                            int nr)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_item *item;
        int i;
        u32 nritems;
        unsigned int data_end;
        struct btrfs_disk_key disk_key;
        struct extent_buffer *leaf;
        int slot;
        struct btrfs_map_token token;
        u32 total_size;
        u32 total_data = 0;

        /* total_data counts the item bodies; total_size adds the headers */
        for (i = 0; i < nr; i++)
                total_data += data_size[i];
        total_size = total_data + (nr * sizeof(struct btrfs_item));

        /*
         * Inserting at slot 0 changes the smallest key of this leaf, so the
         * keys recorded in the parent nodes must be updated to match.
         */
        if (path->slots[0] == 0) {
                btrfs_cpu_key_to_disk(&disk_key, cpu_key);
                fixup_low_keys(path, &disk_key, 1);
        }
        btrfs_unlock_up_safe(path, 1);

        leaf = path->nodes[0];
        slot = path->slots[0];

        nritems = btrfs_header_nritems(leaf);
        data_end = leaf_data_end(leaf);

        /* The caller must already have made enough room in this leaf */
        if (btrfs_leaf_free_space(leaf) < total_size) {
                btrfs_print_leaf(leaf);
                btrfs_crit(fs_info, "not enough freespace need %u have %d",
                           total_size, btrfs_leaf_free_space(leaf));
                BUG();
        }

        btrfs_init_map_token(&token, leaf);
        if (slot != nritems) {
                /* Inserting in the middle: shift existing items/data aside */
                unsigned int old_data = btrfs_item_end_nr(leaf, slot);

                if (old_data < data_end) {
                        btrfs_print_leaf(leaf);
                        btrfs_crit(fs_info,
                "item at slot %d with data offset %u beyond data end of leaf %u",
                                   slot, old_data, data_end);
                        BUG();
                }
                /*
                 * item0..itemN ... dataN.offset..dataN.size .. data0.size
                 */
                /* first correct the data pointers */
                for (i = slot; i < nritems; i++) {
                        u32 ioff;

                        item = btrfs_item_nr(i);
                        ioff = btrfs_token_item_offset(&token, item);
                        btrfs_set_token_item_offset(&token, item,
                                                    ioff - total_data);
                }
                /* shift the items */
                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
                              btrfs_item_nr_offset(slot),
                              (nritems - slot) * sizeof(struct btrfs_item));

                /* shift the data */
                memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
                              data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
                              data_end, old_data - data_end);
                data_end = old_data;
        }

        /* setup the item for the new data */
        for (i = 0; i < nr; i++) {
                btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
                btrfs_set_item_key(leaf, &disk_key, slot + i);
                item = btrfs_item_nr(slot + i);
                /* item data grows downward from the end of the leaf */
                data_end -= data_size[i];
                btrfs_set_token_item_offset(&token, item, data_end);
                btrfs_set_token_item_size(&token, item, data_size[i]);
        }

        btrfs_set_header_nritems(leaf, nritems + nr);
        btrfs_mark_buffer_dirty(leaf);

        /* sanity check: we must not have overfilled the leaf */
        if (btrfs_leaf_free_space(leaf) < 0) {
                btrfs_print_leaf(leaf);
                BUG();
        }
}
4865
4866/*
4867 * Given a key and some data, insert items into the tree.
4868 * This does all the path init required, making room in the tree if needed.
4869 */
4870int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4871                            struct btrfs_root *root,
4872                            struct btrfs_path *path,
4873                            const struct btrfs_key *cpu_key, u32 *data_size,
4874                            int nr)
4875{
4876        int ret = 0;
4877        int slot;
4878        int i;
4879        u32 total_size = 0;
4880        u32 total_data = 0;
4881
4882        for (i = 0; i < nr; i++)
4883                total_data += data_size[i];
4884
4885        total_size = total_data + (nr * sizeof(struct btrfs_item));
4886        ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4887        if (ret == 0)
4888                return -EEXIST;
4889        if (ret < 0)
4890                return ret;
4891
4892        slot = path->slots[0];
4893        BUG_ON(slot < 0);
4894
4895        setup_items_for_insert(root, path, cpu_key, data_size, nr);
4896        return 0;
4897}
4898
4899/*
4900 * Given a key and some data, insert an item into the tree.
4901 * This does all the path init required, making room in the tree if needed.
4902 */
4903int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4904                      const struct btrfs_key *cpu_key, void *data,
4905                      u32 data_size)
4906{
4907        int ret = 0;
4908        struct btrfs_path *path;
4909        struct extent_buffer *leaf;
4910        unsigned long ptr;
4911
4912        path = btrfs_alloc_path();
4913        if (!path)
4914                return -ENOMEM;
4915        ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4916        if (!ret) {
4917                leaf = path->nodes[0];
4918                ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4919                write_extent_buffer(leaf, data, ptr, data_size);
4920                btrfs_mark_buffer_dirty(leaf);
4921        }
4922        btrfs_free_path(path);
4923        return ret;
4924}
4925
4926/*
4927 * delete the pointer from a given node.
4928 *
4929 * the tree should have been previously balanced so the deletion does not
4930 * empty a node.
4931 */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
                    int level, int slot)
{
        struct extent_buffer *parent = path->nodes[level];
        u32 nritems;
        int ret;

        nritems = btrfs_header_nritems(parent);
        if (slot != nritems - 1) {
                /* Removing from the middle: record the move, close the gap */
                if (level) {
                        ret = tree_mod_log_insert_move(parent, slot, slot + 1,
                                        nritems - slot - 1);
                        BUG_ON(ret < 0);
                }
                memmove_extent_buffer(parent,
                              btrfs_node_key_ptr_offset(slot),
                              btrfs_node_key_ptr_offset(slot + 1),
                              sizeof(struct btrfs_key_ptr) *
                              (nritems - slot - 1));
        } else if (level) {
                /* Last slot: nothing to move, only log the key removal */
                ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
                                GFP_NOFS);
                BUG_ON(ret < 0);
        }

        nritems--;
        btrfs_set_header_nritems(parent, nritems);
        if (nritems == 0 && parent == root->node) {
                BUG_ON(btrfs_header_level(root->node) != 1);
                /* just turn the root into a leaf and break */
                btrfs_set_header_level(root->node, 0);
        } else if (slot == 0) {
                struct btrfs_disk_key disk_key;

                /* first key changed: propagate the new low key upwards */
                btrfs_node_key(parent, &disk_key, 0);
                fixup_low_keys(path, &disk_key, level + 1);
        }
        btrfs_mark_buffer_dirty(parent);
}
4971
4972/*
4973 * a helper function to delete the leaf pointed to by path->slots[1] and
4974 * path->nodes[1].
4975 *
4976 * This deletes the pointer in path->nodes[1] and frees the leaf
4977 * block extent.  zero is returned if it all worked out, < 0 otherwise.
4978 *
4979 * The path must have already been setup for deleting the leaf, including
4980 * all the proper balancing.  path->nodes[1] must be locked.
4981 */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root,
                                    struct btrfs_path *path,
                                    struct extent_buffer *leaf)
{
        /* the leaf should already be COWed into this transaction */
        WARN_ON(btrfs_header_generation(leaf) != trans->transid);
        del_ptr(root, path, 1, path->slots[1]);

        /*
         * btrfs_free_extent is expensive, we want to make sure we
         * aren't holding any locks when we call it
         */
        btrfs_unlock_up_safe(path, 0);

        root_sub_used(root, leaf->len);

        /* take an extra ref so the buffer survives being freed below */
        atomic_inc(&leaf->refs);
        btrfs_free_tree_block(trans, root, leaf, 0, 1);
        free_extent_buffer_stale(leaf);
}
5002/*
5003 * delete the item at the leaf level in path.  If that empties
5004 * the leaf, remove it from the tree
5005 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                    struct btrfs_path *path, int slot, int nr)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        u32 last_off;
        u32 dsize = 0;
        int ret = 0;
        int wret;
        int i;
        u32 nritems;

        leaf = path->nodes[0];
        /* lowest data offset among the items being removed */
        last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

        /* dsize: total data size of all nr items being deleted */
        for (i = 0; i < nr; i++)
                dsize += btrfs_item_size_nr(leaf, slot + i);

        nritems = btrfs_header_nritems(leaf);

        if (slot + nr != nritems) {
                /* not deleting at the tail: compact the surviving items */
                int data_end = leaf_data_end(leaf);
                struct btrfs_map_token token;

                /* slide the remaining item data over the freed space */
                memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
                              data_end + dsize,
                              BTRFS_LEAF_DATA_OFFSET + data_end,
                              last_off - data_end);

                /* fix up the data offsets of the items that just moved */
                btrfs_init_map_token(&token, leaf);
                for (i = slot + nr; i < nritems; i++) {
                        u32 ioff;

                        item = btrfs_item_nr(i);
                        ioff = btrfs_token_item_offset(&token, item);
                        btrfs_set_token_item_offset(&token, item, ioff + dsize);
                }

                /* close the gap in the item header array */
                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
                              btrfs_item_nr_offset(slot + nr),
                              sizeof(struct btrfs_item) *
                              (nritems - slot - nr));
        }
        btrfs_set_header_nritems(leaf, nritems - nr);
        nritems -= nr;

        /* delete the leaf if we've emptied it */
        if (nritems == 0) {
                if (leaf == root->node) {
                        /* an empty root leaf just becomes a level-0 root */
                        btrfs_set_header_level(leaf, 0);
                } else {
                        btrfs_set_path_blocking(path);
                        btrfs_clean_tree_block(leaf);
                        btrfs_del_leaf(trans, root, path, leaf);
                }
        } else {
                int used = leaf_space_used(leaf, 0, nritems);
                if (slot == 0) {
                        struct btrfs_disk_key disk_key;

                        /* deleting slot 0 changed the leaf's first key */
                        btrfs_item_key(leaf, &disk_key, 0);
                        fixup_low_keys(path, &disk_key, 1);
                }

                /* delete the leaf if it is mostly empty */
                if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
                        /* push_leaf_left fixes the path.
                         * make sure the path still points to our leaf
                         * for possible call to del_ptr below
                         */
                        slot = path->slots[1];
                        atomic_inc(&leaf->refs);

                        btrfs_set_path_blocking(path);
                        wret = push_leaf_left(trans, root, path, 1, 1,
                                              1, (u32)-1);
                        if (wret < 0 && wret != -ENOSPC)
                                ret = wret;

                        /* left push didn't empty it, try pushing right too */
                        if (path->nodes[0] == leaf &&
                            btrfs_header_nritems(leaf)) {
                                wret = push_leaf_right(trans, root, path, 1,
                                                       1, 1, 0);
                                if (wret < 0 && wret != -ENOSPC)
                                        ret = wret;
                        }

                        if (btrfs_header_nritems(leaf) == 0) {
                                path->slots[1] = slot;
                                btrfs_del_leaf(trans, root, path, leaf);
                                free_extent_buffer(leaf);
                                ret = 0;
                        } else {
                                /* if we're still in the path, make sure
                                 * we're dirty.  Otherwise, one of the
                                 * push_leaf functions must have already
                                 * dirtied this buffer
                                 */
                                if (path->nodes[0] == leaf)
                                        btrfs_mark_buffer_dirty(leaf);
                                free_extent_buffer(leaf);
                        }
                } else {
                        btrfs_mark_buffer_dirty(leaf);
                }
        }
        return ret;
}
5115
5116/*
5117 * search the tree again to find a leaf with lesser keys
5118 * returns 0 if it found something or 1 if there are no lesser leaves.
5119 * returns < 0 on io errors.
5120 *
5121 * This may release the path, and so you may lose any locks held at the
5122 * time you call it.
5123 */
5124int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5125{
5126        struct btrfs_key key;
5127        struct btrfs_disk_key found_key;
5128        int ret;
5129
5130        btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5131
5132        if (key.offset > 0) {
5133                key.offset--;
5134        } else if (key.type > 0) {
5135                key.type--;
5136                key.offset = (u64)-1;
5137        } else if (key.objectid > 0) {
5138                key.objectid--;
5139                key.type = (u8)-1;
5140                key.offset = (u64)-1;
5141        } else {
5142                return 1;
5143        }
5144
5145        btrfs_release_path(path);
5146        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5147        if (ret < 0)
5148                return ret;
5149        btrfs_item_key(path->nodes[0], &found_key, 0);
5150        ret = comp_keys(&found_key, &key);
5151        /*
5152         * We might have had an item with the previous key in the tree right
5153         * before we released our path. And after we released our path, that
5154         * item might have been pushed to the first slot (0) of the leaf we
5155         * were holding due to a tree balance. Alternatively, an item with the
5156         * previous key can exist as the only element of a leaf (big fat item).
5157         * Therefore account for these 2 cases, so that our callers (like
5158         * btrfs_previous_item) don't miss an existing item with a key matching
5159         * the previous key we computed above.
5160         */
5161        if (ret <= 0)
5162                return 0;
5163        return 1;
5164}
5165
5166/*
5167 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that have a minimum transaction id.
5169 * This is used by the btree defrag code, and tree logging
5170 *
5171 * This does not cow, but it does stuff the starting key it finds back
5172 * into min_key, so you can call btrfs_search_slot with cow=1 on the
5173 * key and get a writable path.
5174 *
5175 * This honors path->lowest_level to prevent descent past a given level
5176 * of the tree.
5177 *
5178 * min_trans indicates the oldest transaction that you are interested
5179 * in walking through.  Any nodes or leaves older than min_trans are
5180 * skipped over (without reading them).
5181 *
5182 * returns zero if something useful was found, < 0 on error and 1 if there
5183 * was nothing in the tree that matched the search criteria.
5184 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
                         struct btrfs_path *path,
                         u64 min_trans)
{
        struct extent_buffer *cur;
        struct btrfs_key found_key;
        int slot;
        int sret;
        u32 nritems;
        int level;
        int ret = 1;
        int keep_locks = path->keep_locks;

        /* keep locks held so btrfs_find_next_key() can walk the path */
        path->keep_locks = 1;
again:
        cur = btrfs_read_lock_root_node(root);
        level = btrfs_header_level(cur);
        WARN_ON(path->nodes[level]);
        path->nodes[level] = cur;
        path->locks[level] = BTRFS_READ_LOCK;

        /* the whole tree is older than min_trans: nothing to find */
        if (btrfs_header_generation(cur) < min_trans) {
                ret = 1;
                goto out;
        }
        while (1) {
                nritems = btrfs_header_nritems(cur);
                level = btrfs_header_level(cur);
                sret = btrfs_bin_search(cur, min_key, &slot);
                if (sret < 0) {
                        ret = sret;
                        goto out;
                }

                /* at the lowest level, we're done, setup the path and exit */
                if (level == path->lowest_level) {
                        if (slot >= nritems)
                                goto find_next_key;
                        ret = 0;
                        path->slots[level] = slot;
                        btrfs_item_key_to_cpu(cur, &found_key, slot);
                        goto out;
                }
                /* key not found: step back to the slot covering it */
                if (sret && slot > 0)
                        slot--;
                /*
                 * check this node pointer against the min_trans parameters.
                 * If it is too old, skip to the next one.
                 */
                while (slot < nritems) {
                        u64 gen;

                        gen = btrfs_node_ptr_generation(cur, slot);
                        if (gen < min_trans) {
                                slot++;
                                continue;
                        }
                        break;
                }
find_next_key:
                /*
                 * we didn't find a candidate key in this node, walk forward
                 * and find another one
                 */
                if (slot >= nritems) {
                        path->slots[level] = slot;
                        btrfs_set_path_blocking(path);
                        sret = btrfs_find_next_key(root, path, min_key, level,
                                                  min_trans);
                        if (sret == 0) {
                                /* min_key was advanced: restart at the root */
                                btrfs_release_path(path);
                                goto again;
                        } else {
                                goto out;
                        }
                }
                /* save our key for returning back */
                btrfs_node_key_to_cpu(cur, &found_key, slot);
                path->slots[level] = slot;
                if (level == path->lowest_level) {
                        ret = 0;
                        goto out;
                }
                btrfs_set_path_blocking(path);
                cur = btrfs_read_node_slot(cur, slot);
                if (IS_ERR(cur)) {
                        ret = PTR_ERR(cur);
                        goto out;
                }

                btrfs_tree_read_lock(cur);

                /* descend one level; drop locks above that are now safe */
                path->locks[level - 1] = BTRFS_READ_LOCK;
                path->nodes[level - 1] = cur;
                unlock_up(path, level, 1, 0, NULL);
        }
out:
        path->keep_locks = keep_locks;
        if (ret == 0) {
                btrfs_unlock_up_safe(path, path->lowest_level + 1);
                btrfs_set_path_blocking(path);
                /* hand the key we found back to the caller */
                memcpy(min_key, &found_key, sizeof(found_key));
        }
        return ret;
}
5290
5291/*
5292 * this is similar to btrfs_next_leaf, but does not try to preserve
5293 * and fixup the path.  It looks for and returns the next key in the
5294 * tree based on the current path and the min_trans parameters.
5295 *
5296 * 0 is returned if another key is found, < 0 if there are any errors
5297 * and 1 is returned if there are no higher keys in the tree
5298 *
5299 * path->keep_locks should be set to 1 on the search made before
5300 * calling this function.
5301 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
                        struct btrfs_key *key, int level, u64 min_trans)
{
        int slot;
        struct extent_buffer *c;

        WARN_ON(!path->keep_locks && !path->skip_locking);
        /* walk up the path until some node has a slot to the right of ours */
        while (level < BTRFS_MAX_LEVEL) {
                if (!path->nodes[level])
                        return 1;

                slot = path->slots[level] + 1;
                c = path->nodes[level];
next:
                if (slot >= btrfs_header_nritems(c)) {
                        int ret;
                        int orig_lowest;
                        struct btrfs_key cur_key;
                        if (level + 1 >= BTRFS_MAX_LEVEL ||
                            !path->nodes[level + 1])
                                return 1;

                        if (path->locks[level + 1] || path->skip_locking) {
                                level++;
                                continue;
                        }

                        /*
                         * The parent isn't locked, so re-search from the last
                         * key of this node to get a revalidated path before
                         * continuing the walk.
                         */
                        slot = btrfs_header_nritems(c) - 1;
                        if (level == 0)
                                btrfs_item_key_to_cpu(c, &cur_key, slot);
                        else
                                btrfs_node_key_to_cpu(c, &cur_key, slot);

                        orig_lowest = path->lowest_level;
                        btrfs_release_path(path);
                        path->lowest_level = level;
                        ret = btrfs_search_slot(NULL, root, &cur_key, path,
                                                0, 0);
                        path->lowest_level = orig_lowest;
                        if (ret < 0)
                                return ret;

                        c = path->nodes[level];
                        slot = path->slots[level];
                        /* exact match: advance past the key we searched for */
                        if (ret == 0)
                                slot++;
                        goto next;
                }

                if (level == 0)
                        btrfs_item_key_to_cpu(c, key, slot);
                else {
                        u64 gen = btrfs_node_ptr_generation(c, slot);

                        /* skip node pointers older than min_trans */
                        if (gen < min_trans) {
                                slot++;
                                goto next;
                        }
                        btrfs_node_key_to_cpu(c, key, slot);
                }
                return 0;
        }
        return 1;
}
5366
5367/*
5368 * search the tree again to find a leaf with greater keys
5369 * returns 0 if it found something or 1 if there are no greater leaves.
5370 * returns < 0 on io errors.
5371 */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
        /* time_seq == 0: walk the current (live) version of the tree */
        return btrfs_next_old_leaf(root, path, 0);
}
5376
/*
 * Like btrfs_next_leaf() but, when @time_seq is non-zero, searches an old
 * version of the tree via btrfs_search_old_slot().
 */
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
                        u64 time_seq)
{
        int slot;
        int level;
        struct extent_buffer *c;
        struct extent_buffer *next;
        struct btrfs_key key;
        u32 nritems;
        int ret;
        int old_spinning = path->leave_spinning;
        int next_rw_lock = 0;

        nritems = btrfs_header_nritems(path->nodes[0]);
        if (nritems == 0)
                return 1;

        /* remember the last key of this leaf and re-search from it below */
        btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
        level = 1;
        next = NULL;
        next_rw_lock = 0;
        btrfs_release_path(path);

        path->keep_locks = 1;
        path->leave_spinning = 1;

        if (time_seq)
                ret = btrfs_search_old_slot(root, &key, path, time_seq);
        else
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        path->keep_locks = 0;

        if (ret < 0)
                return ret;

        nritems = btrfs_header_nritems(path->nodes[0]);
        /*
         * by releasing the path above we dropped all our locks.  A balance
         * could have added more items next to the key that used to be
         * at the very end of the block.  So, check again here and
         * advance the path if there are now more items available.
         */
        if (nritems > 0 && path->slots[0] < nritems - 1) {
                if (ret == 0)
                        path->slots[0]++;
                ret = 0;
                goto done;
        }
        /*
         * So the above check misses one case:
         * - after releasing the path above, someone has removed the item that
         *   used to be at the very end of the block, and balance between leafs
         *   gets another one with bigger key.offset to replace it.
         *
         * This one should be returned as well, or we can get leaf corruption
         * later(esp. in __btrfs_drop_extents()).
         *
         * And a bit more explanation about this check,
         * with ret > 0, the key isn't found, the path points to the slot
         * where it should be inserted, so the path->slots[0] item must be the
         * bigger one.
         */
        if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
                ret = 0;
                goto done;
        }

        /* walk up until a node has a slot to the right of our path */
        while (level < BTRFS_MAX_LEVEL) {
                if (!path->nodes[level]) {
                        ret = 1;
                        goto done;
                }

                slot = path->slots[level] + 1;
                c = path->nodes[level];
                if (slot >= btrfs_header_nritems(c)) {
                        level++;
                        if (level == BTRFS_MAX_LEVEL) {
                                /* no right neighbour anywhere: last leaf */
                                ret = 1;
                                goto done;
                        }
                        continue;
                }

                /* drop the buffer picked up by a previous iteration */
                if (next) {
                        btrfs_tree_unlock_rw(next, next_rw_lock);
                        free_extent_buffer(next);
                }

                next = c;
                next_rw_lock = path->locks[level];
                ret = read_block_for_search(root, path, &next, level,
                                            slot, &key);
                if (ret == -EAGAIN)
                        goto again;

                if (ret < 0) {
                        btrfs_release_path(path);
                        goto done;
                }

                if (!path->skip_locking) {
                        ret = btrfs_try_tree_read_lock(next);
                        if (!ret && time_seq) {
                                /*
                                 * If we don't get the lock, we may be racing
                                 * with push_leaf_left, holding that lock while
                                 * itself waiting for the leaf we've currently
                                 * locked. To solve this situation, we give up
                                 * on our lock and cycle.
                                 */
                                free_extent_buffer(next);
                                btrfs_release_path(path);
                                cond_resched();
                                goto again;
                        }
                        if (!ret) {
                                /* try-lock failed: take the blocking lock */
                                btrfs_set_path_blocking(path);
                                __btrfs_tree_read_lock(next,
                                                       BTRFS_NESTING_RIGHT,
                                                       path->recurse);
                        }
                        next_rw_lock = BTRFS_READ_LOCK;
                }
                break;
        }
        path->slots[level] = slot;
        /* descend along the left-most edge of the right neighbour */
        while (1) {
                level--;
                c = path->nodes[level];
                if (path->locks[level])
                        btrfs_tree_unlock_rw(c, path->locks[level]);

                free_extent_buffer(c);
                path->nodes[level] = next;
                path->slots[level] = 0;
                if (!path->skip_locking)
                        path->locks[level] = next_rw_lock;
                if (!level)
                        break;

                ret = read_block_for_search(root, path, &next, level,
                                            0, &key);
                if (ret == -EAGAIN)
                        goto again;

                if (ret < 0) {
                        btrfs_release_path(path);
                        goto done;
                }

                if (!path->skip_locking) {
                        ret = btrfs_try_tree_read_lock(next);
                        if (!ret) {
                                btrfs_set_path_blocking(path);
                                __btrfs_tree_read_lock(next,
                                                       BTRFS_NESTING_RIGHT,
                                                       path->recurse);
                        }
                        next_rw_lock = BTRFS_READ_LOCK;
                }
        }
        ret = 0;
done:
        unlock_up(path, 0, 1, 0, NULL);
        path->leave_spinning = old_spinning;
        if (!old_spinning)
                btrfs_set_path_blocking(path);

        return ret;
}
5549
5550/*
5551 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5552 * searching until it gets past min_objectid or finds an item of 'type'
5553 *
5554 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5555 */
5556int btrfs_previous_item(struct btrfs_root *root,
5557                        struct btrfs_path *path, u64 min_objectid,
5558                        int type)
5559{
5560        struct btrfs_key found_key;
5561        struct extent_buffer *leaf;
5562        u32 nritems;
5563        int ret;
5564
5565        while (1) {
5566                if (path->slots[0] == 0) {
5567                        btrfs_set_path_blocking(path);
5568                        ret = btrfs_prev_leaf(root, path);
5569                        if (ret != 0)
5570                                return ret;
5571                } else {
5572                        path->slots[0]--;
5573                }
5574                leaf = path->nodes[0];
5575                nritems = btrfs_header_nritems(leaf);
5576                if (nritems == 0)
5577                        return 1;
5578                if (path->slots[0] == nritems)
5579                        path->slots[0]--;
5580
5581                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5582                if (found_key.objectid < min_objectid)
5583                        break;
5584                if (found_key.type == type)
5585                        return 0;
5586                if (found_key.objectid == min_objectid &&
5587                    found_key.type < type)
5588                        break;
5589        }
5590        return 1;
5591}
5592
5593/*
5594 * search in extent tree to find a previous Metadata/Data extent item with
5595 * min objecitd.
5596 *
5597 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5598 */
5599int btrfs_previous_extent_item(struct btrfs_root *root,
5600                        struct btrfs_path *path, u64 min_objectid)
5601{
5602        struct btrfs_key found_key;
5603        struct extent_buffer *leaf;
5604        u32 nritems;
5605        int ret;
5606
5607        while (1) {
5608                if (path->slots[0] == 0) {
5609                        btrfs_set_path_blocking(path);
5610                        ret = btrfs_prev_leaf(root, path);
5611                        if (ret != 0)
5612                                return ret;
5613                } else {
5614                        path->slots[0]--;
5615                }
5616                leaf = path->nodes[0];
5617                nritems = btrfs_header_nritems(leaf);
5618                if (nritems == 0)
5619                        return 1;
5620                if (path->slots[0] == nritems)
5621                        path->slots[0]--;
5622
5623                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5624                if (found_key.objectid < min_objectid)
5625                        break;
5626                if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5627                    found_key.type == BTRFS_METADATA_ITEM_KEY)
5628                        return 0;
5629                if (found_key.objectid == min_objectid &&
5630                    found_key.type < BTRFS_EXTENT_ITEM_KEY)
5631                        break;
5632        }
5633        return 1;
5634}
5635