linux/fs/btrfs/relocation.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2009 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 021110-1307, USA.
  17 */
  18
  19#include <linux/sched.h>
  20#include <linux/pagemap.h>
  21#include <linux/writeback.h>
  22#include <linux/blkdev.h>
  23#include <linux/rbtree.h>
  24#include <linux/slab.h>
  25#include "ctree.h"
  26#include "disk-io.h"
  27#include "transaction.h"
  28#include "volumes.h"
  29#include "locking.h"
  30#include "btrfs_inode.h"
  31#include "async-thread.h"
  32#include "free-space-cache.h"
  33#include "inode-map.h"
  34
  35/*
  36 * backref_node, mapping_node and tree_block start with this
  37 */
  38struct tree_entry {
  39        struct rb_node rb_node;
  40        u64 bytenr;
  41};
  42
  43/*
  44 * present a tree block in the backref cache
  45 */
  46struct backref_node {
  47        struct rb_node rb_node;
  48        u64 bytenr;
  49
  50        u64 new_bytenr;
  51        /* objectid of tree block owner, can be not uptodate */
  52        u64 owner;
  53        /* link to pending, changed or detached list */
  54        struct list_head list;
  55        /* list of upper level blocks reference this block */
  56        struct list_head upper;
  57        /* list of child blocks in the cache */
  58        struct list_head lower;
  59        /* NULL if this node is not tree root */
  60        struct btrfs_root *root;
  61        /* extent buffer got by COW the block */
  62        struct extent_buffer *eb;
  63        /* level of tree block */
  64        unsigned int level:8;
  65        /* is the block in non-reference counted tree */
  66        unsigned int cowonly:1;
  67        /* 1 if no child node in the cache */
  68        unsigned int lowest:1;
  69        /* is the extent buffer locked */
  70        unsigned int locked:1;
  71        /* has the block been processed */
  72        unsigned int processed:1;
  73        /* have backrefs of this block been checked */
  74        unsigned int checked:1;
  75        /*
  76         * 1 if corresponding block has been cowed but some upper
  77         * level block pointers may not point to the new location
  78         */
  79        unsigned int pending:1;
  80        /*
  81         * 1 if the backref node isn't connected to any other
  82         * backref node.
  83         */
  84        unsigned int detached:1;
  85};
  86
  87/*
  88 * present a block pointer in the backref cache
  89 */
  90struct backref_edge {
  91        struct list_head list[2];
  92        struct backref_node *node[2];
  93};
  94
  95#define LOWER   0
  96#define UPPER   1
  97
  98struct backref_cache {
  99        /* red black tree of all backref nodes in the cache */
 100        struct rb_root rb_root;
 101        /* for passing backref nodes to btrfs_reloc_cow_block */
 102        struct backref_node *path[BTRFS_MAX_LEVEL];
 103        /*
 104         * list of blocks that have been cowed but some block
 105         * pointers in upper level blocks may not reflect the
 106         * new location
 107         */
 108        struct list_head pending[BTRFS_MAX_LEVEL];
 109        /* list of backref nodes with no child node */
 110        struct list_head leaves;
 111        /* list of blocks that have been cowed in current transaction */
 112        struct list_head changed;
 113        /* list of detached backref node. */
 114        struct list_head detached;
 115
 116        u64 last_trans;
 117
 118        int nr_nodes;
 119        int nr_edges;
 120};
 121
 122/*
 123 * map address of tree root to tree
 124 */
 125struct mapping_node {
 126        struct rb_node rb_node;
 127        u64 bytenr;
 128        void *data;
 129};
 130
 131struct mapping_tree {
 132        struct rb_root rb_root;
 133        spinlock_t lock;
 134};
 135
 136/*
 137 * present a tree block to process
 138 */
 139struct tree_block {
 140        struct rb_node rb_node;
 141        u64 bytenr;
 142        struct btrfs_key key;
 143        unsigned int level:8;
 144        unsigned int key_ready:1;
 145};
 146
 147#define MAX_EXTENTS 128
 148
 149struct file_extent_cluster {
 150        u64 start;
 151        u64 end;
 152        u64 boundary[MAX_EXTENTS];
 153        unsigned int nr;
 154};
 155
 156struct reloc_control {
 157        /* block group to relocate */
 158        struct btrfs_block_group_cache *block_group;
 159        /* extent tree */
 160        struct btrfs_root *extent_root;
 161        /* inode for moving data */
 162        struct inode *data_inode;
 163
 164        struct btrfs_block_rsv *block_rsv;
 165
 166        struct backref_cache backref_cache;
 167
 168        struct file_extent_cluster cluster;
 169        /* tree blocks have been processed */
 170        struct extent_io_tree processed_blocks;
 171        /* map start of tree root to corresponding reloc tree */
 172        struct mapping_tree reloc_root_tree;
 173        /* list of reloc trees */
 174        struct list_head reloc_roots;
 175        /* size of metadata reservation for merging reloc trees */
 176        u64 merging_rsv_size;
 177        /* size of relocated tree nodes */
 178        u64 nodes_relocated;
 179
 180        u64 search_start;
 181        u64 extents_found;
 182
 183        unsigned int stage:8;
 184        unsigned int create_reloc_tree:1;
 185        unsigned int merge_reloc_tree:1;
 186        unsigned int found_file_extent:1;
 187        unsigned int commit_transaction:1;
 188};
 189
 190/* stages of data relocation */
 191#define MOVE_DATA_EXTENTS       0
 192#define UPDATE_DATA_PTRS        1
 193
 194static void remove_backref_node(struct backref_cache *cache,
 195                                struct backref_node *node);
 196static void __mark_block_processed(struct reloc_control *rc,
 197                                   struct backref_node *node);
 198
 199static void mapping_tree_init(struct mapping_tree *tree)
 200{
 201        tree->rb_root = RB_ROOT;
 202        spin_lock_init(&tree->lock);
 203}
 204
 205static void backref_cache_init(struct backref_cache *cache)
 206{
 207        int i;
 208        cache->rb_root = RB_ROOT;
 209        for (i = 0; i < BTRFS_MAX_LEVEL; i++)
 210                INIT_LIST_HEAD(&cache->pending[i]);
 211        INIT_LIST_HEAD(&cache->changed);
 212        INIT_LIST_HEAD(&cache->detached);
 213        INIT_LIST_HEAD(&cache->leaves);
 214}
 215
 216static void backref_cache_cleanup(struct backref_cache *cache)
 217{
 218        struct backref_node *node;
 219        int i;
 220
 221        while (!list_empty(&cache->detached)) {
 222                node = list_entry(cache->detached.next,
 223                                  struct backref_node, list);
 224                remove_backref_node(cache, node);
 225        }
 226
 227        while (!list_empty(&cache->leaves)) {
 228                node = list_entry(cache->leaves.next,
 229                                  struct backref_node, lower);
 230                remove_backref_node(cache, node);
 231        }
 232
 233        cache->last_trans = 0;
 234
 235        for (i = 0; i < BTRFS_MAX_LEVEL; i++)
 236                BUG_ON(!list_empty(&cache->pending[i]));
 237        BUG_ON(!list_empty(&cache->changed));
 238        BUG_ON(!list_empty(&cache->detached));
 239        BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
 240        BUG_ON(cache->nr_nodes);
 241        BUG_ON(cache->nr_edges);
 242}
 243
 244static struct backref_node *alloc_backref_node(struct backref_cache *cache)
 245{
 246        struct backref_node *node;
 247
 248        node = kzalloc(sizeof(*node), GFP_NOFS);
 249        if (node) {
 250                INIT_LIST_HEAD(&node->list);
 251                INIT_LIST_HEAD(&node->upper);
 252                INIT_LIST_HEAD(&node->lower);
 253                RB_CLEAR_NODE(&node->rb_node);
 254                cache->nr_nodes++;
 255        }
 256        return node;
 257}
 258
 259static void free_backref_node(struct backref_cache *cache,
 260                              struct backref_node *node)
 261{
 262        if (node) {
 263                cache->nr_nodes--;
 264                kfree(node);
 265        }
 266}
 267
 268static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
 269{
 270        struct backref_edge *edge;
 271
 272        edge = kzalloc(sizeof(*edge), GFP_NOFS);
 273        if (edge)
 274                cache->nr_edges++;
 275        return edge;
 276}
 277
 278static void free_backref_edge(struct backref_cache *cache,
 279                              struct backref_edge *edge)
 280{
 281        if (edge) {
 282                cache->nr_edges--;
 283                kfree(edge);
 284        }
 285}
 286
 287static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
 288                                   struct rb_node *node)
 289{
 290        struct rb_node **p = &root->rb_node;
 291        struct rb_node *parent = NULL;
 292        struct tree_entry *entry;
 293
 294        while (*p) {
 295                parent = *p;
 296                entry = rb_entry(parent, struct tree_entry, rb_node);
 297
 298                if (bytenr < entry->bytenr)
 299                        p = &(*p)->rb_left;
 300                else if (bytenr > entry->bytenr)
 301                        p = &(*p)->rb_right;
 302                else
 303                        return parent;
 304        }
 305
 306        rb_link_node(node, parent, p);
 307        rb_insert_color(node, root);
 308        return NULL;
 309}
 310
 311static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
 312{
 313        struct rb_node *n = root->rb_node;
 314        struct tree_entry *entry;
 315
 316        while (n) {
 317                entry = rb_entry(n, struct tree_entry, rb_node);
 318
 319                if (bytenr < entry->bytenr)
 320                        n = n->rb_left;
 321                else if (bytenr > entry->bytenr)
 322                        n = n->rb_right;
 323                else
 324                        return n;
 325        }
 326        return NULL;
 327}
 328
 329void backref_tree_panic(struct rb_node *rb_node, int errno,
 330                                          u64 bytenr)
 331{
 332
 333        struct btrfs_fs_info *fs_info = NULL;
 334        struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
 335                                              rb_node);
 336        if (bnode->root)
 337                fs_info = bnode->root->fs_info;
 338        btrfs_panic(fs_info, errno, "Inconsistency in backref cache "
 339                    "found at offset %llu\n", (unsigned long long)bytenr);
 340}
 341
 342/*
 343 * walk up backref nodes until reach node presents tree root
 344 */
 345static struct backref_node *walk_up_backref(struct backref_node *node,
 346                                            struct backref_edge *edges[],
 347                                            int *index)
 348{
 349        struct backref_edge *edge;
 350        int idx = *index;
 351
 352        while (!list_empty(&node->upper)) {
 353                edge = list_entry(node->upper.next,
 354                                  struct backref_edge, list[LOWER]);
 355                edges[idx++] = edge;
 356                node = edge->node[UPPER];
 357        }
 358        BUG_ON(node->detached);
 359        *index = idx;
 360        return node;
 361}
 362
 363/*
 364 * walk down backref nodes to find start of next reference path
 365 */
 366static struct backref_node *walk_down_backref(struct backref_edge *edges[],
 367                                              int *index)
 368{
 369        struct backref_edge *edge;
 370        struct backref_node *lower;
 371        int idx = *index;
 372
 373        while (idx > 0) {
 374                edge = edges[idx - 1];
 375                lower = edge->node[LOWER];
 376                if (list_is_last(&edge->list[LOWER], &lower->upper)) {
 377                        idx--;
 378                        continue;
 379                }
 380                edge = list_entry(edge->list[LOWER].next,
 381                                  struct backref_edge, list[LOWER]);
 382                edges[idx - 1] = edge;
 383                *index = idx;
 384                return edge->node[UPPER];
 385        }
 386        *index = 0;
 387        return NULL;
 388}
 389
 390static void unlock_node_buffer(struct backref_node *node)
 391{
 392        if (node->locked) {
 393                btrfs_tree_unlock(node->eb);
 394                node->locked = 0;
 395        }
 396}
 397
 398static void drop_node_buffer(struct backref_node *node)
 399{
 400        if (node->eb) {
 401                unlock_node_buffer(node);
 402                free_extent_buffer(node->eb);
 403                node->eb = NULL;
 404        }
 405}
 406
 407static void drop_backref_node(struct backref_cache *tree,
 408                              struct backref_node *node)
 409{
 410        BUG_ON(!list_empty(&node->upper));
 411
 412        drop_node_buffer(node);
 413        list_del(&node->list);
 414        list_del(&node->lower);
 415        if (!RB_EMPTY_NODE(&node->rb_node))
 416                rb_erase(&node->rb_node, &tree->rb_root);
 417        free_backref_node(tree, node);
 418}
 419
 420/*
 421 * remove a backref node from the backref cache
 422 */
 423static void remove_backref_node(struct backref_cache *cache,
 424                                struct backref_node *node)
 425{
 426        struct backref_node *upper;
 427        struct backref_edge *edge;
 428
 429        if (!node)
 430                return;
 431
 432        BUG_ON(!node->lowest && !node->detached);
 433        while (!list_empty(&node->upper)) {
 434                edge = list_entry(node->upper.next, struct backref_edge,
 435                                  list[LOWER]);
 436                upper = edge->node[UPPER];
 437                list_del(&edge->list[LOWER]);
 438                list_del(&edge->list[UPPER]);
 439                free_backref_edge(cache, edge);
 440
 441                if (RB_EMPTY_NODE(&upper->rb_node)) {
 442                        BUG_ON(!list_empty(&node->upper));
 443                        drop_backref_node(cache, node);
 444                        node = upper;
 445                        node->lowest = 1;
 446                        continue;
 447                }
 448                /*
 449                 * add the node to leaf node list if no other
 450                 * child block cached.
 451                 */
 452                if (list_empty(&upper->lower)) {
 453                        list_add_tail(&upper->lower, &cache->leaves);
 454                        upper->lowest = 1;
 455                }
 456        }
 457
 458        drop_backref_node(cache, node);
 459}
 460
 461static void update_backref_node(struct backref_cache *cache,
 462                                struct backref_node *node, u64 bytenr)
 463{
 464        struct rb_node *rb_node;
 465        rb_erase(&node->rb_node, &cache->rb_root);
 466        node->bytenr = bytenr;
 467        rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
 468        if (rb_node)
 469                backref_tree_panic(rb_node, -EEXIST, bytenr);
 470}
 471
 472/*
 473 * update backref cache after a transaction commit
 474 */
 475static int update_backref_cache(struct btrfs_trans_handle *trans,
 476                                struct backref_cache *cache)
 477{
 478        struct backref_node *node;
 479        int level = 0;
 480
 481        if (cache->last_trans == 0) {
 482                cache->last_trans = trans->transid;
 483                return 0;
 484        }
 485
 486        if (cache->last_trans == trans->transid)
 487                return 0;
 488
 489        /*
 490         * detached nodes are used to avoid unnecessary backref
 491         * lookup. transaction commit changes the extent tree.
 492         * so the detached nodes are no longer useful.
 493         */
 494        while (!list_empty(&cache->detached)) {
 495                node = list_entry(cache->detached.next,
 496                                  struct backref_node, list);
 497                remove_backref_node(cache, node);
 498        }
 499
 500        while (!list_empty(&cache->changed)) {
 501                node = list_entry(cache->changed.next,
 502                                  struct backref_node, list);
 503                list_del_init(&node->list);
 504                BUG_ON(node->pending);
 505                update_backref_node(cache, node, node->new_bytenr);
 506        }
 507
 508        /*
 509         * some nodes can be left in the pending list if there were
 510         * errors during processing the pending nodes.
 511         */
 512        for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
 513                list_for_each_entry(node, &cache->pending[level], list) {
 514                        BUG_ON(!node->pending);
 515                        if (node->bytenr == node->new_bytenr)
 516                                continue;
 517                        update_backref_node(cache, node, node->new_bytenr);
 518                }
 519        }
 520
 521        cache->last_trans = 0;
 522        return 1;
 523}
 524
 525
 526static int should_ignore_root(struct btrfs_root *root)
 527{
 528        struct btrfs_root *reloc_root;
 529
 530        if (!root->ref_cows)
 531                return 0;
 532
 533        reloc_root = root->reloc_root;
 534        if (!reloc_root)
 535                return 0;
 536
 537        if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
 538            root->fs_info->running_transaction->transid - 1)
 539                return 0;
 540        /*
 541         * if there is reloc tree and it was created in previous
 542         * transaction backref lookup can find the reloc tree,
 543         * so backref node for the fs tree root is useless for
 544         * relocation.
 545         */
 546        return 1;
 547}
 548/*
 549 * find reloc tree by address of tree root
 550 */
 551static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
 552                                          u64 bytenr)
 553{
 554        struct rb_node *rb_node;
 555        struct mapping_node *node;
 556        struct btrfs_root *root = NULL;
 557
 558        spin_lock(&rc->reloc_root_tree.lock);
 559        rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
 560        if (rb_node) {
 561                node = rb_entry(rb_node, struct mapping_node, rb_node);
 562                root = (struct btrfs_root *)node->data;
 563        }
 564        spin_unlock(&rc->reloc_root_tree.lock);
 565        return root;
 566}
 567
 568static int is_cowonly_root(u64 root_objectid)
 569{
 570        if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
 571            root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
 572            root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
 573            root_objectid == BTRFS_DEV_TREE_OBJECTID ||
 574            root_objectid == BTRFS_TREE_LOG_OBJECTID ||
 575            root_objectid == BTRFS_CSUM_TREE_OBJECTID)
 576                return 1;
 577        return 0;
 578}
 579
 580static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
 581                                        u64 root_objectid)
 582{
 583        struct btrfs_key key;
 584
 585        key.objectid = root_objectid;
 586        key.type = BTRFS_ROOT_ITEM_KEY;
 587        if (is_cowonly_root(root_objectid))
 588                key.offset = 0;
 589        else
 590                key.offset = (u64)-1;
 591
 592        return btrfs_read_fs_root_no_name(fs_info, &key);
 593}
 594
 595#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 596static noinline_for_stack
 597struct btrfs_root *find_tree_root(struct reloc_control *rc,
 598                                  struct extent_buffer *leaf,
 599                                  struct btrfs_extent_ref_v0 *ref0)
 600{
 601        struct btrfs_root *root;
 602        u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
 603        u64 generation = btrfs_ref_generation_v0(leaf, ref0);
 604
 605        BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
 606
 607        root = read_fs_root(rc->extent_root->fs_info, root_objectid);
 608        BUG_ON(IS_ERR(root));
 609
 610        if (root->ref_cows &&
 611            generation != btrfs_root_generation(&root->root_item))
 612                return NULL;
 613
 614        return root;
 615}
 616#endif
 617
 618static noinline_for_stack
 619int find_inline_backref(struct extent_buffer *leaf, int slot,
 620                        unsigned long *ptr, unsigned long *end)
 621{
 622        struct btrfs_extent_item *ei;
 623        struct btrfs_tree_block_info *bi;
 624        u32 item_size;
 625
 626        item_size = btrfs_item_size_nr(leaf, slot);
 627#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 628        if (item_size < sizeof(*ei)) {
 629                WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
 630                return 1;
 631        }
 632#endif
 633        ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
 634        WARN_ON(!(btrfs_extent_flags(leaf, ei) &
 635                  BTRFS_EXTENT_FLAG_TREE_BLOCK));
 636
 637        if (item_size <= sizeof(*ei) + sizeof(*bi)) {
 638                WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
 639                return 1;
 640        }
 641
 642        bi = (struct btrfs_tree_block_info *)(ei + 1);
 643        *ptr = (unsigned long)(bi + 1);
 644        *end = (unsigned long)ei + item_size;
 645        return 0;
 646}
 647
 648/*
 649 * build backref tree for a given tree block. root of the backref tree
 650 * corresponds the tree block, leaves of the backref tree correspond
 651 * roots of b-trees that reference the tree block.
 652 *
 653 * the basic idea of this function is check backrefs of a given block
 654 * to find upper level blocks that refernece the block, and then check
 655 * bakcrefs of these upper level blocks recursively. the recursion stop
 656 * when tree root is reached or backrefs for the block is cached.
 657 *
 658 * NOTE: if we find backrefs for a block are cached, we know backrefs
 659 * for all upper level blocks that directly/indirectly reference the
 660 * block are also cached.
 661 */
 662static noinline_for_stack
 663struct backref_node *build_backref_tree(struct reloc_control *rc,
 664                                        struct btrfs_key *node_key,
 665                                        int level, u64 bytenr)
 666{
 667        struct backref_cache *cache = &rc->backref_cache;
 668        struct btrfs_path *path1;
 669        struct btrfs_path *path2;
 670        struct extent_buffer *eb;
 671        struct btrfs_root *root;
 672        struct backref_node *cur;
 673        struct backref_node *upper;
 674        struct backref_node *lower;
 675        struct backref_node *node = NULL;
 676        struct backref_node *exist = NULL;
 677        struct backref_edge *edge;
 678        struct rb_node *rb_node;
 679        struct btrfs_key key;
 680        unsigned long end;
 681        unsigned long ptr;
 682        LIST_HEAD(list);
 683        LIST_HEAD(useless);
 684        int cowonly;
 685        int ret;
 686        int err = 0;
 687
 688        path1 = btrfs_alloc_path();
 689        path2 = btrfs_alloc_path();
 690        if (!path1 || !path2) {
 691                err = -ENOMEM;
 692                goto out;
 693        }
 694        path1->reada = 1;
 695        path2->reada = 2;
 696
 697        node = alloc_backref_node(cache);
 698        if (!node) {
 699                err = -ENOMEM;
 700                goto out;
 701        }
 702
 703        node->bytenr = bytenr;
 704        node->level = level;
 705        node->lowest = 1;
 706        cur = node;
 707again:
 708        end = 0;
 709        ptr = 0;
 710        key.objectid = cur->bytenr;
 711        key.type = BTRFS_EXTENT_ITEM_KEY;
 712        key.offset = (u64)-1;
 713
 714        path1->search_commit_root = 1;
 715        path1->skip_locking = 1;
 716        ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
 717                                0, 0);
 718        if (ret < 0) {
 719                err = ret;
 720                goto out;
 721        }
 722        BUG_ON(!ret || !path1->slots[0]);
 723
 724        path1->slots[0]--;
 725
 726        WARN_ON(cur->checked);
 727        if (!list_empty(&cur->upper)) {
 728                /*
 729                 * the backref was added previously when processing
 730                 * backref of type BTRFS_TREE_BLOCK_REF_KEY
 731                 */
 732                BUG_ON(!list_is_singular(&cur->upper));
 733                edge = list_entry(cur->upper.next, struct backref_edge,
 734                                  list[LOWER]);
 735                BUG_ON(!list_empty(&edge->list[UPPER]));
 736                exist = edge->node[UPPER];
 737                /*
 738                 * add the upper level block to pending list if we need
 739                 * check its backrefs
 740                 */
 741                if (!exist->checked)
 742                        list_add_tail(&edge->list[UPPER], &list);
 743        } else {
 744                exist = NULL;
 745        }
 746
 747        while (1) {
 748                cond_resched();
 749                eb = path1->nodes[0];
 750
 751                if (ptr >= end) {
 752                        if (path1->slots[0] >= btrfs_header_nritems(eb)) {
 753                                ret = btrfs_next_leaf(rc->extent_root, path1);
 754                                if (ret < 0) {
 755                                        err = ret;
 756                                        goto out;
 757                                }
 758                                if (ret > 0)
 759                                        break;
 760                                eb = path1->nodes[0];
 761                        }
 762
 763                        btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
 764                        if (key.objectid != cur->bytenr) {
 765                                WARN_ON(exist);
 766                                break;
 767                        }
 768
 769                        if (key.type == BTRFS_EXTENT_ITEM_KEY) {
 770                                ret = find_inline_backref(eb, path1->slots[0],
 771                                                          &ptr, &end);
 772                                if (ret)
 773                                        goto next;
 774                        }
 775                }
 776
 777                if (ptr < end) {
 778                        /* update key for inline back ref */
 779                        struct btrfs_extent_inline_ref *iref;
 780                        iref = (struct btrfs_extent_inline_ref *)ptr;
 781                        key.type = btrfs_extent_inline_ref_type(eb, iref);
 782                        key.offset = btrfs_extent_inline_ref_offset(eb, iref);
 783                        WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
 784                                key.type != BTRFS_SHARED_BLOCK_REF_KEY);
 785                }
 786
 787                if (exist &&
 788                    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
 789                      exist->owner == key.offset) ||
 790                     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
 791                      exist->bytenr == key.offset))) {
 792                        exist = NULL;
 793                        goto next;
 794                }
 795
 796#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 797                if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
 798                    key.type == BTRFS_EXTENT_REF_V0_KEY) {
 799                        if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
 800                                struct btrfs_extent_ref_v0 *ref0;
 801                                ref0 = btrfs_item_ptr(eb, path1->slots[0],
 802                                                struct btrfs_extent_ref_v0);
 803                                if (key.objectid == key.offset) {
 804                                        root = find_tree_root(rc, eb, ref0);
 805                                        if (root && !should_ignore_root(root))
 806                                                cur->root = root;
 807                                        else
 808                                                list_add(&cur->list, &useless);
 809                                        break;
 810                                }
 811                                if (is_cowonly_root(btrfs_ref_root_v0(eb,
 812                                                                      ref0)))
 813                                        cur->cowonly = 1;
 814                        }
 815#else
 816                BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
 817                if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
 818#endif
 819                        if (key.objectid == key.offset) {
 820                                /*
 821                                 * only root blocks of reloc trees use
 822                                 * backref of this type.
 823                                 */
 824                                root = find_reloc_root(rc, cur->bytenr);
 825                                BUG_ON(!root);
 826                                cur->root = root;
 827                                break;
 828                        }
 829
 830                        edge = alloc_backref_edge(cache);
 831                        if (!edge) {
 832                                err = -ENOMEM;
 833                                goto out;
 834                        }
 835                        rb_node = tree_search(&cache->rb_root, key.offset);
 836                        if (!rb_node) {
 837                                upper = alloc_backref_node(cache);
 838                                if (!upper) {
 839                                        free_backref_edge(cache, edge);
 840                                        err = -ENOMEM;
 841                                        goto out;
 842                                }
 843                                upper->bytenr = key.offset;
 844                                upper->level = cur->level + 1;
 845                                /*
 846                                 *  backrefs for the upper level block isn't
 847                                 *  cached, add the block to pending list
 848                                 */
 849                                list_add_tail(&edge->list[UPPER], &list);
 850                        } else {
 851                                upper = rb_entry(rb_node, struct backref_node,
 852                                                 rb_node);
 853                                BUG_ON(!upper->checked);
 854                                INIT_LIST_HEAD(&edge->list[UPPER]);
 855                        }
 856                        list_add_tail(&edge->list[LOWER], &cur->upper);
 857                        edge->node[LOWER] = cur;
 858                        edge->node[UPPER] = upper;
 859
 860                        goto next;
 861                } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
 862                        goto next;
 863                }
 864
 865                /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
 866                root = read_fs_root(rc->extent_root->fs_info, key.offset);
 867                if (IS_ERR(root)) {
 868                        err = PTR_ERR(root);
 869                        goto out;
 870                }
 871
 872                if (!root->ref_cows)
 873                        cur->cowonly = 1;
 874
 875                if (btrfs_root_level(&root->root_item) == cur->level) {
 876                        /* tree root */
 877                        BUG_ON(btrfs_root_bytenr(&root->root_item) !=
 878                               cur->bytenr);
 879                        if (should_ignore_root(root))
 880                                list_add(&cur->list, &useless);
 881                        else
 882                                cur->root = root;
 883                        break;
 884                }
 885
 886                level = cur->level + 1;
 887
 888                /*
 889                 * searching the tree to find upper level blocks
 890                 * reference the block.
 891                 */
 892                path2->search_commit_root = 1;
 893                path2->skip_locking = 1;
 894                path2->lowest_level = level;
 895                ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
 896                path2->lowest_level = 0;
 897                if (ret < 0) {
 898                        err = ret;
 899                        goto out;
 900                }
 901                if (ret > 0 && path2->slots[level] > 0)
 902                        path2->slots[level]--;
 903
 904                eb = path2->nodes[level];
 905                WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
 906                        cur->bytenr);
 907
 908                lower = cur;
 909                for (; level < BTRFS_MAX_LEVEL; level++) {
 910                        if (!path2->nodes[level]) {
 911                                BUG_ON(btrfs_root_bytenr(&root->root_item) !=
 912                                       lower->bytenr);
 913                                if (should_ignore_root(root))
 914                                        list_add(&lower->list, &useless);
 915                                else
 916                                        lower->root = root;
 917                                break;
 918                        }
 919
 920                        edge = alloc_backref_edge(cache);
 921                        if (!edge) {
 922                                err = -ENOMEM;
 923                                goto out;
 924                        }
 925
 926                        eb = path2->nodes[level];
 927                        rb_node = tree_search(&cache->rb_root, eb->start);
 928                        if (!rb_node) {
 929                                upper = alloc_backref_node(cache);
 930                                if (!upper) {
 931                                        free_backref_edge(cache, edge);
 932                                        err = -ENOMEM;
 933                                        goto out;
 934                                }
 935                                upper->bytenr = eb->start;
 936                                upper->owner = btrfs_header_owner(eb);
 937                                upper->level = lower->level + 1;
 938                                if (!root->ref_cows)
 939                                        upper->cowonly = 1;
 940
 941                                /*
 942                                 * if we know the block isn't shared
 943                                 * we can void checking its backrefs.
 944                                 */
 945                                if (btrfs_block_can_be_shared(root, eb))
 946                                        upper->checked = 0;
 947                                else
 948                                        upper->checked = 1;
 949
 950                                /*
 951                                 * add the block to pending list if we
 952                                 * need check its backrefs. only block
 953                                 * at 'cur->level + 1' is added to the
 954                                 * tail of pending list. this guarantees
 955                                 * we check backrefs from lower level
 956                                 * blocks to upper level blocks.
 957                                 */
 958                                if (!upper->checked &&
 959                                    level == cur->level + 1) {
 960                                        list_add_tail(&edge->list[UPPER],
 961                                                      &list);
 962                                } else
 963                                        INIT_LIST_HEAD(&edge->list[UPPER]);
 964                        } else {
 965                                upper = rb_entry(rb_node, struct backref_node,
 966                                                 rb_node);
 967                                BUG_ON(!upper->checked);
 968                                INIT_LIST_HEAD(&edge->list[UPPER]);
 969                                if (!upper->owner)
 970                                        upper->owner = btrfs_header_owner(eb);
 971                        }
 972                        list_add_tail(&edge->list[LOWER], &lower->upper);
 973                        edge->node[LOWER] = lower;
 974                        edge->node[UPPER] = upper;
 975
 976                        if (rb_node)
 977                                break;
 978                        lower = upper;
 979                        upper = NULL;
 980                }
 981                btrfs_release_path(path2);
 982next:
 983                if (ptr < end) {
 984                        ptr += btrfs_extent_inline_ref_size(key.type);
 985                        if (ptr >= end) {
 986                                WARN_ON(ptr > end);
 987                                ptr = 0;
 988                                end = 0;
 989                        }
 990                }
 991                if (ptr >= end)
 992                        path1->slots[0]++;
 993        }
 994        btrfs_release_path(path1);
 995
 996        cur->checked = 1;
 997        WARN_ON(exist);
 998
 999        /* the pending list isn't empty, take the first block to process */
1000        if (!list_empty(&list)) {
1001                edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1002                list_del_init(&edge->list[UPPER]);
1003                cur = edge->node[UPPER];
1004                goto again;
1005        }
1006
1007        /*
1008         * everything goes well, connect backref nodes and insert backref nodes
1009         * into the cache.
1010         */
1011        BUG_ON(!node->checked);
1012        cowonly = node->cowonly;
1013        if (!cowonly) {
1014                rb_node = tree_insert(&cache->rb_root, node->bytenr,
1015                                      &node->rb_node);
1016                if (rb_node)
1017                        backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1018                list_add_tail(&node->lower, &cache->leaves);
1019        }
1020
1021        list_for_each_entry(edge, &node->upper, list[LOWER])
1022                list_add_tail(&edge->list[UPPER], &list);
1023
1024        while (!list_empty(&list)) {
1025                edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1026                list_del_init(&edge->list[UPPER]);
1027                upper = edge->node[UPPER];
1028                if (upper->detached) {
1029                        list_del(&edge->list[LOWER]);
1030                        lower = edge->node[LOWER];
1031                        free_backref_edge(cache, edge);
1032                        if (list_empty(&lower->upper))
1033                                list_add(&lower->list, &useless);
1034                        continue;
1035                }
1036
1037                if (!RB_EMPTY_NODE(&upper->rb_node)) {
1038                        if (upper->lowest) {
1039                                list_del_init(&upper->lower);
1040                                upper->lowest = 0;
1041                        }
1042
1043                        list_add_tail(&edge->list[UPPER], &upper->lower);
1044                        continue;
1045                }
1046
1047                BUG_ON(!upper->checked);
1048                BUG_ON(cowonly != upper->cowonly);
1049                if (!cowonly) {
1050                        rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1051                                              &upper->rb_node);
1052                        if (rb_node)
1053                                backref_tree_panic(rb_node, -EEXIST,
1054                                                   upper->bytenr);
1055                }
1056
1057                list_add_tail(&edge->list[UPPER], &upper->lower);
1058
1059                list_for_each_entry(edge, &upper->upper, list[LOWER])
1060                        list_add_tail(&edge->list[UPPER], &list);
1061        }
1062        /*
1063         * process useless backref nodes. backref nodes for tree leaves
1064         * are deleted from the cache. backref nodes for upper level
1065         * tree blocks are left in the cache to avoid unnecessary backref
1066         * lookup.
1067         */
1068        while (!list_empty(&useless)) {
1069                upper = list_entry(useless.next, struct backref_node, list);
1070                list_del_init(&upper->list);
1071                BUG_ON(!list_empty(&upper->upper));
1072                if (upper == node)
1073                        node = NULL;
1074                if (upper->lowest) {
1075                        list_del_init(&upper->lower);
1076                        upper->lowest = 0;
1077                }
1078                while (!list_empty(&upper->lower)) {
1079                        edge = list_entry(upper->lower.next,
1080                                          struct backref_edge, list[UPPER]);
1081                        list_del(&edge->list[UPPER]);
1082                        list_del(&edge->list[LOWER]);
1083                        lower = edge->node[LOWER];
1084                        free_backref_edge(cache, edge);
1085
1086                        if (list_empty(&lower->upper))
1087                                list_add(&lower->list, &useless);
1088                }
1089                __mark_block_processed(rc, upper);
1090                if (upper->level > 0) {
1091                        list_add(&upper->list, &cache->detached);
1092                        upper->detached = 1;
1093                } else {
1094                        rb_erase(&upper->rb_node, &cache->rb_root);
1095                        free_backref_node(cache, upper);
1096                }
1097        }
1098out:
1099        btrfs_free_path(path1);
1100        btrfs_free_path(path2);
1101        if (err) {
1102                while (!list_empty(&useless)) {
1103                        lower = list_entry(useless.next,
1104                                           struct backref_node, upper);
1105                        list_del_init(&lower->upper);
1106                }
1107                upper = node;
1108                INIT_LIST_HEAD(&list);
1109                while (upper) {
1110                        if (RB_EMPTY_NODE(&upper->rb_node)) {
1111                                list_splice_tail(&upper->upper, &list);
1112                                free_backref_node(cache, upper);
1113                        }
1114
1115                        if (list_empty(&list))
1116                                break;
1117
1118                        edge = list_entry(list.next, struct backref_edge,
1119                                          list[LOWER]);
1120                        list_del(&edge->list[LOWER]);
1121                        upper = edge->node[UPPER];
1122                        free_backref_edge(cache, edge);
1123                }
1124                return ERR_PTR(err);
1125        }
1126        BUG_ON(node && node->detached);
1127        return node;
1128}
1129
1130/*
1131 * helper to add backref node for the newly created snapshot.
1132 * the backref node is created by cloning backref node that
1133 * corresponds to root of source tree
1134 */
1135static int clone_backref_node(struct btrfs_trans_handle *trans,
1136                              struct reloc_control *rc,
1137                              struct btrfs_root *src,
1138                              struct btrfs_root *dest)
1139{
1140        struct btrfs_root *reloc_root = src->reloc_root;
1141        struct backref_cache *cache = &rc->backref_cache;
1142        struct backref_node *node = NULL;
1143        struct backref_node *new_node;
1144        struct backref_edge *edge;
1145        struct backref_edge *new_edge;
1146        struct rb_node *rb_node;
1147
1148        if (cache->last_trans > 0)
1149                update_backref_cache(trans, cache);
1150
1151        rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1152        if (rb_node) {
1153                node = rb_entry(rb_node, struct backref_node, rb_node);
1154                if (node->detached)
1155                        node = NULL;
1156                else
1157                        BUG_ON(node->new_bytenr != reloc_root->node->start);
1158        }
1159
1160        if (!node) {
1161                rb_node = tree_search(&cache->rb_root,
1162                                      reloc_root->commit_root->start);
1163                if (rb_node) {
1164                        node = rb_entry(rb_node, struct backref_node,
1165                                        rb_node);
1166                        BUG_ON(node->detached);
1167                }
1168        }
1169
1170        if (!node)
1171                return 0;
1172
1173        new_node = alloc_backref_node(cache);
1174        if (!new_node)
1175                return -ENOMEM;
1176
1177        new_node->bytenr = dest->node->start;
1178        new_node->level = node->level;
1179        new_node->lowest = node->lowest;
1180        new_node->checked = 1;
1181        new_node->root = dest;
1182
1183        if (!node->lowest) {
1184                list_for_each_entry(edge, &node->lower, list[UPPER]) {
1185                        new_edge = alloc_backref_edge(cache);
1186                        if (!new_edge)
1187                                goto fail;
1188
1189                        new_edge->node[UPPER] = new_node;
1190                        new_edge->node[LOWER] = edge->node[LOWER];
1191                        list_add_tail(&new_edge->list[UPPER],
1192                                      &new_node->lower);
1193                }
1194        } else {
1195                list_add_tail(&new_node->lower, &cache->leaves);
1196        }
1197
1198        rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1199                              &new_node->rb_node);
1200        if (rb_node)
1201                backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
1202
1203        if (!new_node->lowest) {
1204                list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1205                        list_add_tail(&new_edge->list[LOWER],
1206                                      &new_edge->node[LOWER]->upper);
1207                }
1208        }
1209        return 0;
1210fail:
1211        while (!list_empty(&new_node->lower)) {
1212                new_edge = list_entry(new_node->lower.next,
1213                                      struct backref_edge, list[UPPER]);
1214                list_del(&new_edge->list[UPPER]);
1215                free_backref_edge(cache, new_edge);
1216        }
1217        free_backref_node(cache, new_node);
1218        return -ENOMEM;
1219}
1220
1221/*
1222 * helper to add 'address of tree root -> reloc tree' mapping
1223 */
1224static int __must_check __add_reloc_root(struct btrfs_root *root)
1225{
1226        struct rb_node *rb_node;
1227        struct mapping_node *node;
1228        struct reloc_control *rc = root->fs_info->reloc_ctl;
1229
1230        node = kmalloc(sizeof(*node), GFP_NOFS);
1231        if (!node)
1232                return -ENOMEM;
1233
1234        node->bytenr = root->node->start;
1235        node->data = root;
1236
1237        spin_lock(&rc->reloc_root_tree.lock);
1238        rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1239                              node->bytenr, &node->rb_node);
1240        spin_unlock(&rc->reloc_root_tree.lock);
1241        if (rb_node) {
1242                btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
1243                            "for start=%llu while inserting into relocation "
1244                            "tree\n", node->bytenr);
1245                kfree(node);
1246                return -EEXIST;
1247        }
1248
1249        list_add_tail(&root->root_list, &rc->reloc_roots);
1250        return 0;
1251}
1252
1253/*
1254 * helper to update/delete the 'address of tree root -> reloc tree'
1255 * mapping
1256 */
1257static int __update_reloc_root(struct btrfs_root *root, int del)
1258{
1259        struct rb_node *rb_node;
1260        struct mapping_node *node = NULL;
1261        struct reloc_control *rc = root->fs_info->reloc_ctl;
1262
1263        spin_lock(&rc->reloc_root_tree.lock);
1264        rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1265                              root->commit_root->start);
1266        if (rb_node) {
1267                node = rb_entry(rb_node, struct mapping_node, rb_node);
1268                rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1269        }
1270        spin_unlock(&rc->reloc_root_tree.lock);
1271
1272        BUG_ON((struct btrfs_root *)node->data != root);
1273
1274        if (!del) {
1275                spin_lock(&rc->reloc_root_tree.lock);
1276                node->bytenr = root->node->start;
1277                rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1278                                      node->bytenr, &node->rb_node);
1279                spin_unlock(&rc->reloc_root_tree.lock);
1280                if (rb_node)
1281                        backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1282        } else {
1283                spin_lock(&root->fs_info->trans_lock);
1284                list_del_init(&root->root_list);
1285                spin_unlock(&root->fs_info->trans_lock);
1286                kfree(node);
1287        }
1288        return 0;
1289}
1290
1291static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1292                                        struct btrfs_root *root, u64 objectid)
1293{
1294        struct btrfs_root *reloc_root;
1295        struct extent_buffer *eb;
1296        struct btrfs_root_item *root_item;
1297        struct btrfs_key root_key;
1298        int ret;
1299
1300        root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
1301        BUG_ON(!root_item);
1302
1303        root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
1304        root_key.type = BTRFS_ROOT_ITEM_KEY;
1305        root_key.offset = objectid;
1306
1307        if (root->root_key.objectid == objectid) {
1308                /* called by btrfs_init_reloc_root */
1309                ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1310                                      BTRFS_TREE_RELOC_OBJECTID);
1311                BUG_ON(ret);
1312
1313                btrfs_set_root_last_snapshot(&root->root_item,
1314                                             trans->transid - 1);
1315        } else {
1316                /*
1317                 * called by btrfs_reloc_post_snapshot_hook.
1318                 * the source tree is a reloc tree, all tree blocks
1319                 * modified after it was created have RELOC flag
1320                 * set in their headers. so it's OK to not update
1321                 * the 'last_snapshot'.
1322                 */
1323                ret = btrfs_copy_root(trans, root, root->node, &eb,
1324                                      BTRFS_TREE_RELOC_OBJECTID);
1325                BUG_ON(ret);
1326        }
1327
1328        memcpy(root_item, &root->root_item, sizeof(*root_item));
1329        btrfs_set_root_bytenr(root_item, eb->start);
1330        btrfs_set_root_level(root_item, btrfs_header_level(eb));
1331        btrfs_set_root_generation(root_item, trans->transid);
1332
1333        if (root->root_key.objectid == objectid) {
1334                btrfs_set_root_refs(root_item, 0);
1335                memset(&root_item->drop_progress, 0,
1336                       sizeof(struct btrfs_disk_key));
1337                root_item->drop_level = 0;
1338        }
1339
1340        btrfs_tree_unlock(eb);
1341        free_extent_buffer(eb);
1342
1343        ret = btrfs_insert_root(trans, root->fs_info->tree_root,
1344                                &root_key, root_item);
1345        BUG_ON(ret);
1346        kfree(root_item);
1347
1348        reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
1349                                                 &root_key);
1350        BUG_ON(IS_ERR(reloc_root));
1351        reloc_root->last_trans = trans->transid;
1352        return reloc_root;
1353}
1354
1355/*
1356 * create reloc tree for a given fs tree. reloc tree is just a
1357 * snapshot of the fs tree with special root objectid.
1358 */
1359int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1360                          struct btrfs_root *root)
1361{
1362        struct btrfs_root *reloc_root;
1363        struct reloc_control *rc = root->fs_info->reloc_ctl;
1364        int clear_rsv = 0;
1365        int ret;
1366
1367        if (root->reloc_root) {
1368                reloc_root = root->reloc_root;
1369                reloc_root->last_trans = trans->transid;
1370                return 0;
1371        }
1372
1373        if (!rc || !rc->create_reloc_tree ||
1374            root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1375                return 0;
1376
1377        if (!trans->block_rsv) {
1378                trans->block_rsv = rc->block_rsv;
1379                clear_rsv = 1;
1380        }
1381        reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1382        if (clear_rsv)
1383                trans->block_rsv = NULL;
1384
1385        ret = __add_reloc_root(reloc_root);
1386        BUG_ON(ret < 0);
1387        root->reloc_root = reloc_root;
1388        return 0;
1389}
1390
1391/*
1392 * update root item of reloc tree
1393 */
1394int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1395                            struct btrfs_root *root)
1396{
1397        struct btrfs_root *reloc_root;
1398        struct btrfs_root_item *root_item;
1399        int del = 0;
1400        int ret;
1401
1402        if (!root->reloc_root)
1403                goto out;
1404
1405        reloc_root = root->reloc_root;
1406        root_item = &reloc_root->root_item;
1407
1408        if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1409            btrfs_root_refs(root_item) == 0) {
1410                root->reloc_root = NULL;
1411                del = 1;
1412        }
1413
1414        __update_reloc_root(reloc_root, del);
1415
1416        if (reloc_root->commit_root != reloc_root->node) {
1417                btrfs_set_root_node(root_item, reloc_root->node);
1418                free_extent_buffer(reloc_root->commit_root);
1419                reloc_root->commit_root = btrfs_root_node(reloc_root);
1420        }
1421
1422        ret = btrfs_update_root(trans, root->fs_info->tree_root,
1423                                &reloc_root->root_key, root_item);
1424        BUG_ON(ret);
1425
1426out:
1427        return 0;
1428}
1429
1430/*
1431 * helper to find first cached inode with inode number >= objectid
1432 * in a subvolume
1433 */
1434static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1435{
1436        struct rb_node *node;
1437        struct rb_node *prev;
1438        struct btrfs_inode *entry;
1439        struct inode *inode;
1440
1441        spin_lock(&root->inode_lock);
1442again:
1443        node = root->inode_tree.rb_node;
1444        prev = NULL;
1445        while (node) {
1446                prev = node;
1447                entry = rb_entry(node, struct btrfs_inode, rb_node);
1448
1449                if (objectid < btrfs_ino(&entry->vfs_inode))
1450                        node = node->rb_left;
1451                else if (objectid > btrfs_ino(&entry->vfs_inode))
1452                        node = node->rb_right;
1453                else
1454                        break;
1455        }
1456        if (!node) {
1457                while (prev) {
1458                        entry = rb_entry(prev, struct btrfs_inode, rb_node);
1459                        if (objectid <= btrfs_ino(&entry->vfs_inode)) {
1460                                node = prev;
1461                                break;
1462                        }
1463                        prev = rb_next(prev);
1464                }
1465        }
1466        while (node) {
1467                entry = rb_entry(node, struct btrfs_inode, rb_node);
1468                inode = igrab(&entry->vfs_inode);
1469                if (inode) {
1470                        spin_unlock(&root->inode_lock);
1471                        return inode;
1472                }
1473
1474                objectid = btrfs_ino(&entry->vfs_inode) + 1;
1475                if (cond_resched_lock(&root->inode_lock))
1476                        goto again;
1477
1478                node = rb_next(node);
1479        }
1480        spin_unlock(&root->inode_lock);
1481        return NULL;
1482}
1483
1484static int in_block_group(u64 bytenr,
1485                          struct btrfs_block_group_cache *block_group)
1486{
1487        if (bytenr >= block_group->key.objectid &&
1488            bytenr < block_group->key.objectid + block_group->key.offset)
1489                return 1;
1490        return 0;
1491}
1492
1493/*
1494 * get new location of data
1495 */
1496static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1497                            u64 bytenr, u64 num_bytes)
1498{
1499        struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1500        struct btrfs_path *path;
1501        struct btrfs_file_extent_item *fi;
1502        struct extent_buffer *leaf;
1503        int ret;
1504
1505        path = btrfs_alloc_path();
1506        if (!path)
1507                return -ENOMEM;
1508
1509        bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1510        ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode),
1511                                       bytenr, 0);
1512        if (ret < 0)
1513                goto out;
1514        if (ret > 0) {
1515                ret = -ENOENT;
1516                goto out;
1517        }
1518
1519        leaf = path->nodes[0];
1520        fi = btrfs_item_ptr(leaf, path->slots[0],
1521                            struct btrfs_file_extent_item);
1522
1523        BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1524               btrfs_file_extent_compression(leaf, fi) ||
1525               btrfs_file_extent_encryption(leaf, fi) ||
1526               btrfs_file_extent_other_encoding(leaf, fi));
1527
1528        if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1529                ret = 1;
1530                goto out;
1531        }
1532
1533        *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1534        ret = 0;
1535out:
1536        btrfs_free_path(path);
1537        return ret;
1538}
1539
1540/*
1541 * update file extent items in the tree leaf to point to
1542 * the new locations.
1543 */
1544static noinline_for_stack
1545int replace_file_extents(struct btrfs_trans_handle *trans,
1546                         struct reloc_control *rc,
1547                         struct btrfs_root *root,
1548                         struct extent_buffer *leaf)
1549{
1550        struct btrfs_key key;
1551        struct btrfs_file_extent_item *fi;
1552        struct inode *inode = NULL;
1553        u64 parent;
1554        u64 bytenr;
1555        u64 new_bytenr = 0;
1556        u64 num_bytes;
1557        u64 end;
1558        u32 nritems;
1559        u32 i;
1560        int ret;
1561        int first = 1;
1562        int dirty = 0;
1563
1564        if (rc->stage != UPDATE_DATA_PTRS)
1565                return 0;
1566
1567        /* reloc trees always use full backref */
1568        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1569                parent = leaf->start;
1570        else
1571                parent = 0;
1572
1573        nritems = btrfs_header_nritems(leaf);
1574        for (i = 0; i < nritems; i++) {
1575                cond_resched();
1576                btrfs_item_key_to_cpu(leaf, &key, i);
1577                if (key.type != BTRFS_EXTENT_DATA_KEY)
1578                        continue;
1579                fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1580                if (btrfs_file_extent_type(leaf, fi) ==
1581                    BTRFS_FILE_EXTENT_INLINE)
1582                        continue;
1583                bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1584                num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1585                if (bytenr == 0)
1586                        continue;
1587                if (!in_block_group(bytenr, rc->block_group))
1588                        continue;
1589
1590                /*
1591                 * if we are modifying block in fs tree, wait for readpage
1592                 * to complete and drop the extent cache
1593                 */
1594                if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1595                        if (first) {
1596                                inode = find_next_inode(root, key.objectid);
1597                                first = 0;
1598                        } else if (inode && btrfs_ino(inode) < key.objectid) {
1599                                btrfs_add_delayed_iput(inode);
1600                                inode = find_next_inode(root, key.objectid);
1601                        }
1602                        if (inode && btrfs_ino(inode) == key.objectid) {
1603                                end = key.offset +
1604                                      btrfs_file_extent_num_bytes(leaf, fi);
1605                                WARN_ON(!IS_ALIGNED(key.offset,
1606                                                    root->sectorsize));
1607                                WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1608                                end--;
1609                                ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1610                                                      key.offset, end);
1611                                if (!ret)
1612                                        continue;
1613
1614                                btrfs_drop_extent_cache(inode, key.offset, end,
1615                                                        1);
1616                                unlock_extent(&BTRFS_I(inode)->io_tree,
1617                                              key.offset, end);
1618                        }
1619                }
1620
1621                ret = get_new_location(rc->data_inode, &new_bytenr,
1622                                       bytenr, num_bytes);
1623                if (ret > 0) {
1624                        WARN_ON(1);
1625                        continue;
1626                }
1627                BUG_ON(ret < 0);
1628
1629                btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1630                dirty = 1;
1631
1632                key.offset -= btrfs_file_extent_offset(leaf, fi);
1633                ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1634                                           num_bytes, parent,
1635                                           btrfs_header_owner(leaf),
1636                                           key.objectid, key.offset, 1);
1637                BUG_ON(ret);
1638
1639                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1640                                        parent, btrfs_header_owner(leaf),
1641                                        key.objectid, key.offset, 1);
1642                BUG_ON(ret);
1643        }
1644        if (dirty)
1645                btrfs_mark_buffer_dirty(leaf);
1646        if (inode)
1647                btrfs_add_delayed_iput(inode);
1648        return 0;
1649}
1650
1651static noinline_for_stack
1652int memcmp_node_keys(struct extent_buffer *eb, int slot,
1653                     struct btrfs_path *path, int level)
1654{
1655        struct btrfs_disk_key key1;
1656        struct btrfs_disk_key key2;
1657        btrfs_node_key(eb, &key1, slot);
1658        btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1659        return memcmp(&key1, &key2, sizeof(key1));
1660}
1661
1662/*
1663 * try to replace tree blocks in fs tree with the new blocks
1664 * in reloc tree. tree blocks haven't been modified since the
1665 * reloc tree was create can be replaced.
1666 *
1667 * if a block was replaced, level of the block + 1 is returned.
1668 * if no block got replaced, 0 is returned. if there are other
1669 * errors, a negative error number is returned.
1670 */
1671static noinline_for_stack
1672int replace_path(struct btrfs_trans_handle *trans,
1673                 struct btrfs_root *dest, struct btrfs_root *src,
1674                 struct btrfs_path *path, struct btrfs_key *next_key,
1675                 int lowest_level, int max_level)
1676{
1677        struct extent_buffer *eb;
1678        struct extent_buffer *parent;
1679        struct btrfs_key key;
1680        u64 old_bytenr;
1681        u64 new_bytenr;
1682        u64 old_ptr_gen;
1683        u64 new_ptr_gen;
1684        u64 last_snapshot;
1685        u32 blocksize;
1686        int cow = 0;
1687        int level;
1688        int ret;
1689        int slot;
1690
1691        BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1692        BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1693
1694        last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1695again:
1696        slot = path->slots[lowest_level];
1697        btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1698
1699        eb = btrfs_lock_root_node(dest);
1700        btrfs_set_lock_blocking(eb);
1701        level = btrfs_header_level(eb);
1702
1703        if (level < lowest_level) {
1704                btrfs_tree_unlock(eb);
1705                free_extent_buffer(eb);
1706                return 0;
1707        }
1708
1709        if (cow) {
1710                ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1711                BUG_ON(ret);
1712        }
1713        btrfs_set_lock_blocking(eb);
1714
1715        if (next_key) {
1716                next_key->objectid = (u64)-1;
1717                next_key->type = (u8)-1;
1718                next_key->offset = (u64)-1;
1719        }
1720
1721        parent = eb;
1722        while (1) {
1723                level = btrfs_header_level(parent);
1724                BUG_ON(level < lowest_level);
1725
1726                ret = btrfs_bin_search(parent, &key, level, &slot);
1727                if (ret && slot > 0)
1728                        slot--;
1729
1730                if (next_key && slot + 1 < btrfs_header_nritems(parent))
1731                        btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1732
1733                old_bytenr = btrfs_node_blockptr(parent, slot);
1734                blocksize = btrfs_level_size(dest, level - 1);
1735                old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1736
1737                if (level <= max_level) {
1738                        eb = path->nodes[level];
1739                        new_bytenr = btrfs_node_blockptr(eb,
1740                                                        path->slots[level]);
1741                        new_ptr_gen = btrfs_node_ptr_generation(eb,
1742                                                        path->slots[level]);
1743                } else {
1744                        new_bytenr = 0;
1745                        new_ptr_gen = 0;
1746                }
1747
1748                if (new_bytenr > 0 && new_bytenr == old_bytenr) {
1749                        WARN_ON(1);
1750                        ret = level;
1751                        break;
1752                }
1753
1754                if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1755                    memcmp_node_keys(parent, slot, path, level)) {
1756                        if (level <= lowest_level) {
1757                                ret = 0;
1758                                break;
1759                        }
1760
1761                        eb = read_tree_block(dest, old_bytenr, blocksize,
1762                                             old_ptr_gen);
1763                        BUG_ON(!eb);
1764                        btrfs_tree_lock(eb);
1765                        if (cow) {
1766                                ret = btrfs_cow_block(trans, dest, eb, parent,
1767                                                      slot, &eb);
1768                                BUG_ON(ret);
1769                        }
1770                        btrfs_set_lock_blocking(eb);
1771
1772                        btrfs_tree_unlock(parent);
1773                        free_extent_buffer(parent);
1774
1775                        parent = eb;
1776                        continue;
1777                }
1778
1779                if (!cow) {
1780                        btrfs_tree_unlock(parent);
1781                        free_extent_buffer(parent);
1782                        cow = 1;
1783                        goto again;
1784                }
1785
1786                btrfs_node_key_to_cpu(path->nodes[level], &key,
1787                                      path->slots[level]);
1788                btrfs_release_path(path);
1789
1790                path->lowest_level = level;
1791                ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1792                path->lowest_level = 0;
1793                BUG_ON(ret);
1794
1795                /*
1796                 * swap blocks in fs tree and reloc tree.
1797                 */
1798                btrfs_set_node_blockptr(parent, slot, new_bytenr);
1799                btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1800                btrfs_mark_buffer_dirty(parent);
1801
1802                btrfs_set_node_blockptr(path->nodes[level],
1803                                        path->slots[level], old_bytenr);
1804                btrfs_set_node_ptr_generation(path->nodes[level],
1805                                              path->slots[level], old_ptr_gen);
1806                btrfs_mark_buffer_dirty(path->nodes[level]);
1807
1808                ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1809                                        path->nodes[level]->start,
1810                                        src->root_key.objectid, level - 1, 0,
1811                                        1);
1812                BUG_ON(ret);
1813                ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1814                                        0, dest->root_key.objectid, level - 1,
1815                                        0, 1);
1816                BUG_ON(ret);
1817
1818                ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1819                                        path->nodes[level]->start,
1820                                        src->root_key.objectid, level - 1, 0,
1821                                        1);
1822                BUG_ON(ret);
1823
1824                ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1825                                        0, dest->root_key.objectid, level - 1,
1826                                        0, 1);
1827                BUG_ON(ret);
1828
1829                btrfs_unlock_up_safe(path, 0);
1830
1831                ret = level;
1832                break;
1833        }
1834        btrfs_tree_unlock(parent);
1835        free_extent_buffer(parent);
1836        return ret;
1837}
1838
1839/*
1840 * helper to find next relocated block in reloc tree
1841 */
1842static noinline_for_stack
1843int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1844                       int *level)
1845{
1846        struct extent_buffer *eb;
1847        int i;
1848        u64 last_snapshot;
1849        u32 nritems;
1850
1851        last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1852
1853        for (i = 0; i < *level; i++) {
1854                free_extent_buffer(path->nodes[i]);
1855                path->nodes[i] = NULL;
1856        }
1857
1858        for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1859                eb = path->nodes[i];
1860                nritems = btrfs_header_nritems(eb);
1861                while (path->slots[i] + 1 < nritems) {
1862                        path->slots[i]++;
1863                        if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1864                            last_snapshot)
1865                                continue;
1866
1867                        *level = i;
1868                        return 0;
1869                }
1870                free_extent_buffer(path->nodes[i]);
1871                path->nodes[i] = NULL;
1872        }
1873        return 1;
1874}
1875
1876/*
1877 * walk down reloc tree to find relocated block of lowest level
1878 */
1879static noinline_for_stack
1880int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1881                         int *level)
1882{
1883        struct extent_buffer *eb = NULL;
1884        int i;
1885        u64 bytenr;
1886        u64 ptr_gen = 0;
1887        u64 last_snapshot;
1888        u32 blocksize;
1889        u32 nritems;
1890
1891        last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1892
1893        for (i = *level; i > 0; i--) {
1894                eb = path->nodes[i];
1895                nritems = btrfs_header_nritems(eb);
1896                while (path->slots[i] < nritems) {
1897                        ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1898                        if (ptr_gen > last_snapshot)
1899                                break;
1900                        path->slots[i]++;
1901                }
1902                if (path->slots[i] >= nritems) {
1903                        if (i == *level)
1904                                break;
1905                        *level = i + 1;
1906                        return 0;
1907                }
1908                if (i == 1) {
1909                        *level = i;
1910                        return 0;
1911                }
1912
1913                bytenr = btrfs_node_blockptr(eb, path->slots[i]);
1914                blocksize = btrfs_level_size(root, i - 1);
1915                eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
1916                BUG_ON(btrfs_header_level(eb) != i - 1);
1917                path->nodes[i - 1] = eb;
1918                path->slots[i - 1] = 0;
1919        }
1920        return 1;
1921}
1922
1923/*
1924 * invalidate extent cache for file extents whose key in range of
1925 * [min_key, max_key)
1926 */
1927static int invalidate_extent_cache(struct btrfs_root *root,
1928                                   struct btrfs_key *min_key,
1929                                   struct btrfs_key *max_key)
1930{
1931        struct inode *inode = NULL;
1932        u64 objectid;
1933        u64 start, end;
1934        u64 ino;
1935
1936        objectid = min_key->objectid;
1937        while (1) {
1938                cond_resched();
1939                iput(inode);
1940
1941                if (objectid > max_key->objectid)
1942                        break;
1943
1944                inode = find_next_inode(root, objectid);
1945                if (!inode)
1946                        break;
1947                ino = btrfs_ino(inode);
1948
1949                if (ino > max_key->objectid) {
1950                        iput(inode);
1951                        break;
1952                }
1953
1954                objectid = ino + 1;
1955                if (!S_ISREG(inode->i_mode))
1956                        continue;
1957
1958                if (unlikely(min_key->objectid == ino)) {
1959                        if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1960                                continue;
1961                        if (min_key->type < BTRFS_EXTENT_DATA_KEY)
1962                                start = 0;
1963                        else {
1964                                start = min_key->offset;
1965                                WARN_ON(!IS_ALIGNED(start, root->sectorsize));
1966                        }
1967                } else {
1968                        start = 0;
1969                }
1970
1971                if (unlikely(max_key->objectid == ino)) {
1972                        if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1973                                continue;
1974                        if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
1975                                end = (u64)-1;
1976                        } else {
1977                                if (max_key->offset == 0)
1978                                        continue;
1979                                end = max_key->offset;
1980                                WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1981                                end--;
1982                        }
1983                } else {
1984                        end = (u64)-1;
1985                }
1986
1987                /* the lock_extent waits for readpage to complete */
1988                lock_extent(&BTRFS_I(inode)->io_tree, start, end);
1989                btrfs_drop_extent_cache(inode, start, end, 1);
1990                unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
1991        }
1992        return 0;
1993}
1994
1995static int find_next_key(struct btrfs_path *path, int level,
1996                         struct btrfs_key *key)
1997
1998{
1999        while (level < BTRFS_MAX_LEVEL) {
2000                if (!path->nodes[level])
2001                        break;
2002                if (path->slots[level] + 1 <
2003                    btrfs_header_nritems(path->nodes[level])) {
2004                        btrfs_node_key_to_cpu(path->nodes[level], key,
2005                                              path->slots[level] + 1);
2006                        return 0;
2007                }
2008                level++;
2009        }
2010        return 1;
2011}
2012
2013/*
2014 * merge the relocated tree blocks in reloc tree with corresponding
2015 * fs tree.
2016 */
2017static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2018                                               struct btrfs_root *root)
2019{
2020        LIST_HEAD(inode_list);
2021        struct btrfs_key key;
2022        struct btrfs_key next_key;
2023        struct btrfs_trans_handle *trans;
2024        struct btrfs_root *reloc_root;
2025        struct btrfs_root_item *root_item;
2026        struct btrfs_path *path;
2027        struct extent_buffer *leaf;
2028        unsigned long nr;
2029        int level;
2030        int max_level;
2031        int replaced = 0;
2032        int ret;
2033        int err = 0;
2034        u32 min_reserved;
2035
2036        path = btrfs_alloc_path();
2037        if (!path)
2038                return -ENOMEM;
2039        path->reada = 1;
2040
2041        reloc_root = root->reloc_root;
2042        root_item = &reloc_root->root_item;
2043
2044        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2045                level = btrfs_root_level(root_item);
2046                extent_buffer_get(reloc_root->node);
2047                path->nodes[level] = reloc_root->node;
2048                path->slots[level] = 0;
2049        } else {
2050                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2051
2052                level = root_item->drop_level;
2053                BUG_ON(level == 0);
2054                path->lowest_level = level;
2055                ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
2056                path->lowest_level = 0;
2057                if (ret < 0) {
2058                        btrfs_free_path(path);
2059                        return ret;
2060                }
2061
2062                btrfs_node_key_to_cpu(path->nodes[level], &next_key,
2063                                      path->slots[level]);
2064                WARN_ON(memcmp(&key, &next_key, sizeof(key)));
2065
2066                btrfs_unlock_up_safe(path, 0);
2067        }
2068
2069        min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2070        memset(&next_key, 0, sizeof(next_key));
2071
2072        while (1) {
2073                trans = btrfs_start_transaction(root, 0);
2074                BUG_ON(IS_ERR(trans));
2075                trans->block_rsv = rc->block_rsv;
2076
2077                ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
2078                if (ret) {
2079                        BUG_ON(ret != -EAGAIN);
2080                        ret = btrfs_commit_transaction(trans, root);
2081                        BUG_ON(ret);
2082                        continue;
2083                }
2084
2085                replaced = 0;
2086                max_level = level;
2087
2088                ret = walk_down_reloc_tree(reloc_root, path, &level);
2089                if (ret < 0) {
2090                        err = ret;
2091                        goto out;
2092                }
2093                if (ret > 0)
2094                        break;
2095
2096                if (!find_next_key(path, level, &key) &&
2097                    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
2098                        ret = 0;
2099                } else {
2100                        ret = replace_path(trans, root, reloc_root, path,
2101                                           &next_key, level, max_level);
2102                }
2103                if (ret < 0) {
2104                        err = ret;
2105                        goto out;
2106                }
2107
2108                if (ret > 0) {
2109                        level = ret;
2110                        btrfs_node_key_to_cpu(path->nodes[level], &key,
2111                                              path->slots[level]);
2112                        replaced = 1;
2113                }
2114
2115                ret = walk_up_reloc_tree(reloc_root, path, &level);
2116                if (ret > 0)
2117                        break;
2118
2119                BUG_ON(level == 0);
2120                /*
2121                 * save the merging progress in the drop_progress.
2122                 * this is OK since root refs == 1 in this case.
2123                 */
2124                btrfs_node_key(path->nodes[level], &root_item->drop_progress,
2125                               path->slots[level]);
2126                root_item->drop_level = level;
2127
2128                nr = trans->blocks_used;
2129                btrfs_end_transaction_throttle(trans, root);
2130
2131                btrfs_btree_balance_dirty(root, nr);
2132
2133                if (replaced && rc->stage == UPDATE_DATA_PTRS)
2134                        invalidate_extent_cache(root, &key, &next_key);
2135        }
2136
2137        /*
2138         * handle the case only one block in the fs tree need to be
2139         * relocated and the block is tree root.
2140         */
2141        leaf = btrfs_lock_root_node(root);
2142        ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
2143        btrfs_tree_unlock(leaf);
2144        free_extent_buffer(leaf);
2145        if (ret < 0)
2146                err = ret;
2147out:
2148        btrfs_free_path(path);
2149
2150        if (err == 0) {
2151                memset(&root_item->drop_progress, 0,
2152                       sizeof(root_item->drop_progress));
2153                root_item->drop_level = 0;
2154                btrfs_set_root_refs(root_item, 0);
2155                btrfs_update_reloc_root(trans, root);
2156        }
2157
2158        nr = trans->blocks_used;
2159        btrfs_end_transaction_throttle(trans, root);
2160
2161        btrfs_btree_balance_dirty(root, nr);
2162
2163        if (replaced && rc->stage == UPDATE_DATA_PTRS)
2164                invalidate_extent_cache(root, &key, &next_key);
2165
2166        return err;
2167}
2168
2169static noinline_for_stack
2170int prepare_to_merge(struct reloc_control *rc, int err)
2171{
2172        struct btrfs_root *root = rc->extent_root;
2173        struct btrfs_root *reloc_root;
2174        struct btrfs_trans_handle *trans;
2175        LIST_HEAD(reloc_roots);
2176        u64 num_bytes = 0;
2177        int ret;
2178
2179        mutex_lock(&root->fs_info->reloc_mutex);
2180        rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2181        rc->merging_rsv_size += rc->nodes_relocated * 2;
2182        mutex_unlock(&root->fs_info->reloc_mutex);
2183
2184again:
2185        if (!err) {
2186                num_bytes = rc->merging_rsv_size;
2187                ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2188                if (ret)
2189                        err = ret;
2190        }
2191
2192        trans = btrfs_join_transaction(rc->extent_root);
2193        if (IS_ERR(trans)) {
2194                if (!err)
2195                        btrfs_block_rsv_release(rc->extent_root,
2196                                                rc->block_rsv, num_bytes);
2197                return PTR_ERR(trans);
2198        }
2199
2200        if (!err) {
2201                if (num_bytes != rc->merging_rsv_size) {
2202                        btrfs_end_transaction(trans, rc->extent_root);
2203                        btrfs_block_rsv_release(rc->extent_root,
2204                                                rc->block_rsv, num_bytes);
2205                        goto again;
2206                }
2207        }
2208
2209        rc->merge_reloc_tree = 1;
2210
2211        while (!list_empty(&rc->reloc_roots)) {
2212                reloc_root = list_entry(rc->reloc_roots.next,
2213                                        struct btrfs_root, root_list);
2214                list_del_init(&reloc_root->root_list);
2215
2216                root = read_fs_root(reloc_root->fs_info,
2217                                    reloc_root->root_key.offset);
2218                BUG_ON(IS_ERR(root));
2219                BUG_ON(root->reloc_root != reloc_root);
2220
2221                /*
2222                 * set reference count to 1, so btrfs_recover_relocation
2223                 * knows it should resumes merging
2224                 */
2225                if (!err)
2226                        btrfs_set_root_refs(&reloc_root->root_item, 1);
2227                btrfs_update_reloc_root(trans, root);
2228
2229                list_add(&reloc_root->root_list, &reloc_roots);
2230        }
2231
2232        list_splice(&reloc_roots, &rc->reloc_roots);
2233
2234        if (!err)
2235                btrfs_commit_transaction(trans, rc->extent_root);
2236        else
2237                btrfs_end_transaction(trans, rc->extent_root);
2238        return err;
2239}
2240
2241static noinline_for_stack
2242int merge_reloc_roots(struct reloc_control *rc)
2243{
2244        struct btrfs_root *root;
2245        struct btrfs_root *reloc_root;
2246        LIST_HEAD(reloc_roots);
2247        int found = 0;
2248        int ret;
2249again:
2250        root = rc->extent_root;
2251
2252        /*
2253         * this serializes us with btrfs_record_root_in_transaction,
2254         * we have to make sure nobody is in the middle of
2255         * adding their roots to the list while we are
2256         * doing this splice
2257         */
2258        mutex_lock(&root->fs_info->reloc_mutex);
2259        list_splice_init(&rc->reloc_roots, &reloc_roots);
2260        mutex_unlock(&root->fs_info->reloc_mutex);
2261
2262        while (!list_empty(&reloc_roots)) {
2263                found = 1;
2264                reloc_root = list_entry(reloc_roots.next,
2265                                        struct btrfs_root, root_list);
2266
2267                if (btrfs_root_refs(&reloc_root->root_item) > 0) {
2268                        root = read_fs_root(reloc_root->fs_info,
2269                                            reloc_root->root_key.offset);
2270                        BUG_ON(IS_ERR(root));
2271                        BUG_ON(root->reloc_root != reloc_root);
2272
2273                        ret = merge_reloc_root(rc, root);
2274                        BUG_ON(ret);
2275                } else {
2276                        list_del_init(&reloc_root->root_list);
2277                }
2278                ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2279                BUG_ON(ret < 0);
2280        }
2281
2282        if (found) {
2283                found = 0;
2284                goto again;
2285        }
2286        BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2287        return 0;
2288}
2289
2290static void free_block_list(struct rb_root *blocks)
2291{
2292        struct tree_block *block;
2293        struct rb_node *rb_node;
2294        while ((rb_node = rb_first(blocks))) {
2295                block = rb_entry(rb_node, struct tree_block, rb_node);
2296                rb_erase(rb_node, blocks);
2297                kfree(block);
2298        }
2299}
2300
2301static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2302                                      struct btrfs_root *reloc_root)
2303{
2304        struct btrfs_root *root;
2305
2306        if (reloc_root->last_trans == trans->transid)
2307                return 0;
2308
2309        root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
2310        BUG_ON(IS_ERR(root));
2311        BUG_ON(root->reloc_root != reloc_root);
2312
2313        return btrfs_record_root_in_trans(trans, root);
2314}
2315
2316static noinline_for_stack
2317struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2318                                     struct reloc_control *rc,
2319                                     struct backref_node *node,
2320                                     struct backref_edge *edges[], int *nr)
2321{
2322        struct backref_node *next;
2323        struct btrfs_root *root;
2324        int index = 0;
2325
2326        next = node;
2327        while (1) {
2328                cond_resched();
2329                next = walk_up_backref(next, edges, &index);
2330                root = next->root;
2331                BUG_ON(!root);
2332                BUG_ON(!root->ref_cows);
2333
2334                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2335                        record_reloc_root_in_trans(trans, root);
2336                        break;
2337                }
2338
2339                btrfs_record_root_in_trans(trans, root);
2340                root = root->reloc_root;
2341
2342                if (next->new_bytenr != root->node->start) {
2343                        BUG_ON(next->new_bytenr);
2344                        BUG_ON(!list_empty(&next->list));
2345                        next->new_bytenr = root->node->start;
2346                        next->root = root;
2347                        list_add_tail(&next->list,
2348                                      &rc->backref_cache.changed);
2349                        __mark_block_processed(rc, next);
2350                        break;
2351                }
2352
2353                WARN_ON(1);
2354                root = NULL;
2355                next = walk_down_backref(edges, &index);
2356                if (!next || next->level <= node->level)
2357                        break;
2358        }
2359        if (!root)
2360                return NULL;
2361
2362        *nr = index;
2363        next = node;
2364        /* setup backref node path for btrfs_reloc_cow_block */
2365        while (1) {
2366                rc->backref_cache.path[next->level] = next;
2367                if (--index < 0)
2368                        break;
2369                next = edges[index]->node[UPPER];
2370        }
2371        return root;
2372}
2373
2374/*
2375 * select a tree root for relocation. return NULL if the block
2376 * is reference counted. we should use do_relocation() in this
2377 * case. return a tree root pointer if the block isn't reference
2378 * counted. return -ENOENT if the block is root of reloc tree.
2379 */
2380static noinline_for_stack
2381struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
2382                                   struct backref_node *node)
2383{
2384        struct backref_node *next;
2385        struct btrfs_root *root;
2386        struct btrfs_root *fs_root = NULL;
2387        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2388        int index = 0;
2389
2390        next = node;
2391        while (1) {
2392                cond_resched();
2393                next = walk_up_backref(next, edges, &index);
2394                root = next->root;
2395                BUG_ON(!root);
2396
2397                /* no other choice for non-references counted tree */
2398                if (!root->ref_cows)
2399                        return root;
2400
2401                if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2402                        fs_root = root;
2403
2404                if (next != node)
2405                        return NULL;
2406
2407                next = walk_down_backref(edges, &index);
2408                if (!next || next->level <= node->level)
2409                        break;
2410        }
2411
2412        if (!fs_root)
2413                return ERR_PTR(-ENOENT);
2414        return fs_root;
2415}
2416
2417static noinline_for_stack
2418u64 calcu_metadata_size(struct reloc_control *rc,
2419                        struct backref_node *node, int reserve)
2420{
2421        struct backref_node *next = node;
2422        struct backref_edge *edge;
2423        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2424        u64 num_bytes = 0;
2425        int index = 0;
2426
2427        BUG_ON(reserve && node->processed);
2428
2429        while (next) {
2430                cond_resched();
2431                while (1) {
2432                        if (next->processed && (reserve || next != node))
2433                                break;
2434
2435                        num_bytes += btrfs_level_size(rc->extent_root,
2436                                                      next->level);
2437
2438                        if (list_empty(&next->upper))
2439                                break;
2440
2441                        edge = list_entry(next->upper.next,
2442                                          struct backref_edge, list[LOWER]);
2443                        edges[index++] = edge;
2444                        next = edge->node[UPPER];
2445                }
2446                next = walk_down_backref(edges, &index);
2447        }
2448        return num_bytes;
2449}
2450
2451static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2452                                  struct reloc_control *rc,
2453                                  struct backref_node *node)
2454{
2455        struct btrfs_root *root = rc->extent_root;
2456        u64 num_bytes;
2457        int ret;
2458
2459        num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2460
2461        trans->block_rsv = rc->block_rsv;
2462        ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2463        if (ret) {
2464                if (ret == -EAGAIN)
2465                        rc->commit_transaction = 1;
2466                return ret;
2467        }
2468
2469        return 0;
2470}
2471
2472static void release_metadata_space(struct reloc_control *rc,
2473                                   struct backref_node *node)
2474{
2475        u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
2476        btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
2477}
2478
2479/*
2480 * relocate a block tree, and then update pointers in upper level
2481 * blocks that reference the block to point to the new location.
2482 *
2483 * if called by link_to_upper, the block has already been relocated.
2484 * in that case this function just updates pointers.
2485 */
2486static int do_relocation(struct btrfs_trans_handle *trans,
2487                         struct reloc_control *rc,
2488                         struct backref_node *node,
2489                         struct btrfs_key *key,
2490                         struct btrfs_path *path, int lowest)
2491{
2492        struct backref_node *upper;
2493        struct backref_edge *edge;
2494        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2495        struct btrfs_root *root;
2496        struct extent_buffer *eb;
2497        u32 blocksize;
2498        u64 bytenr;
2499        u64 generation;
2500        int nr;
2501        int slot;
2502        int ret;
2503        int err = 0;
2504
2505        BUG_ON(lowest && node->eb);
2506
2507        path->lowest_level = node->level + 1;
2508        rc->backref_cache.path[node->level] = node;
2509        list_for_each_entry(edge, &node->upper, list[LOWER]) {
2510                cond_resched();
2511
2512                upper = edge->node[UPPER];
2513                root = select_reloc_root(trans, rc, upper, edges, &nr);
2514                BUG_ON(!root);
2515
2516                if (upper->eb && !upper->locked) {
2517                        if (!lowest) {
2518                                ret = btrfs_bin_search(upper->eb, key,
2519                                                       upper->level, &slot);
2520                                BUG_ON(ret);
2521                                bytenr = btrfs_node_blockptr(upper->eb, slot);
2522                                if (node->eb->start == bytenr)
2523                                        goto next;
2524                        }
2525                        drop_node_buffer(upper);
2526                }
2527
2528                if (!upper->eb) {
2529                        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2530                        if (ret < 0) {
2531                                err = ret;
2532                                break;
2533                        }
2534                        BUG_ON(ret > 0);
2535
2536                        if (!upper->eb) {
2537                                upper->eb = path->nodes[upper->level];
2538                                path->nodes[upper->level] = NULL;
2539                        } else {
2540                                BUG_ON(upper->eb != path->nodes[upper->level]);
2541                        }
2542
2543                        upper->locked = 1;
2544                        path->locks[upper->level] = 0;
2545
2546                        slot = path->slots[upper->level];
2547                        btrfs_release_path(path);
2548                } else {
2549                        ret = btrfs_bin_search(upper->eb, key, upper->level,
2550                                               &slot);
2551                        BUG_ON(ret);
2552                }
2553
2554                bytenr = btrfs_node_blockptr(upper->eb, slot);
2555                if (lowest) {
2556                        BUG_ON(bytenr != node->bytenr);
2557                } else {
2558                        if (node->eb->start == bytenr)
2559                                goto next;
2560                }
2561
2562                blocksize = btrfs_level_size(root, node->level);
2563                generation = btrfs_node_ptr_generation(upper->eb, slot);
2564                eb = read_tree_block(root, bytenr, blocksize, generation);
2565                if (!eb) {
2566                        err = -EIO;
2567                        goto next;
2568                }
2569                btrfs_tree_lock(eb);
2570                btrfs_set_lock_blocking(eb);
2571
2572                if (!node->eb) {
2573                        ret = btrfs_cow_block(trans, root, eb, upper->eb,
2574                                              slot, &eb);
2575                        btrfs_tree_unlock(eb);
2576                        free_extent_buffer(eb);
2577                        if (ret < 0) {
2578                                err = ret;
2579                                goto next;
2580                        }
2581                        BUG_ON(node->eb != eb);
2582                } else {
2583                        btrfs_set_node_blockptr(upper->eb, slot,
2584                                                node->eb->start);
2585                        btrfs_set_node_ptr_generation(upper->eb, slot,
2586                                                      trans->transid);
2587                        btrfs_mark_buffer_dirty(upper->eb);
2588
2589                        ret = btrfs_inc_extent_ref(trans, root,
2590                                                node->eb->start, blocksize,
2591                                                upper->eb->start,
2592                                                btrfs_header_owner(upper->eb),
2593                                                node->level, 0, 1);
2594                        BUG_ON(ret);
2595
2596                        ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2597                        BUG_ON(ret);
2598                }
2599next:
2600                if (!upper->pending)
2601                        drop_node_buffer(upper);
2602                else
2603                        unlock_node_buffer(upper);
2604                if (err)
2605                        break;
2606        }
2607
2608        if (!err && node->pending) {
2609                drop_node_buffer(node);
2610                list_move_tail(&node->list, &rc->backref_cache.changed);
2611                node->pending = 0;
2612        }
2613
2614        path->lowest_level = 0;
2615        BUG_ON(err == -ENOSPC);
2616        return err;
2617}
2618
2619static int link_to_upper(struct btrfs_trans_handle *trans,
2620                         struct reloc_control *rc,
2621                         struct backref_node *node,
2622                         struct btrfs_path *path)
2623{
2624        struct btrfs_key key;
2625
2626        btrfs_node_key_to_cpu(node->eb, &key, 0);
2627        return do_relocation(trans, rc, node, &key, path, 0);
2628}
2629
2630static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2631                                struct reloc_control *rc,
2632                                struct btrfs_path *path, int err)
2633{
2634        LIST_HEAD(list);
2635        struct backref_cache *cache = &rc->backref_cache;
2636        struct backref_node *node;
2637        int level;
2638        int ret;
2639
2640        for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2641                while (!list_empty(&cache->pending[level])) {
2642                        node = list_entry(cache->pending[level].next,
2643                                          struct backref_node, list);
2644                        list_move_tail(&node->list, &list);
2645                        BUG_ON(!node->pending);
2646
2647                        if (!err) {
2648                                ret = link_to_upper(trans, rc, node, path);
2649                                if (ret < 0)
2650                                        err = ret;
2651                        }
2652                }
2653                list_splice_init(&list, &cache->pending[level]);
2654        }
2655        return err;
2656}
2657
2658static void mark_block_processed(struct reloc_control *rc,
2659                                 u64 bytenr, u32 blocksize)
2660{
2661        set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2662                        EXTENT_DIRTY, GFP_NOFS);
2663}
2664
2665static void __mark_block_processed(struct reloc_control *rc,
2666                                   struct backref_node *node)
2667{
2668        u32 blocksize;
2669        if (node->level == 0 ||
2670            in_block_group(node->bytenr, rc->block_group)) {
2671                blocksize = btrfs_level_size(rc->extent_root, node->level);
2672                mark_block_processed(rc, node->bytenr, blocksize);
2673        }
2674        node->processed = 1;
2675}
2676
2677/*
2678 * mark a block and all blocks directly/indirectly reference the block
2679 * as processed.
2680 */
2681static void update_processed_blocks(struct reloc_control *rc,
2682                                    struct backref_node *node)
2683{
2684        struct backref_node *next = node;
2685        struct backref_edge *edge;
2686        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2687        int index = 0;
2688
2689        while (next) {
2690                cond_resched();
2691                while (1) {
2692                        if (next->processed)
2693                                break;
2694
2695                        __mark_block_processed(rc, next);
2696
2697                        if (list_empty(&next->upper))
2698                                break;
2699
2700                        edge = list_entry(next->upper.next,
2701                                          struct backref_edge, list[LOWER]);
2702                        edges[index++] = edge;
2703                        next = edge->node[UPPER];
2704                }
2705                next = walk_down_backref(edges, &index);
2706        }
2707}
2708
2709static int tree_block_processed(u64 bytenr, u32 blocksize,
2710                                struct reloc_control *rc)
2711{
2712        if (test_range_bit(&rc->processed_blocks, bytenr,
2713                           bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2714                return 1;
2715        return 0;
2716}
2717
2718static int get_tree_block_key(struct reloc_control *rc,
2719                              struct tree_block *block)
2720{
2721        struct extent_buffer *eb;
2722
2723        BUG_ON(block->key_ready);
2724        eb = read_tree_block(rc->extent_root, block->bytenr,
2725                             block->key.objectid, block->key.offset);
2726        BUG_ON(!eb);
2727        WARN_ON(btrfs_header_level(eb) != block->level);
2728        if (block->level == 0)
2729                btrfs_item_key_to_cpu(eb, &block->key, 0);
2730        else
2731                btrfs_node_key_to_cpu(eb, &block->key, 0);
2732        free_extent_buffer(eb);
2733        block->key_ready = 1;
2734        return 0;
2735}
2736
2737static int reada_tree_block(struct reloc_control *rc,
2738                            struct tree_block *block)
2739{
2740        BUG_ON(block->key_ready);
2741        readahead_tree_block(rc->extent_root, block->bytenr,
2742                             block->key.objectid, block->key.offset);
2743        return 0;
2744}
2745
2746/*
2747 * helper function to relocate a tree block
2748 */
2749static int relocate_tree_block(struct btrfs_trans_handle *trans,
2750                                struct reloc_control *rc,
2751                                struct backref_node *node,
2752                                struct btrfs_key *key,
2753                                struct btrfs_path *path)
2754{
2755        struct btrfs_root *root;
2756        int release = 0;
2757        int ret = 0;
2758
2759        if (!node)
2760                return 0;
2761
2762        BUG_ON(node->processed);
2763        root = select_one_root(trans, node);
2764        if (root == ERR_PTR(-ENOENT)) {
2765                update_processed_blocks(rc, node);
2766                goto out;
2767        }
2768
2769        if (!root || root->ref_cows) {
2770                ret = reserve_metadata_space(trans, rc, node);
2771                if (ret)
2772                        goto out;
2773                release = 1;
2774        }
2775
2776        if (root) {
2777                if (root->ref_cows) {
2778                        BUG_ON(node->new_bytenr);
2779                        BUG_ON(!list_empty(&node->list));
2780                        btrfs_record_root_in_trans(trans, root);
2781                        root = root->reloc_root;
2782                        node->new_bytenr = root->node->start;
2783                        node->root = root;
2784                        list_add_tail(&node->list, &rc->backref_cache.changed);
2785                } else {
2786                        path->lowest_level = node->level;
2787                        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2788                        btrfs_release_path(path);
2789                        if (ret > 0)
2790                                ret = 0;
2791                }
2792                if (!ret)
2793                        update_processed_blocks(rc, node);
2794        } else {
2795                ret = do_relocation(trans, rc, node, key, path, 1);
2796        }
2797out:
2798        if (ret || node->level == 0 || node->cowonly) {
2799                if (release)
2800                        release_metadata_space(rc, node);
2801                remove_backref_node(&rc->backref_cache, node);
2802        }
2803        return ret;
2804}
2805
2806/*
2807 * relocate a list of blocks
2808 */
2809static noinline_for_stack
2810int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2811                         struct reloc_control *rc, struct rb_root *blocks)
2812{
2813        struct backref_node *node;
2814        struct btrfs_path *path;
2815        struct tree_block *block;
2816        struct rb_node *rb_node;
2817        int ret;
2818        int err = 0;
2819
2820        path = btrfs_alloc_path();
2821        if (!path)
2822                return -ENOMEM;
2823
2824        rb_node = rb_first(blocks);
2825        while (rb_node) {
2826                block = rb_entry(rb_node, struct tree_block, rb_node);
2827                if (!block->key_ready)
2828                        reada_tree_block(rc, block);
2829                rb_node = rb_next(rb_node);
2830        }
2831
2832        rb_node = rb_first(blocks);
2833        while (rb_node) {
2834                block = rb_entry(rb_node, struct tree_block, rb_node);
2835                if (!block->key_ready)
2836                        get_tree_block_key(rc, block);
2837                rb_node = rb_next(rb_node);
2838        }
2839
2840        rb_node = rb_first(blocks);
2841        while (rb_node) {
2842                block = rb_entry(rb_node, struct tree_block, rb_node);
2843
2844                node = build_backref_tree(rc, &block->key,
2845                                          block->level, block->bytenr);
2846                if (IS_ERR(node)) {
2847                        err = PTR_ERR(node);
2848                        goto out;
2849                }
2850
2851                ret = relocate_tree_block(trans, rc, node, &block->key,
2852                                          path);
2853                if (ret < 0) {
2854                        if (ret != -EAGAIN || rb_node == rb_first(blocks))
2855                                err = ret;
2856                        goto out;
2857                }
2858                rb_node = rb_next(rb_node);
2859        }
2860out:
2861        free_block_list(blocks);
2862        err = finish_pending_nodes(trans, rc, path, err);
2863
2864        btrfs_free_path(path);
2865        return err;
2866}
2867
2868static noinline_for_stack
2869int prealloc_file_extent_cluster(struct inode *inode,
2870                                 struct file_extent_cluster *cluster)
2871{
2872        u64 alloc_hint = 0;
2873        u64 start;
2874        u64 end;
2875        u64 offset = BTRFS_I(inode)->index_cnt;
2876        u64 num_bytes;
2877        int nr = 0;
2878        int ret = 0;
2879
2880        BUG_ON(cluster->start != cluster->boundary[0]);
2881        mutex_lock(&inode->i_mutex);
2882
2883        ret = btrfs_check_data_free_space(inode, cluster->end +
2884                                          1 - cluster->start);
2885        if (ret)
2886                goto out;
2887
2888        while (nr < cluster->nr) {
2889                start = cluster->boundary[nr] - offset;
2890                if (nr + 1 < cluster->nr)
2891                        end = cluster->boundary[nr + 1] - 1 - offset;
2892                else
2893                        end = cluster->end - offset;
2894
2895                lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2896                num_bytes = end + 1 - start;
2897                ret = btrfs_prealloc_file_range(inode, 0, start,
2898                                                num_bytes, num_bytes,
2899                                                end + 1, &alloc_hint);
2900                unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2901                if (ret)
2902                        break;
2903                nr++;
2904        }
2905        btrfs_free_reserved_data_space(inode, cluster->end +
2906                                       1 - cluster->start);
2907out:
2908        mutex_unlock(&inode->i_mutex);
2909        return ret;
2910}
2911
2912static noinline_for_stack
2913int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
2914                         u64 block_start)
2915{
2916        struct btrfs_root *root = BTRFS_I(inode)->root;
2917        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2918        struct extent_map *em;
2919        int ret = 0;
2920
2921        em = alloc_extent_map();
2922        if (!em)
2923                return -ENOMEM;
2924
2925        em->start = start;
2926        em->len = end + 1 - start;
2927        em->block_len = em->len;
2928        em->block_start = block_start;
2929        em->bdev = root->fs_info->fs_devices->latest_bdev;
2930        set_bit(EXTENT_FLAG_PINNED, &em->flags);
2931
2932        lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2933        while (1) {
2934                write_lock(&em_tree->lock);
2935                ret = add_extent_mapping(em_tree, em);
2936                write_unlock(&em_tree->lock);
2937                if (ret != -EEXIST) {
2938                        free_extent_map(em);
2939                        break;
2940                }
2941                btrfs_drop_extent_cache(inode, start, end, 0);
2942        }
2943        unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2944        return ret;
2945}
2946
2947static int relocate_file_extent_cluster(struct inode *inode,
2948                                        struct file_extent_cluster *cluster)
2949{
2950        u64 page_start;
2951        u64 page_end;
2952        u64 offset = BTRFS_I(inode)->index_cnt;
2953        unsigned long index;
2954        unsigned long last_index;
2955        struct page *page;
2956        struct file_ra_state *ra;
2957        gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
2958        int nr = 0;
2959        int ret = 0;
2960
2961        if (!cluster->nr)
2962                return 0;
2963
2964        ra = kzalloc(sizeof(*ra), GFP_NOFS);
2965        if (!ra)
2966                return -ENOMEM;
2967
2968        ret = prealloc_file_extent_cluster(inode, cluster);
2969        if (ret)
2970                goto out;
2971
2972        file_ra_state_init(ra, inode->i_mapping);
2973
2974        ret = setup_extent_mapping(inode, cluster->start - offset,
2975                                   cluster->end - offset, cluster->start);
2976        if (ret)
2977                goto out;
2978
2979        index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2980        last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2981        while (index <= last_index) {
2982                ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2983                if (ret)
2984                        goto out;
2985
2986                page = find_lock_page(inode->i_mapping, index);
2987                if (!page) {
2988                        page_cache_sync_readahead(inode->i_mapping,
2989                                                  ra, NULL, index,
2990                                                  last_index + 1 - index);
2991                        page = find_or_create_page(inode->i_mapping, index,
2992                                                   mask);
2993                        if (!page) {
2994                                btrfs_delalloc_release_metadata(inode,
2995                                                        PAGE_CACHE_SIZE);
2996                                ret = -ENOMEM;
2997                                goto out;
2998                        }
2999                }
3000
3001                if (PageReadahead(page)) {
3002                        page_cache_async_readahead(inode->i_mapping,
3003                                                   ra, NULL, page, index,
3004                                                   last_index + 1 - index);
3005                }
3006
3007                if (!PageUptodate(page)) {
3008                        btrfs_readpage(NULL, page);
3009                        lock_page(page);
3010                        if (!PageUptodate(page)) {
3011                                unlock_page(page);
3012                                page_cache_release(page);
3013                                btrfs_delalloc_release_metadata(inode,
3014                                                        PAGE_CACHE_SIZE);
3015                                ret = -EIO;
3016                                goto out;
3017                        }
3018                }
3019
3020                page_start = (u64)page->index << PAGE_CACHE_SHIFT;
3021                page_end = page_start + PAGE_CACHE_SIZE - 1;
3022
3023                lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
3024
3025                set_page_extent_mapped(page);
3026
3027                if (nr < cluster->nr &&
3028                    page_start + offset == cluster->boundary[nr]) {
3029                        set_extent_bits(&BTRFS_I(inode)->io_tree,
3030                                        page_start, page_end,
3031                                        EXTENT_BOUNDARY, GFP_NOFS);
3032                        nr++;
3033                }
3034
3035                btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
3036                set_page_dirty(page);
3037
3038                unlock_extent(&BTRFS_I(inode)->io_tree,
3039                              page_start, page_end);
3040                unlock_page(page);
3041                page_cache_release(page);
3042
3043                index++;
3044                balance_dirty_pages_ratelimited(inode->i_mapping);
3045                btrfs_throttle(BTRFS_I(inode)->root);
3046        }
3047        WARN_ON(nr != cluster->nr);
3048out:
3049        kfree(ra);
3050        return ret;
3051}
3052
3053static noinline_for_stack
3054int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
3055                         struct file_extent_cluster *cluster)
3056{
3057        int ret;
3058
3059        if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3060                ret = relocate_file_extent_cluster(inode, cluster);
3061                if (ret)
3062                        return ret;
3063                cluster->nr = 0;
3064        }
3065
3066        if (!cluster->nr)
3067                cluster->start = extent_key->objectid;
3068        else
3069                BUG_ON(cluster->nr >= MAX_EXTENTS);
3070        cluster->end = extent_key->objectid + extent_key->offset - 1;
3071        cluster->boundary[cluster->nr] = extent_key->objectid;
3072        cluster->nr++;
3073
3074        if (cluster->nr >= MAX_EXTENTS) {
3075                ret = relocate_file_extent_cluster(inode, cluster);
3076                if (ret)
3077                        return ret;
3078                cluster->nr = 0;
3079        }
3080        return 0;
3081}
3082
3083#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3084static int get_ref_objectid_v0(struct reloc_control *rc,
3085                               struct btrfs_path *path,
3086                               struct btrfs_key *extent_key,
3087                               u64 *ref_objectid, int *path_change)
3088{
3089        struct btrfs_key key;
3090        struct extent_buffer *leaf;
3091        struct btrfs_extent_ref_v0 *ref0;
3092        int ret;
3093        int slot;
3094
3095        leaf = path->nodes[0];
3096        slot = path->slots[0];
3097        while (1) {
3098                if (slot >= btrfs_header_nritems(leaf)) {
3099                        ret = btrfs_next_leaf(rc->extent_root, path);
3100                        if (ret < 0)
3101                                return ret;
3102                        BUG_ON(ret > 0);
3103                        leaf = path->nodes[0];
3104                        slot = path->slots[0];
3105                        if (path_change)
3106                                *path_change = 1;
3107                }
3108                btrfs_item_key_to_cpu(leaf, &key, slot);
3109                if (key.objectid != extent_key->objectid)
3110                        return -ENOENT;
3111
3112                if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
3113                        slot++;
3114                        continue;
3115                }
3116                ref0 = btrfs_item_ptr(leaf, slot,
3117                                struct btrfs_extent_ref_v0);
3118                *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
3119                break;
3120        }
3121        return 0;
3122}
3123#endif
3124
3125/*
3126 * helper to add a tree block to the list.
3127 * the major work is getting the generation and level of the block
3128 */
3129static int add_tree_block(struct reloc_control *rc,
3130                          struct btrfs_key *extent_key,
3131                          struct btrfs_path *path,
3132                          struct rb_root *blocks)
3133{
3134        struct extent_buffer *eb;
3135        struct btrfs_extent_item *ei;
3136        struct btrfs_tree_block_info *bi;
3137        struct tree_block *block;
3138        struct rb_node *rb_node;
3139        u32 item_size;
3140        int level = -1;
3141        int generation;
3142
3143        eb =  path->nodes[0];
3144        item_size = btrfs_item_size_nr(eb, path->slots[0]);
3145
3146        if (item_size >= sizeof(*ei) + sizeof(*bi)) {
3147                ei = btrfs_item_ptr(eb, path->slots[0],
3148                                struct btrfs_extent_item);
3149                bi = (struct btrfs_tree_block_info *)(ei + 1);
3150                generation = btrfs_extent_generation(eb, ei);
3151                level = btrfs_tree_block_level(eb, bi);
3152        } else {
3153#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3154                u64 ref_owner;
3155                int ret;
3156
3157                BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3158                ret = get_ref_objectid_v0(rc, path, extent_key,
3159                                          &ref_owner, NULL);
3160                if (ret < 0)
3161                        return ret;
3162                BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
3163                level = (int)ref_owner;
3164                /* FIXME: get real generation */
3165                generation = 0;
3166#else
3167                BUG();
3168#endif
3169        }
3170
3171        btrfs_release_path(path);
3172
3173        BUG_ON(level == -1);
3174
3175        block = kmalloc(sizeof(*block), GFP_NOFS);
3176        if (!block)
3177                return -ENOMEM;
3178
3179        block->bytenr = extent_key->objectid;
3180        block->key.objectid = extent_key->offset;
3181        block->key.offset = generation;
3182        block->level = level;
3183        block->key_ready = 0;
3184
3185        rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
3186        if (rb_node)
3187                backref_tree_panic(rb_node, -EEXIST, block->bytenr);
3188
3189        return 0;
3190}
3191
3192/*
3193 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3194 */
3195static int __add_tree_block(struct reloc_control *rc,
3196                            u64 bytenr, u32 blocksize,
3197                            struct rb_root *blocks)
3198{
3199        struct btrfs_path *path;
3200        struct btrfs_key key;
3201        int ret;
3202
3203        if (tree_block_processed(bytenr, blocksize, rc))
3204                return 0;
3205
3206        if (tree_search(blocks, bytenr))
3207                return 0;
3208
3209        path = btrfs_alloc_path();
3210        if (!path)
3211                return -ENOMEM;
3212
3213        key.objectid = bytenr;
3214        key.type = BTRFS_EXTENT_ITEM_KEY;
3215        key.offset = blocksize;
3216
3217        path->search_commit_root = 1;
3218        path->skip_locking = 1;
3219        ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3220        if (ret < 0)
3221                goto out;
3222        BUG_ON(ret);
3223
3224        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3225        ret = add_tree_block(rc, &key, path, blocks);
3226out:
3227        btrfs_free_path(path);
3228        return ret;
3229}
3230
3231/*
3232 * helper to check if the block use full backrefs for pointers in it
3233 */
3234static int block_use_full_backref(struct reloc_control *rc,
3235                                  struct extent_buffer *eb)
3236{
3237        u64 flags;
3238        int ret;
3239
3240        if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
3241            btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
3242                return 1;
3243
3244        ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
3245                                       eb->start, eb->len, NULL, &flags);
3246        BUG_ON(ret);
3247
3248        if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
3249                ret = 1;
3250        else
3251                ret = 0;
3252        return ret;
3253}
3254
3255static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3256                                    struct inode *inode, u64 ino)
3257{
3258        struct btrfs_key key;
3259        struct btrfs_path *path;
3260        struct btrfs_root *root = fs_info->tree_root;
3261        struct btrfs_trans_handle *trans;
3262        unsigned long nr;
3263        int ret = 0;
3264
3265        if (inode)
3266                goto truncate;
3267
3268        key.objectid = ino;
3269        key.type = BTRFS_INODE_ITEM_KEY;
3270        key.offset = 0;
3271
3272        inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3273        if (IS_ERR(inode) || is_bad_inode(inode)) {
3274                if (!IS_ERR(inode))
3275                        iput(inode);
3276                return -ENOENT;
3277        }
3278
3279truncate:
3280        path = btrfs_alloc_path();
3281        if (!path) {
3282                ret = -ENOMEM;
3283                goto out;
3284        }
3285
3286        trans = btrfs_join_transaction(root);
3287        if (IS_ERR(trans)) {
3288                btrfs_free_path(path);
3289                ret = PTR_ERR(trans);
3290                goto out;
3291        }
3292
3293        ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
3294
3295        btrfs_free_path(path);
3296        nr = trans->blocks_used;
3297        btrfs_end_transaction(trans, root);
3298        btrfs_btree_balance_dirty(root, nr);
3299out:
3300        iput(inode);
3301        return ret;
3302}
3303
3304/*
3305 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
3306 * this function scans fs tree to find blocks reference the data extent
3307 */
3308static int find_data_references(struct reloc_control *rc,
3309                                struct btrfs_key *extent_key,
3310                                struct extent_buffer *leaf,
3311                                struct btrfs_extent_data_ref *ref,
3312                                struct rb_root *blocks)
3313{
3314        struct btrfs_path *path;
3315        struct tree_block *block;
3316        struct btrfs_root *root;
3317        struct btrfs_file_extent_item *fi;
3318        struct rb_node *rb_node;
3319        struct btrfs_key key;
3320        u64 ref_root;
3321        u64 ref_objectid;
3322        u64 ref_offset;
3323        u32 ref_count;
3324        u32 nritems;
3325        int err = 0;
3326        int added = 0;
3327        int counted;
3328        int ret;
3329
3330        ref_root = btrfs_extent_data_ref_root(leaf, ref);
3331        ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
3332        ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
3333        ref_count = btrfs_extent_data_ref_count(leaf, ref);
3334
3335        /*
3336         * This is an extent belonging to the free space cache, lets just delete
3337         * it and redo the search.
3338         */
3339        if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3340                ret = delete_block_group_cache(rc->extent_root->fs_info,
3341                                               NULL, ref_objectid);
3342                if (ret != -ENOENT)
3343                        return ret;
3344                ret = 0;
3345        }
3346
3347        path = btrfs_alloc_path();
3348        if (!path)
3349                return -ENOMEM;
3350        path->reada = 1;
3351
3352        root = read_fs_root(rc->extent_root->fs_info, ref_root);
3353        if (IS_ERR(root)) {
3354                err = PTR_ERR(root);
3355                goto out;
3356        }
3357
3358        key.objectid = ref_objectid;
3359        key.type = BTRFS_EXTENT_DATA_KEY;
3360        if (ref_offset > ((u64)-1 << 32))
3361                key.offset = 0;
3362        else
3363                key.offset = ref_offset;
3364
3365        path->search_commit_root = 1;
3366        path->skip_locking = 1;
3367        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3368        if (ret < 0) {
3369                err = ret;
3370                goto out;
3371        }
3372
3373        leaf = path->nodes[0];
3374        nritems = btrfs_header_nritems(leaf);
3375        /*
3376         * the references in tree blocks that use full backrefs
3377         * are not counted in
3378         */
3379        if (block_use_full_backref(rc, leaf))
3380                counted = 0;
3381        else
3382                counted = 1;
3383        rb_node = tree_search(blocks, leaf->start);
3384        if (rb_node) {
3385                if (counted)
3386                        added = 1;
3387                else
3388                        path->slots[0] = nritems;
3389        }
3390
3391        while (ref_count > 0) {
3392                while (path->slots[0] >= nritems) {
3393                        ret = btrfs_next_leaf(root, path);
3394                        if (ret < 0) {
3395                                err = ret;
3396                                goto out;
3397                        }
3398                        if (ret > 0) {
3399                                WARN_ON(1);
3400                                goto out;
3401                        }
3402
3403                        leaf = path->nodes[0];
3404                        nritems = btrfs_header_nritems(leaf);
3405                        added = 0;
3406
3407                        if (block_use_full_backref(rc, leaf))
3408                                counted = 0;
3409                        else
3410                                counted = 1;
3411                        rb_node = tree_search(blocks, leaf->start);
3412                        if (rb_node) {
3413                                if (counted)
3414                                        added = 1;
3415                                else
3416                                        path->slots[0] = nritems;
3417                        }
3418                }
3419
3420                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3421                if (key.objectid != ref_objectid ||
3422                    key.type != BTRFS_EXTENT_DATA_KEY) {
3423                        WARN_ON(1);
3424                        break;
3425                }
3426
3427                fi = btrfs_item_ptr(leaf, path->slots[0],
3428                                    struct btrfs_file_extent_item);
3429
3430                if (btrfs_file_extent_type(leaf, fi) ==
3431                    BTRFS_FILE_EXTENT_INLINE)
3432                        goto next;
3433
3434                if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3435                    extent_key->objectid)
3436                        goto next;
3437
3438                key.offset -= btrfs_file_extent_offset(leaf, fi);
3439                if (key.offset != ref_offset)
3440                        goto next;
3441
3442                if (counted)
3443                        ref_count--;
3444                if (added)
3445                        goto next;
3446
3447                if (!tree_block_processed(leaf->start, leaf->len, rc)) {
3448                        block = kmalloc(sizeof(*block), GFP_NOFS);
3449                        if (!block) {
3450                                err = -ENOMEM;
3451                                break;
3452                        }
3453                        block->bytenr = leaf->start;
3454                        btrfs_item_key_to_cpu(leaf, &block->key, 0);
3455                        block->level = 0;
3456                        block->key_ready = 1;
3457                        rb_node = tree_insert(blocks, block->bytenr,
3458                                              &block->rb_node);
3459                        if (rb_node)
3460                                backref_tree_panic(rb_node, -EEXIST,
3461                                                   block->bytenr);
3462                }
3463                if (counted)
3464                        added = 1;
3465                else
3466                        path->slots[0] = nritems;
3467next:
3468                path->slots[0]++;
3469
3470        }
3471out:
3472        btrfs_free_path(path);
3473        return err;
3474}
3475
3476/*
3477 * hepler to find all tree blocks that reference a given data extent
3478 */
3479static noinline_for_stack
3480int add_data_references(struct reloc_control *rc,
3481                        struct btrfs_key *extent_key,
3482                        struct btrfs_path *path,
3483                        struct rb_root *blocks)
3484{
3485        struct btrfs_key key;
3486        struct extent_buffer *eb;
3487        struct btrfs_extent_data_ref *dref;
3488        struct btrfs_extent_inline_ref *iref;
3489        unsigned long ptr;
3490        unsigned long end;
3491        u32 blocksize = btrfs_level_size(rc->extent_root, 0);
3492        int ret;
3493        int err = 0;
3494
3495        eb = path->nodes[0];
3496        ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3497        end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3498#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3499        if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3500                ptr = end;
3501        else
3502#endif
3503                ptr += sizeof(struct btrfs_extent_item);
3504
3505        while (ptr < end) {
3506                iref = (struct btrfs_extent_inline_ref *)ptr;
3507                key.type = btrfs_extent_inline_ref_type(eb, iref);
3508                if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3509                        key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3510                        ret = __add_tree_block(rc, key.offset, blocksize,
3511                                               blocks);
3512                } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3513                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3514                        ret = find_data_references(rc, extent_key,
3515                                                   eb, dref, blocks);
3516                } else {
3517                        BUG();
3518                }
3519                ptr += btrfs_extent_inline_ref_size(key.type);
3520        }
3521        WARN_ON(ptr > end);
3522
3523        while (1) {
3524                cond_resched();
3525                eb = path->nodes[0];
3526                if (path->slots[0] >= btrfs_header_nritems(eb)) {
3527                        ret = btrfs_next_leaf(rc->extent_root, path);
3528                        if (ret < 0) {
3529                                err = ret;
3530                                break;
3531                        }
3532                        if (ret > 0)
3533                                break;
3534                        eb = path->nodes[0];
3535                }
3536
3537                btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3538                if (key.objectid != extent_key->objectid)
3539                        break;
3540
3541#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3542                if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3543                    key.type == BTRFS_EXTENT_REF_V0_KEY) {
3544#else
3545                BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3546                if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3547#endif
3548                        ret = __add_tree_block(rc, key.offset, blocksize,
3549                                               blocks);
3550                } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3551                        dref = btrfs_item_ptr(eb, path->slots[0],
3552                                              struct btrfs_extent_data_ref);
3553                        ret = find_data_references(rc, extent_key,
3554                                                   eb, dref, blocks);
3555                } else {
3556                        ret = 0;
3557                }
3558                if (ret) {
3559                        err = ret;
3560                        break;
3561                }
3562                path->slots[0]++;
3563        }
3564        btrfs_release_path(path);
3565        if (err)
3566                free_block_list(blocks);
3567        return err;
3568}
3569
3570/*
3571 * hepler to find next unprocessed extent
3572 */
3573static noinline_for_stack
3574int find_next_extent(struct btrfs_trans_handle *trans,
3575                     struct reloc_control *rc, struct btrfs_path *path,
3576                     struct btrfs_key *extent_key)
3577{
3578        struct btrfs_key key;
3579        struct extent_buffer *leaf;
3580        u64 start, end, last;
3581        int ret;
3582
3583        last = rc->block_group->key.objectid + rc->block_group->key.offset;
3584        while (1) {
3585                cond_resched();
3586                if (rc->search_start >= last) {
3587                        ret = 1;
3588                        break;
3589                }
3590
3591                key.objectid = rc->search_start;
3592                key.type = BTRFS_EXTENT_ITEM_KEY;
3593                key.offset = 0;
3594
3595                path->search_commit_root = 1;
3596                path->skip_locking = 1;
3597                ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3598                                        0, 0);
3599                if (ret < 0)
3600                        break;
3601next:
3602                leaf = path->nodes[0];
3603                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3604                        ret = btrfs_next_leaf(rc->extent_root, path);
3605                        if (ret != 0)
3606                                break;
3607                        leaf = path->nodes[0];
3608                }
3609
3610                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3611                if (key.objectid >= last) {
3612                        ret = 1;
3613                        break;
3614                }
3615
3616                if (key.type != BTRFS_EXTENT_ITEM_KEY ||
3617                    key.objectid + key.offset <= rc->search_start) {
3618                        path->slots[0]++;
3619                        goto next;
3620                }
3621
3622                ret = find_first_extent_bit(&rc->processed_blocks,
3623                                            key.objectid, &start, &end,
3624                                            EXTENT_DIRTY, NULL);
3625
3626                if (ret == 0 && start <= key.objectid) {
3627                        btrfs_release_path(path);
3628                        rc->search_start = end + 1;
3629                } else {
3630                        rc->search_start = key.objectid + key.offset;
3631                        memcpy(extent_key, &key, sizeof(key));
3632                        return 0;
3633                }
3634        }
3635        btrfs_release_path(path);
3636        return ret;
3637}
3638
3639static void set_reloc_control(struct reloc_control *rc)
3640{
3641        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3642
3643        mutex_lock(&fs_info->reloc_mutex);
3644        fs_info->reloc_ctl = rc;
3645        mutex_unlock(&fs_info->reloc_mutex);
3646}
3647
3648static void unset_reloc_control(struct reloc_control *rc)
3649{
3650        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3651
3652        mutex_lock(&fs_info->reloc_mutex);
3653        fs_info->reloc_ctl = NULL;
3654        mutex_unlock(&fs_info->reloc_mutex);
3655}
3656
3657static int check_extent_flags(u64 flags)
3658{
3659        if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3660            (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3661                return 1;
3662        if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3663            !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3664                return 1;
3665        if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3666            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3667                return 1;
3668        return 0;
3669}
3670
3671static noinline_for_stack
3672int prepare_to_relocate(struct reloc_control *rc)
3673{
3674        struct btrfs_trans_handle *trans;
3675        int ret;
3676
3677        rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
3678                                              BTRFS_BLOCK_RSV_TEMP);
3679        if (!rc->block_rsv)
3680                return -ENOMEM;
3681
3682        /*
3683         * reserve some space for creating reloc trees.
3684         * btrfs_init_reloc_root will use them when there
3685         * is no reservation in transaction handle.
3686         */
3687        ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3688                                  rc->extent_root->nodesize * 256);
3689        if (ret)
3690                return ret;
3691
3692        memset(&rc->cluster, 0, sizeof(rc->cluster));
3693        rc->search_start = rc->block_group->key.objectid;
3694        rc->extents_found = 0;
3695        rc->nodes_relocated = 0;
3696        rc->merging_rsv_size = 0;
3697
3698        rc->create_reloc_tree = 1;
3699        set_reloc_control(rc);
3700
3701        trans = btrfs_join_transaction(rc->extent_root);
3702        BUG_ON(IS_ERR(trans));
3703        btrfs_commit_transaction(trans, rc->extent_root);
3704        return 0;
3705}
3706
3707static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3708{
3709        struct rb_root blocks = RB_ROOT;
3710        struct btrfs_key key;
3711        struct btrfs_trans_handle *trans = NULL;
3712        struct btrfs_path *path;
3713        struct btrfs_extent_item *ei;
3714        unsigned long nr;
3715        u64 flags;
3716        u32 item_size;
3717        int ret;
3718        int err = 0;
3719        int progress = 0;
3720
3721        path = btrfs_alloc_path();
3722        if (!path)
3723                return -ENOMEM;
3724        path->reada = 1;
3725
3726        ret = prepare_to_relocate(rc);
3727        if (ret) {
3728                err = ret;
3729                goto out_free;
3730        }
3731
3732        while (1) {
3733                progress++;
3734                trans = btrfs_start_transaction(rc->extent_root, 0);
3735                BUG_ON(IS_ERR(trans));
3736restart:
3737                if (update_backref_cache(trans, &rc->backref_cache)) {
3738                        btrfs_end_transaction(trans, rc->extent_root);
3739                        continue;
3740                }
3741
3742                ret = find_next_extent(trans, rc, path, &key);
3743                if (ret < 0)
3744                        err = ret;
3745                if (ret != 0)
3746                        break;
3747
3748                rc->extents_found++;
3749
3750                ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3751                                    struct btrfs_extent_item);
3752                item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
3753                if (item_size >= sizeof(*ei)) {
3754                        flags = btrfs_extent_flags(path->nodes[0], ei);
3755                        ret = check_extent_flags(flags);
3756                        BUG_ON(ret);
3757
3758                } else {
3759#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3760                        u64 ref_owner;
3761                        int path_change = 0;
3762
3763                        BUG_ON(item_size !=
3764                               sizeof(struct btrfs_extent_item_v0));
3765                        ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
3766                                                  &path_change);
3767                        if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
3768                                flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
3769                        else
3770                                flags = BTRFS_EXTENT_FLAG_DATA;
3771
3772                        if (path_change) {
3773                                btrfs_release_path(path);
3774
3775                                path->search_commit_root = 1;
3776                                path->skip_locking = 1;
3777                                ret = btrfs_search_slot(NULL, rc->extent_root,
3778                                                        &key, path, 0, 0);
3779                                if (ret < 0) {
3780                                        err = ret;
3781                                        break;
3782                                }
3783                                BUG_ON(ret > 0);
3784                        }
3785#else
3786                        BUG();
3787#endif
3788                }
3789
3790                if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3791                        ret = add_tree_block(rc, &key, path, &blocks);
3792                } else if (rc->stage == UPDATE_DATA_PTRS &&
3793                           (flags & BTRFS_EXTENT_FLAG_DATA)) {
3794                        ret = add_data_references(rc, &key, path, &blocks);
3795                } else {
3796                        btrfs_release_path(path);
3797                        ret = 0;
3798                }
3799                if (ret < 0) {
3800                        err = ret;
3801                        break;
3802                }
3803
3804                if (!RB_EMPTY_ROOT(&blocks)) {
3805                        ret = relocate_tree_blocks(trans, rc, &blocks);
3806                        if (ret < 0) {
3807                                if (ret != -EAGAIN) {
3808                                        err = ret;
3809                                        break;
3810                                }
3811                                rc->extents_found--;
3812                                rc->search_start = key.objectid;
3813                        }
3814                }
3815
3816                ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
3817                if (ret < 0) {
3818                        if (ret != -ENOSPC) {
3819                                err = ret;
3820                                WARN_ON(1);
3821                                break;
3822                        }
3823                        rc->commit_transaction = 1;
3824                }
3825
3826                if (rc->commit_transaction) {
3827                        rc->commit_transaction = 0;
3828                        ret = btrfs_commit_transaction(trans, rc->extent_root);
3829                        BUG_ON(ret);
3830                } else {
3831                        nr = trans->blocks_used;
3832                        btrfs_end_transaction_throttle(trans, rc->extent_root);
3833                        btrfs_btree_balance_dirty(rc->extent_root, nr);
3834                }
3835                trans = NULL;
3836
3837                if (rc->stage == MOVE_DATA_EXTENTS &&
3838                    (flags & BTRFS_EXTENT_FLAG_DATA)) {
3839                        rc->found_file_extent = 1;
3840                        ret = relocate_data_extent(rc->data_inode,
3841                                                   &key, &rc->cluster);
3842                        if (ret < 0) {
3843                                err = ret;
3844                                break;
3845                        }
3846                }
3847        }
3848        if (trans && progress && err == -ENOSPC) {
3849                ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
3850                                              rc->block_group->flags);
3851                if (ret == 0) {
3852                        err = 0;
3853                        progress = 0;
3854                        goto restart;
3855                }
3856        }
3857
3858        btrfs_release_path(path);
3859        clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3860                          GFP_NOFS);
3861
3862        if (trans) {
3863                nr = trans->blocks_used;
3864                btrfs_end_transaction_throttle(trans, rc->extent_root);
3865                btrfs_btree_balance_dirty(rc->extent_root, nr);
3866        }
3867
3868        if (!err) {
3869                ret = relocate_file_extent_cluster(rc->data_inode,
3870                                                   &rc->cluster);
3871                if (ret < 0)
3872                        err = ret;
3873        }
3874
3875        rc->create_reloc_tree = 0;
3876        set_reloc_control(rc);
3877
3878        backref_cache_cleanup(&rc->backref_cache);
3879        btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3880
3881        err = prepare_to_merge(rc, err);
3882
3883        merge_reloc_roots(rc);
3884
3885        rc->merge_reloc_tree = 0;
3886        unset_reloc_control(rc);
3887        btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3888
3889        /* get rid of pinned extents */
3890        trans = btrfs_join_transaction(rc->extent_root);
3891        if (IS_ERR(trans))
3892                err = PTR_ERR(trans);
3893        else
3894                btrfs_commit_transaction(trans, rc->extent_root);
3895out_free:
3896        btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3897        btrfs_free_path(path);
3898        return err;
3899}
3900
3901static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3902                                 struct btrfs_root *root, u64 objectid)
3903{
3904        struct btrfs_path *path;
3905        struct btrfs_inode_item *item;
3906        struct extent_buffer *leaf;
3907        int ret;
3908
3909        path = btrfs_alloc_path();
3910        if (!path)
3911                return -ENOMEM;
3912
3913        ret = btrfs_insert_empty_inode(trans, root, path, objectid);
3914        if (ret)
3915                goto out;
3916
3917        leaf = path->nodes[0];
3918        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3919        memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
3920        btrfs_set_inode_generation(leaf, item, 1);
3921        btrfs_set_inode_size(leaf, item, 0);
3922        btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3923        btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3924                                          BTRFS_INODE_PREALLOC);
3925        btrfs_mark_buffer_dirty(leaf);
3926        btrfs_release_path(path);
3927out:
3928        btrfs_free_path(path);
3929        return ret;
3930}
3931
3932/*
3933 * helper to create inode for data relocation.
3934 * the inode is in data relocation tree and its link count is 0
3935 */
3936static noinline_for_stack
3937struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3938                                 struct btrfs_block_group_cache *group)
3939{
3940        struct inode *inode = NULL;
3941        struct btrfs_trans_handle *trans;
3942        struct btrfs_root *root;
3943        struct btrfs_key key;
3944        unsigned long nr;
3945        u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
3946        int err = 0;
3947
3948        root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
3949        if (IS_ERR(root))
3950                return ERR_CAST(root);
3951
3952        trans = btrfs_start_transaction(root, 6);
3953        if (IS_ERR(trans))
3954                return ERR_CAST(trans);
3955
3956        err = btrfs_find_free_objectid(root, &objectid);
3957        if (err)
3958                goto out;
3959
3960        err = __insert_orphan_inode(trans, root, objectid);
3961        BUG_ON(err);
3962
3963        key.objectid = objectid;
3964        key.type = BTRFS_INODE_ITEM_KEY;
3965        key.offset = 0;
3966        inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
3967        BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
3968        BTRFS_I(inode)->index_cnt = group->key.objectid;
3969
3970        err = btrfs_orphan_add(trans, inode);
3971out:
3972        nr = trans->blocks_used;
3973        btrfs_end_transaction(trans, root);
3974        btrfs_btree_balance_dirty(root, nr);
3975        if (err) {
3976                if (inode)
3977                        iput(inode);
3978                inode = ERR_PTR(err);
3979        }
3980        return inode;
3981}
3982
3983static struct reloc_control *alloc_reloc_control(void)
3984{
3985        struct reloc_control *rc;
3986
3987        rc = kzalloc(sizeof(*rc), GFP_NOFS);
3988        if (!rc)
3989                return NULL;
3990
3991        INIT_LIST_HEAD(&rc->reloc_roots);
3992        backref_cache_init(&rc->backref_cache);
3993        mapping_tree_init(&rc->reloc_root_tree);
3994        extent_io_tree_init(&rc->processed_blocks, NULL);
3995        return rc;
3996}
3997
3998/*
3999 * function to relocate all extents in a block group.
4000 */
4001int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4002{
4003        struct btrfs_fs_info *fs_info = extent_root->fs_info;
4004        struct reloc_control *rc;
4005        struct inode *inode;
4006        struct btrfs_path *path;
4007        int ret;
4008        int rw = 0;
4009        int err = 0;
4010
4011        rc = alloc_reloc_control();
4012        if (!rc)
4013                return -ENOMEM;
4014
4015        rc->extent_root = extent_root;
4016
4017        rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
4018        BUG_ON(!rc->block_group);
4019
4020        if (!rc->block_group->ro) {
4021                ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
4022                if (ret) {
4023                        err = ret;
4024                        goto out;
4025                }
4026                rw = 1;
4027        }
4028
4029        path = btrfs_alloc_path();
4030        if (!path) {
4031                err = -ENOMEM;
4032                goto out;
4033        }
4034
4035        inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
4036                                        path);
4037        btrfs_free_path(path);
4038
4039        if (!IS_ERR(inode))
4040                ret = delete_block_group_cache(fs_info, inode, 0);
4041        else
4042                ret = PTR_ERR(inode);
4043
4044        if (ret && ret != -ENOENT) {
4045                err = ret;
4046                goto out;
4047        }
4048
4049        rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4050        if (IS_ERR(rc->data_inode)) {
4051                err = PTR_ERR(rc->data_inode);
4052                rc->data_inode = NULL;
4053                goto out;
4054        }
4055
4056        printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
4057               (unsigned long long)rc->block_group->key.objectid,
4058               (unsigned long long)rc->block_group->flags);
4059
4060        btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
4061        btrfs_wait_ordered_extents(fs_info->tree_root, 0);
4062
4063        while (1) {
4064                mutex_lock(&fs_info->cleaner_mutex);
4065
4066                btrfs_clean_old_snapshots(fs_info->tree_root);
4067                ret = relocate_block_group(rc);
4068
4069                mutex_unlock(&fs_info->cleaner_mutex);
4070                if (ret < 0) {
4071                        err = ret;
4072                        goto out;
4073                }
4074
4075                if (rc->extents_found == 0)
4076                        break;
4077
4078                printk(KERN_INFO "btrfs: found %llu extents\n",
4079                        (unsigned long long)rc->extents_found);
4080
4081                if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
4082                        btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
4083                        invalidate_mapping_pages(rc->data_inode->i_mapping,
4084                                                 0, -1);
4085                        rc->stage = UPDATE_DATA_PTRS;
4086                }
4087        }
4088
4089        filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
4090                                     rc->block_group->key.objectid,
4091                                     rc->block_group->key.objectid +
4092                                     rc->block_group->key.offset - 1);
4093
4094        WARN_ON(rc->block_group->pinned > 0);
4095        WARN_ON(rc->block_group->reserved > 0);
4096        WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
4097out:
4098        if (err && rw)
4099                btrfs_set_block_group_rw(extent_root, rc->block_group);
4100        iput(rc->data_inode);
4101        btrfs_put_block_group(rc->block_group);
4102        kfree(rc);
4103        return err;
4104}
4105
4106static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4107{
4108        struct btrfs_trans_handle *trans;
4109        int ret, err;
4110
4111        trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
4112        if (IS_ERR(trans))
4113                return PTR_ERR(trans);
4114
4115        memset(&root->root_item.drop_progress, 0,
4116                sizeof(root->root_item.drop_progress));
4117        root->root_item.drop_level = 0;
4118        btrfs_set_root_refs(&root->root_item, 0);
4119        ret = btrfs_update_root(trans, root->fs_info->tree_root,
4120                                &root->root_key, &root->root_item);
4121
4122        err = btrfs_end_transaction(trans, root->fs_info->tree_root);
4123        if (err)
4124                return err;
4125        return ret;
4126}
4127
4128/*
4129 * recover relocation interrupted by system crash.
4130 *
4131 * this function resumes merging reloc trees with corresponding fs trees.
4132 * this is important for keeping the sharing of tree blocks
4133 */
4134int btrfs_recover_relocation(struct btrfs_root *root)
4135{
4136        LIST_HEAD(reloc_roots);
4137        struct btrfs_key key;
4138        struct btrfs_root *fs_root;
4139        struct btrfs_root *reloc_root;
4140        struct btrfs_path *path;
4141        struct extent_buffer *leaf;
4142        struct reloc_control *rc = NULL;
4143        struct btrfs_trans_handle *trans;
4144        int ret;
4145        int err = 0;
4146
4147        path = btrfs_alloc_path();
4148        if (!path)
4149                return -ENOMEM;
4150        path->reada = -1;
4151
4152        key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4153        key.type = BTRFS_ROOT_ITEM_KEY;
4154        key.offset = (u64)-1;
4155
4156        while (1) {
4157                ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
4158                                        path, 0, 0);
4159                if (ret < 0) {
4160                        err = ret;
4161                        goto out;
4162                }
4163                if (ret > 0) {
4164                        if (path->slots[0] == 0)
4165                                break;
4166                        path->slots[0]--;
4167                }
4168                leaf = path->nodes[0];
4169                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4170                btrfs_release_path(path);
4171
4172                if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4173                    key.type != BTRFS_ROOT_ITEM_KEY)
4174                        break;
4175
4176                reloc_root = btrfs_read_fs_root_no_radix(root, &key);
4177                if (IS_ERR(reloc_root)) {
4178                        err = PTR_ERR(reloc_root);
4179                        goto out;
4180                }
4181
4182                list_add(&reloc_root->root_list, &reloc_roots);
4183
4184                if (btrfs_root_refs(&reloc_root->root_item) > 0) {
4185                        fs_root = read_fs_root(root->fs_info,
4186                                               reloc_root->root_key.offset);
4187                        if (IS_ERR(fs_root)) {
4188                                ret = PTR_ERR(fs_root);
4189                                if (ret != -ENOENT) {
4190                                        err = ret;
4191                                        goto out;
4192                                }
4193                                ret = mark_garbage_root(reloc_root);
4194                                if (ret < 0) {
4195                                        err = ret;
4196                                        goto out;
4197                                }
4198                        }
4199                }
4200
4201                if (key.offset == 0)
4202                        break;
4203
4204                key.offset--;
4205        }
4206        btrfs_release_path(path);
4207
4208        if (list_empty(&reloc_roots))
4209                goto out;
4210
4211        rc = alloc_reloc_control();
4212        if (!rc) {
4213                err = -ENOMEM;
4214                goto out;
4215        }
4216
4217        rc->extent_root = root->fs_info->extent_root;
4218
4219        set_reloc_control(rc);
4220
4221        trans = btrfs_join_transaction(rc->extent_root);
4222        if (IS_ERR(trans)) {
4223                unset_reloc_control(rc);
4224                err = PTR_ERR(trans);
4225                goto out_free;
4226        }
4227
4228        rc->merge_reloc_tree = 1;
4229
4230        while (!list_empty(&reloc_roots)) {
4231                reloc_root = list_entry(reloc_roots.next,
4232                                        struct btrfs_root, root_list);
4233                list_del(&reloc_root->root_list);
4234
4235                if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4236                        list_add_tail(&reloc_root->root_list,
4237                                      &rc->reloc_roots);
4238                        continue;
4239                }
4240
4241                fs_root = read_fs_root(root->fs_info,
4242                                       reloc_root->root_key.offset);
4243                if (IS_ERR(fs_root)) {
4244                        err = PTR_ERR(fs_root);
4245                        goto out_free;
4246                }
4247
4248                err = __add_reloc_root(reloc_root);
4249                BUG_ON(err < 0); /* -ENOMEM or logic error */
4250                fs_root->reloc_root = reloc_root;
4251        }
4252
4253        err = btrfs_commit_transaction(trans, rc->extent_root);
4254        if (err)
4255                goto out_free;
4256
4257        merge_reloc_roots(rc);
4258
4259        unset_reloc_control(rc);
4260
4261        trans = btrfs_join_transaction(rc->extent_root);
4262        if (IS_ERR(trans))
4263                err = PTR_ERR(trans);
4264        else
4265                err = btrfs_commit_transaction(trans, rc->extent_root);
4266out_free:
4267        kfree(rc);
4268out:
4269        while (!list_empty(&reloc_roots)) {
4270                reloc_root = list_entry(reloc_roots.next,
4271                                        struct btrfs_root, root_list);
4272                list_del(&reloc_root->root_list);
4273                free_extent_buffer(reloc_root->node);
4274                free_extent_buffer(reloc_root->commit_root);
4275                kfree(reloc_root);
4276        }
4277        btrfs_free_path(path);
4278
4279        if (err == 0) {
4280                /* cleanup orphan inode in data relocation tree */
4281                fs_root = read_fs_root(root->fs_info,
4282                                       BTRFS_DATA_RELOC_TREE_OBJECTID);
4283                if (IS_ERR(fs_root))
4284                        err = PTR_ERR(fs_root);
4285                else
4286                        err = btrfs_orphan_cleanup(fs_root);
4287        }
4288        return err;
4289}
4290
4291/*
4292 * helper to add ordered checksum for data relocation.
4293 *
4294 * cloning checksum properly handles the nodatasum extents.
4295 * it also saves CPU time to re-calculate the checksum.
4296 */
4297int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4298{
4299        struct btrfs_ordered_sum *sums;
4300        struct btrfs_sector_sum *sector_sum;
4301        struct btrfs_ordered_extent *ordered;
4302        struct btrfs_root *root = BTRFS_I(inode)->root;
4303        size_t offset;
4304        int ret;
4305        u64 disk_bytenr;
4306        LIST_HEAD(list);
4307
4308        ordered = btrfs_lookup_ordered_extent(inode, file_pos);
4309        BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
4310
4311        disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
4312        ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
4313                                       disk_bytenr + len - 1, &list, 0);
4314        if (ret)
4315                goto out;
4316
4317        while (!list_empty(&list)) {
4318                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4319                list_del_init(&sums->list);
4320
4321                sector_sum = sums->sums;
4322                sums->bytenr = ordered->start;
4323
4324                offset = 0;
4325                while (offset < sums->len) {
4326                        sector_sum->bytenr += ordered->start - disk_bytenr;
4327                        sector_sum++;
4328                        offset += root->sectorsize;
4329                }
4330
4331                btrfs_add_ordered_sum(inode, ordered, sums);
4332        }
4333out:
4334        btrfs_put_ordered_extent(ordered);
4335        return ret;
4336}
4337
4338void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4339                           struct btrfs_root *root, struct extent_buffer *buf,
4340                           struct extent_buffer *cow)
4341{
4342        struct reloc_control *rc;
4343        struct backref_node *node;
4344        int first_cow = 0;
4345        int level;
4346        int ret;
4347
4348        rc = root->fs_info->reloc_ctl;
4349        if (!rc)
4350                return;
4351
4352        BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4353               root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4354
4355        level = btrfs_header_level(buf);
4356        if (btrfs_header_generation(buf) <=
4357            btrfs_root_last_snapshot(&root->root_item))
4358                first_cow = 1;
4359
4360        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4361            rc->create_reloc_tree) {
4362                WARN_ON(!first_cow && level == 0);
4363
4364                node = rc->backref_cache.path[level];
4365                BUG_ON(node->bytenr != buf->start &&
4366                       node->new_bytenr != buf->start);
4367
4368                drop_node_buffer(node);
4369                extent_buffer_get(cow);
4370                node->eb = cow;
4371                node->new_bytenr = cow->start;
4372
4373                if (!node->pending) {
4374                        list_move_tail(&node->list,
4375                                       &rc->backref_cache.pending[level]);
4376                        node->pending = 1;
4377                }
4378
4379                if (first_cow)
4380                        __mark_block_processed(rc, node);
4381
4382                if (first_cow && level > 0)
4383                        rc->nodes_relocated += buf->len;
4384        }
4385
4386        if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
4387                ret = replace_file_extents(trans, rc, root, cow);
4388                BUG_ON(ret);
4389        }
4390}
4391
4392/*
4393 * called before creating snapshot. it calculates metadata reservation
4394 * requried for relocating tree blocks in the snapshot
4395 */
4396void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
4397                              struct btrfs_pending_snapshot *pending,
4398                              u64 *bytes_to_reserve)
4399{
4400        struct btrfs_root *root;
4401        struct reloc_control *rc;
4402
4403        root = pending->root;
4404        if (!root->reloc_root)
4405                return;
4406
4407        rc = root->fs_info->reloc_ctl;
4408        if (!rc->merge_reloc_tree)
4409                return;
4410
4411        root = root->reloc_root;
4412        BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4413        /*
4414         * relocation is in the stage of merging trees. the space
4415         * used by merging a reloc tree is twice the size of
4416         * relocated tree nodes in the worst case. half for cowing
4417         * the reloc tree, half for cowing the fs tree. the space
4418         * used by cowing the reloc tree will be freed after the
4419         * tree is dropped. if we create snapshot, cowing the fs
4420         * tree may use more space than it frees. so we need
4421         * reserve extra space.
4422         */
4423        *bytes_to_reserve += rc->nodes_relocated;
4424}
4425
4426/*
4427 * called after snapshot is created. migrate block reservation
4428 * and create reloc root for the newly created snapshot
4429 */
4430int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4431                               struct btrfs_pending_snapshot *pending)
4432{
4433        struct btrfs_root *root = pending->root;
4434        struct btrfs_root *reloc_root;
4435        struct btrfs_root *new_root;
4436        struct reloc_control *rc;
4437        int ret;
4438
4439        if (!root->reloc_root)
4440                return 0;
4441
4442        rc = root->fs_info->reloc_ctl;
4443        rc->merging_rsv_size += rc->nodes_relocated;
4444
4445        if (rc->merge_reloc_tree) {
4446                ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4447                                              rc->block_rsv,
4448                                              rc->nodes_relocated);
4449                if (ret)
4450                        return ret;
4451        }
4452
4453        new_root = pending->snap;
4454        reloc_root = create_reloc_root(trans, root->reloc_root,
4455                                       new_root->root_key.objectid);
4456        if (IS_ERR(reloc_root))
4457                return PTR_ERR(reloc_root);
4458
4459        ret = __add_reloc_root(reloc_root);
4460        BUG_ON(ret < 0);
4461        new_root->reloc_root = reloc_root;
4462
4463        if (rc->create_reloc_tree)
4464                ret = clone_backref_node(trans, rc, root, reloc_root);
4465        return ret;
4466}
4467