linux/fs/btrfs/relocation.c
   1/*
   2 * Copyright (C) 2009 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 02111-1307, USA.
  17 */
  18
  19#include <linux/sched.h>
  20#include <linux/pagemap.h>
  21#include <linux/writeback.h>
  22#include <linux/blkdev.h>
  23#include <linux/rbtree.h>
  24#include <linux/slab.h>
  25#include "ctree.h"
  26#include "disk-io.h"
  27#include "transaction.h"
  28#include "volumes.h"
  29#include "locking.h"
  30#include "btrfs_inode.h"
  31#include "async-thread.h"
  32#include "free-space-cache.h"
  33#include "inode-map.h"
  34
  35/*
  36 * backref_node, mapping_node and tree_block start with this
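     * struct, so the generic tree_insert()/tree_search() helpers below can
     * operate on any of them through rb_entry() on struct tree_entry,
     * keyed by bytenr.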
  37 */
  38struct tree_entry {
  39        struct rb_node rb_node;
  40        u64 bytenr;
  41};
  42
  43/*
  44 * represents a tree block in the backref cache
  45 */
  46struct backref_node {
  47        struct rb_node rb_node;
  48        u64 bytenr;
  49
  50        u64 new_bytenr;
  51        /* objectid of the tree block owner; may not be up to date */
  52        u64 owner;
  53        /* link to pending, changed or detached list */
  54        struct list_head list;
  55        /* list of upper level blocks that reference this block */
  56        struct list_head upper;
  57        /* list of child blocks in the cache */
  58        struct list_head lower;
  59        /* NULL if this node is not a tree root */
  60        struct btrfs_root *root;
  61        /* extent buffer obtained by COWing the block */
  62        struct extent_buffer *eb;
  63        /* level of tree block */
  64        unsigned int level:8;
  65        /* is the block in a non-reference-counted tree */
  66        unsigned int cowonly:1;
  67        /* 1 if no child node in the cache */
  68        unsigned int lowest:1;
  69        /* is the extent buffer locked */
  70        unsigned int locked:1;
  71        /* has the block been processed */
  72        unsigned int processed:1;
  73        /* have backrefs of this block been checked */
  74        unsigned int checked:1;
  75        /*
  76         * 1 if corresponding block has been cowed but some upper
  77         * level block pointers may not point to the new location
  78         */
  79        unsigned int pending:1;
  80        /*
  81         * 1 if the backref node isn't connected to any other
  82         * backref node.
  83         */
  84        unsigned int detached:1;
  85};
  86
  87/*
  88 * represents a block pointer in the backref cache
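     *
     * node[LOWER] is the child block being pointed to and node[UPPER] the
     * parent block that contains the pointer.  list[LOWER] links the edge
     * into the child's ->upper list, list[UPPER] into the parent's ->lower
     * list (or a temporary list while the cache is being built).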
  89 */
  90struct backref_edge {
  91        struct list_head list[2];
  92        struct backref_node *node[2];
  93};
  94
  95#define LOWER   0
  96#define UPPER   1
  97#define RELOCATION_RESERVED_NODES       256
  98
  99struct backref_cache {
 100        /* red black tree of all backref nodes in the cache */
 101        struct rb_root rb_root;
 102        /* for passing backref nodes to btrfs_reloc_cow_block */
 103        struct backref_node *path[BTRFS_MAX_LEVEL];
 104        /*
 105         * list of blocks that have been cowed but some block
 106         * pointers in upper level blocks may not reflect the
 107         * new location
 108         */
 109        struct list_head pending[BTRFS_MAX_LEVEL];
 110        /* list of backref nodes with no child node */
 111        struct list_head leaves;
 112        /* list of blocks that have been cowed in current transaction */
 113        struct list_head changed;
 114        /* list of detached backref nodes */
 115        struct list_head detached;
 116
 117        u64 last_trans;
 118
 119        int nr_nodes;
 120        int nr_edges;
 121};
 122
 123/*
 124 * map address of tree root to tree
 125 */
 126struct mapping_node {
 127        struct rb_node rb_node;
 128        u64 bytenr;
 129        void *data;
 130};
 131
 132struct mapping_tree {
 133        struct rb_root rb_root;
 134        spinlock_t lock;
 135};
 136
 137/*
 138 * represents a tree block to be processed
 139 */
 140struct tree_block {
 141        struct rb_node rb_node;
 142        u64 bytenr;
 143        struct btrfs_key key;
 144        unsigned int level:8;
 145        unsigned int key_ready:1;
 146};
 147
 148#define MAX_EXTENTS 128
 149
 150struct file_extent_cluster {
 151        u64 start;
 152        u64 end;
 153        u64 boundary[MAX_EXTENTS];
 154        unsigned int nr;
 155};
 156
 157struct reloc_control {
 158        /* block group to relocate */
 159        struct btrfs_block_group_cache *block_group;
 160        /* extent tree */
 161        struct btrfs_root *extent_root;
 162        /* inode for moving data */
 163        struct inode *data_inode;
 164
 165        struct btrfs_block_rsv *block_rsv;
 166
 167        struct backref_cache backref_cache;
 168
 169        struct file_extent_cluster cluster;
 170        /* tree blocks have been processed */
 171        struct extent_io_tree processed_blocks;
 172        /* map start of tree root to corresponding reloc tree */
 173        struct mapping_tree reloc_root_tree;
 174        /* list of reloc trees */
 175        struct list_head reloc_roots;
 176        /* size of metadata reservation for merging reloc trees */
 177        u64 merging_rsv_size;
 178        /* size of relocated tree nodes */
 179        u64 nodes_relocated;
 180        /* reserved size for block group relocation */
 181        u64 reserved_bytes;
 182
 183        u64 search_start;
 184        u64 extents_found;
 185
 186        unsigned int stage:8;
 187        unsigned int create_reloc_tree:1;
 188        unsigned int merge_reloc_tree:1;
 189        unsigned int found_file_extent:1;
 190};
 191
 192/* stages of data relocation */
 193#define MOVE_DATA_EXTENTS       0
 194#define UPDATE_DATA_PTRS        1
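    /*
     * data relocation is a two pass process: the MOVE_DATA_EXTENTS stage
     * copies the data out of the block group into newly allocated extents,
     * then the UPDATE_DATA_PTRS stage rewrites the file extent items to
     * point at the new locations.
     */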
 195
 196static void remove_backref_node(struct backref_cache *cache,
 197                                struct backref_node *node);
 198static void __mark_block_processed(struct reloc_control *rc,
 199                                   struct backref_node *node);
 200
 201static void mapping_tree_init(struct mapping_tree *tree)
 202{
 203        tree->rb_root = RB_ROOT;
 204        spin_lock_init(&tree->lock);
 205}
 206
 207static void backref_cache_init(struct backref_cache *cache)
 208{
 209        int i;
 210        cache->rb_root = RB_ROOT;
 211        for (i = 0; i < BTRFS_MAX_LEVEL; i++)
 212                INIT_LIST_HEAD(&cache->pending[i]);
 213        INIT_LIST_HEAD(&cache->changed);
 214        INIT_LIST_HEAD(&cache->detached);
 215        INIT_LIST_HEAD(&cache->leaves);
 216}
 217
 218static void backref_cache_cleanup(struct backref_cache *cache)
 219{
 220        struct backref_node *node;
 221        int i;
 222
 223        while (!list_empty(&cache->detached)) {
 224                node = list_entry(cache->detached.next,
 225                                  struct backref_node, list);
 226                remove_backref_node(cache, node);
 227        }
 228
 229        while (!list_empty(&cache->leaves)) {
 230                node = list_entry(cache->leaves.next,
 231                                  struct backref_node, lower);
 232                remove_backref_node(cache, node);
 233        }
 234
 235        cache->last_trans = 0;
 236
 237        for (i = 0; i < BTRFS_MAX_LEVEL; i++)
 238                BUG_ON(!list_empty(&cache->pending[i]));
 239        BUG_ON(!list_empty(&cache->changed));
 240        BUG_ON(!list_empty(&cache->detached));
 241        BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
 242        BUG_ON(cache->nr_nodes);
 243        BUG_ON(cache->nr_edges);
 244}
 245
 246static struct backref_node *alloc_backref_node(struct backref_cache *cache)
 247{
 248        struct backref_node *node;
 249
 250        node = kzalloc(sizeof(*node), GFP_NOFS);
 251        if (node) {
 252                INIT_LIST_HEAD(&node->list);
 253                INIT_LIST_HEAD(&node->upper);
 254                INIT_LIST_HEAD(&node->lower);
 255                RB_CLEAR_NODE(&node->rb_node);
 256                cache->nr_nodes++;
 257        }
 258        return node;
 259}
 260
 261static void free_backref_node(struct backref_cache *cache,
 262                              struct backref_node *node)
 263{
 264        if (node) {
 265                cache->nr_nodes--;
 266                kfree(node);
 267        }
 268}
 269
 270static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
 271{
 272        struct backref_edge *edge;
 273
 274        edge = kzalloc(sizeof(*edge), GFP_NOFS);
 275        if (edge)
 276                cache->nr_edges++;
 277        return edge;
 278}
 279
 280static void free_backref_edge(struct backref_cache *cache,
 281                              struct backref_edge *edge)
 282{
 283        if (edge) {
 284                cache->nr_edges--;
 285                kfree(edge);
 286        }
 287}
 288
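    /*
     * insert a node keyed by bytenr into an rb-tree of tree_entry
     * compatible structures.  returns NULL on success, or the existing
     * node if an entry with the same bytenr is already present, e.g.
     * (sketch mirroring update_backref_node() below):
     *
     *   rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
     *   if (rb_node)
     *           backref_tree_panic(rb_node, -EEXIST, node->bytenr);
     */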
 289static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
 290                                   struct rb_node *node)
 291{
 292        struct rb_node **p = &root->rb_node;
 293        struct rb_node *parent = NULL;
 294        struct tree_entry *entry;
 295
 296        while (*p) {
 297                parent = *p;
 298                entry = rb_entry(parent, struct tree_entry, rb_node);
 299
 300                if (bytenr < entry->bytenr)
 301                        p = &(*p)->rb_left;
 302                else if (bytenr > entry->bytenr)
 303                        p = &(*p)->rb_right;
 304                else
 305                        return parent;
 306        }
 307
 308        rb_link_node(node, parent, p);
 309        rb_insert_color(node, root);
 310        return NULL;
 311}
 312
 313static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
 314{
 315        struct rb_node *n = root->rb_node;
 316        struct tree_entry *entry;
 317
 318        while (n) {
 319                entry = rb_entry(n, struct tree_entry, rb_node);
 320
 321                if (bytenr < entry->bytenr)
 322                        n = n->rb_left;
 323                else if (bytenr > entry->bytenr)
 324                        n = n->rb_right;
 325                else
 326                        return n;
 327        }
 328        return NULL;
 329}
 330
 331static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
 332{
 333
 334        struct btrfs_fs_info *fs_info = NULL;
 335        struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
 336                                              rb_node);
 337        if (bnode->root)
 338                fs_info = bnode->root->fs_info;
 339        btrfs_panic(fs_info, errno, "Inconsistency in backref cache "
 340                    "found at offset %llu", bytenr);
 341}
 342
 343/*
 344 * walk up backref nodes until reaching the node that represents the tree root
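     *
     * edges[] records the edge taken at each level, acting as an explicit
     * stack for a depth-first walk of the backref graph; walk_down_backref()
     * below backtracks along it to the next untried parent edge, returning
     * NULL once every reference path from the starting node has been
     * visited.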
 345 */
 346static struct backref_node *walk_up_backref(struct backref_node *node,
 347                                            struct backref_edge *edges[],
 348                                            int *index)
 349{
 350        struct backref_edge *edge;
 351        int idx = *index;
 352
 353        while (!list_empty(&node->upper)) {
 354                edge = list_entry(node->upper.next,
 355                                  struct backref_edge, list[LOWER]);
 356                edges[idx++] = edge;
 357                node = edge->node[UPPER];
 358        }
 359        BUG_ON(node->detached);
 360        *index = idx;
 361        return node;
 362}
 363
 364/*
 365 * walk down backref nodes to find start of next reference path
 366 */
 367static struct backref_node *walk_down_backref(struct backref_edge *edges[],
 368                                              int *index)
 369{
 370        struct backref_edge *edge;
 371        struct backref_node *lower;
 372        int idx = *index;
 373
 374        while (idx > 0) {
 375                edge = edges[idx - 1];
 376                lower = edge->node[LOWER];
 377                if (list_is_last(&edge->list[LOWER], &lower->upper)) {
 378                        idx--;
 379                        continue;
 380                }
 381                edge = list_entry(edge->list[LOWER].next,
 382                                  struct backref_edge, list[LOWER]);
 383                edges[idx - 1] = edge;
 384                *index = idx;
 385                return edge->node[UPPER];
 386        }
 387        *index = 0;
 388        return NULL;
 389}
 390
 391static void unlock_node_buffer(struct backref_node *node)
 392{
 393        if (node->locked) {
 394                btrfs_tree_unlock(node->eb);
 395                node->locked = 0;
 396        }
 397}
 398
 399static void drop_node_buffer(struct backref_node *node)
 400{
 401        if (node->eb) {
 402                unlock_node_buffer(node);
 403                free_extent_buffer(node->eb);
 404                node->eb = NULL;
 405        }
 406}
 407
 408static void drop_backref_node(struct backref_cache *tree,
 409                              struct backref_node *node)
 410{
 411        BUG_ON(!list_empty(&node->upper));
 412
 413        drop_node_buffer(node);
 414        list_del(&node->list);
 415        list_del(&node->lower);
 416        if (!RB_EMPTY_NODE(&node->rb_node))
 417                rb_erase(&node->rb_node, &tree->rb_root);
 418        free_backref_node(tree, node);
 419}
 420
 421/*
 422 * remove a backref node from the backref cache
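     * and release the edges to its parents.  a parent that has not been
     * inserted into the rb-tree yet is torn down as well; otherwise it is
     * put on the leaves list once no cached child blocks remain.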
 423 */
 424static void remove_backref_node(struct backref_cache *cache,
 425                                struct backref_node *node)
 426{
 427        struct backref_node *upper;
 428        struct backref_edge *edge;
 429
 430        if (!node)
 431                return;
 432
 433        BUG_ON(!node->lowest && !node->detached);
 434        while (!list_empty(&node->upper)) {
 435                edge = list_entry(node->upper.next, struct backref_edge,
 436                                  list[LOWER]);
 437                upper = edge->node[UPPER];
 438                list_del(&edge->list[LOWER]);
 439                list_del(&edge->list[UPPER]);
 440                free_backref_edge(cache, edge);
 441
 442                if (RB_EMPTY_NODE(&upper->rb_node)) {
 443                        BUG_ON(!list_empty(&node->upper));
 444                        drop_backref_node(cache, node);
 445                        node = upper;
 446                        node->lowest = 1;
 447                        continue;
 448                }
 449                /*
 450                 * add the node to the leaf node list if no other
 451                 * child block is cached.
 452                 */
 453                if (list_empty(&upper->lower)) {
 454                        list_add_tail(&upper->lower, &cache->leaves);
 455                        upper->lowest = 1;
 456                }
 457        }
 458
 459        drop_backref_node(cache, node);
 460}
 461
 462static void update_backref_node(struct backref_cache *cache,
 463                                struct backref_node *node, u64 bytenr)
 464{
 465        struct rb_node *rb_node;
 466        rb_erase(&node->rb_node, &cache->rb_root);
 467        node->bytenr = bytenr;
 468        rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
 469        if (rb_node)
 470                backref_tree_panic(rb_node, -EEXIST, bytenr);
 471}
 472
 473/*
 474 * update backref cache after a transaction commit
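     *
     * returns 1 if stale entries had to be flushed (detached nodes dropped,
     * changed and pending nodes rekeyed to their new bytenr), 0 if the
     * cache was already current for this transaction or has not been used
     * yet.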
 475 */
 476static int update_backref_cache(struct btrfs_trans_handle *trans,
 477                                struct backref_cache *cache)
 478{
 479        struct backref_node *node;
 480        int level = 0;
 481
 482        if (cache->last_trans == 0) {
 483                cache->last_trans = trans->transid;
 484                return 0;
 485        }
 486
 487        if (cache->last_trans == trans->transid)
 488                return 0;
 489
 490        /*
 491         * detached nodes are used to avoid unnecessary backref
 492         * lookups. a transaction commit changes the extent tree,
 493         * so the detached nodes are no longer useful.
 494         */
 495        while (!list_empty(&cache->detached)) {
 496                node = list_entry(cache->detached.next,
 497                                  struct backref_node, list);
 498                remove_backref_node(cache, node);
 499        }
 500
 501        while (!list_empty(&cache->changed)) {
 502                node = list_entry(cache->changed.next,
 503                                  struct backref_node, list);
 504                list_del_init(&node->list);
 505                BUG_ON(node->pending);
 506                update_backref_node(cache, node, node->new_bytenr);
 507        }
 508
 509        /*
 510         * some nodes can be left in the pending list if there were
 511         * errors while processing the pending nodes.
 512         */
 513        for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
 514                list_for_each_entry(node, &cache->pending[level], list) {
 515                        BUG_ON(!node->pending);
 516                        if (node->bytenr == node->new_bytenr)
 517                                continue;
 518                        update_backref_node(cache, node, node->new_bytenr);
 519                }
 520        }
 521
 522        cache->last_trans = 0;
 523        return 1;
 524}
 525
 526
 527static int should_ignore_root(struct btrfs_root *root)
 528{
 529        struct btrfs_root *reloc_root;
 530
 531        if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
 532                return 0;
 533
 534        reloc_root = root->reloc_root;
 535        if (!reloc_root)
 536                return 0;
 537
 538        if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
 539            root->fs_info->running_transaction->transid - 1)
 540                return 0;
 541        /*
 542         * if there is a reloc tree and it was created in a previous
 543         * transaction, a backref lookup can find the reloc tree,
 544         * so the backref node for the fs tree root is useless for
 545         * relocation.
 546         */
 547        return 1;
 548}
 549/*
 550 * find reloc tree by address of tree root
 551 */
 552static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
 553                                          u64 bytenr)
 554{
 555        struct rb_node *rb_node;
 556        struct mapping_node *node;
 557        struct btrfs_root *root = NULL;
 558
 559        spin_lock(&rc->reloc_root_tree.lock);
 560        rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
 561        if (rb_node) {
 562                node = rb_entry(rb_node, struct mapping_node, rb_node);
 563                root = (struct btrfs_root *)node->data;
 564        }
 565        spin_unlock(&rc->reloc_root_tree.lock);
 566        return root;
 567}
 568
 569static int is_cowonly_root(u64 root_objectid)
 570{
 571        if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
 572            root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
 573            root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
 574            root_objectid == BTRFS_DEV_TREE_OBJECTID ||
 575            root_objectid == BTRFS_TREE_LOG_OBJECTID ||
 576            root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
 577            root_objectid == BTRFS_UUID_TREE_OBJECTID ||
 578            root_objectid == BTRFS_QUOTA_TREE_OBJECTID ||
 579            root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
 580                return 1;
 581        return 0;
 582}
 583
 584static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
 585                                        u64 root_objectid)
 586{
 587        struct btrfs_key key;
 588
 589        key.objectid = root_objectid;
 590        key.type = BTRFS_ROOT_ITEM_KEY;
 591        if (is_cowonly_root(root_objectid))
 592                key.offset = 0;
 593        else
 594                key.offset = (u64)-1;
 595
 596        return btrfs_get_fs_root(fs_info, &key, false);
 597}
 598
 599#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 600static noinline_for_stack
 601struct btrfs_root *find_tree_root(struct reloc_control *rc,
 602                                  struct extent_buffer *leaf,
 603                                  struct btrfs_extent_ref_v0 *ref0)
 604{
 605        struct btrfs_root *root;
 606        u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
 607        u64 generation = btrfs_ref_generation_v0(leaf, ref0);
 608
 609        BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
 610
 611        root = read_fs_root(rc->extent_root->fs_info, root_objectid);
 612        BUG_ON(IS_ERR(root));
 613
 614        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 615            generation != btrfs_root_generation(&root->root_item))
 616                return NULL;
 617
 618        return root;
 619}
 620#endif
 621
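    /*
     * locate the inline backref area of a tree block's extent item.
     * returns 0 and sets *ptr/*end to the byte range holding the inline
     * refs, or 1 if the item has no inline refs for the caller to iterate.
     */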
 622static noinline_for_stack
 623int find_inline_backref(struct extent_buffer *leaf, int slot,
 624                        unsigned long *ptr, unsigned long *end)
 625{
 626        struct btrfs_key key;
 627        struct btrfs_extent_item *ei;
 628        struct btrfs_tree_block_info *bi;
 629        u32 item_size;
 630
 631        btrfs_item_key_to_cpu(leaf, &key, slot);
 632
 633        item_size = btrfs_item_size_nr(leaf, slot);
 634#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 635        if (item_size < sizeof(*ei)) {
 636                WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
 637                return 1;
 638        }
 639#endif
 640        ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
 641        WARN_ON(!(btrfs_extent_flags(leaf, ei) &
 642                  BTRFS_EXTENT_FLAG_TREE_BLOCK));
 643
 644        if (key.type == BTRFS_EXTENT_ITEM_KEY &&
 645            item_size <= sizeof(*ei) + sizeof(*bi)) {
 646                WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
 647                return 1;
 648        }
 649        if (key.type == BTRFS_METADATA_ITEM_KEY &&
 650            item_size <= sizeof(*ei)) {
 651                WARN_ON(item_size < sizeof(*ei));
 652                return 1;
 653        }
 654
 655        if (key.type == BTRFS_EXTENT_ITEM_KEY) {
 656                bi = (struct btrfs_tree_block_info *)(ei + 1);
 657                *ptr = (unsigned long)(bi + 1);
 658        } else {
 659                *ptr = (unsigned long)(ei + 1);
 660        }
 661        *end = (unsigned long)ei + item_size;
 662        return 0;
 663}
 664
 665/*
 666 * build backref tree for a given tree block. root of the backref tree
 667 * corresponds to the tree block, leaves of the backref tree correspond
 668 * to roots of b-trees that reference the tree block.
 669 *
 670 * the basic idea of this function is to check backrefs of a given block
 671 * to find upper level blocks that reference the block, and then check
 672 * backrefs of these upper level blocks recursively. the recursion stops
 673 * when a tree root is reached or backrefs for the block are cached.
 674 *
 675 * NOTE: if we find backrefs for a block are cached, we know backrefs
 676 * for all upper level blocks that directly/indirectly reference the
 677 * block are also cached.
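     *
     * two local lists drive the walk: 'list' queues edges to upper level
     * blocks whose backrefs still need to be checked, and 'useless' collects
     * nodes that turn out to be unnecessary (e.g. roots that should be
     * ignored); leaf nodes on 'useless' are freed while upper level blocks
     * are kept in the cache as detached nodes.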
 678 */
 679static noinline_for_stack
 680struct backref_node *build_backref_tree(struct reloc_control *rc,
 681                                        struct btrfs_key *node_key,
 682                                        int level, u64 bytenr)
 683{
 684        struct backref_cache *cache = &rc->backref_cache;
 685        struct btrfs_path *path1;
 686        struct btrfs_path *path2;
 687        struct extent_buffer *eb;
 688        struct btrfs_root *root;
 689        struct backref_node *cur;
 690        struct backref_node *upper;
 691        struct backref_node *lower;
 692        struct backref_node *node = NULL;
 693        struct backref_node *exist = NULL;
 694        struct backref_edge *edge;
 695        struct rb_node *rb_node;
 696        struct btrfs_key key;
 697        unsigned long end;
 698        unsigned long ptr;
 699        LIST_HEAD(list);
 700        LIST_HEAD(useless);
 701        int cowonly;
 702        int ret;
 703        int err = 0;
 704        bool need_check = true;
 705
 706        path1 = btrfs_alloc_path();
 707        path2 = btrfs_alloc_path();
 708        if (!path1 || !path2) {
 709                err = -ENOMEM;
 710                goto out;
 711        }
 712        path1->reada = READA_FORWARD;
 713        path2->reada = READA_FORWARD;
 714
 715        node = alloc_backref_node(cache);
 716        if (!node) {
 717                err = -ENOMEM;
 718                goto out;
 719        }
 720
 721        node->bytenr = bytenr;
 722        node->level = level;
 723        node->lowest = 1;
 724        cur = node;
 725again:
 726        end = 0;
 727        ptr = 0;
 728        key.objectid = cur->bytenr;
 729        key.type = BTRFS_METADATA_ITEM_KEY;
 730        key.offset = (u64)-1;
 731
 732        path1->search_commit_root = 1;
 733        path1->skip_locking = 1;
 734        ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
 735                                0, 0);
 736        if (ret < 0) {
 737                err = ret;
 738                goto out;
 739        }
 740        ASSERT(ret);
 741        ASSERT(path1->slots[0]);
 742
 743        path1->slots[0]--;
 744
 745        WARN_ON(cur->checked);
 746        if (!list_empty(&cur->upper)) {
 747                /*
 748                 * the backref was added previously when processing
 749                 * backref of type BTRFS_TREE_BLOCK_REF_KEY
 750                 */
 751                ASSERT(list_is_singular(&cur->upper));
 752                edge = list_entry(cur->upper.next, struct backref_edge,
 753                                  list[LOWER]);
 754                ASSERT(list_empty(&edge->list[UPPER]));
 755                exist = edge->node[UPPER];
 756                /*
 757                 * add the upper level block to the pending list if we
 758                 * need to check its backrefs
 759                 */
 760                if (!exist->checked)
 761                        list_add_tail(&edge->list[UPPER], &list);
 762        } else {
 763                exist = NULL;
 764        }
 765
 766        while (1) {
 767                cond_resched();
 768                eb = path1->nodes[0];
 769
 770                if (ptr >= end) {
 771                        if (path1->slots[0] >= btrfs_header_nritems(eb)) {
 772                                ret = btrfs_next_leaf(rc->extent_root, path1);
 773                                if (ret < 0) {
 774                                        err = ret;
 775                                        goto out;
 776                                }
 777                                if (ret > 0)
 778                                        break;
 779                                eb = path1->nodes[0];
 780                        }
 781
 782                        btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
 783                        if (key.objectid != cur->bytenr) {
 784                                WARN_ON(exist);
 785                                break;
 786                        }
 787
 788                        if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 789                            key.type == BTRFS_METADATA_ITEM_KEY) {
 790                                ret = find_inline_backref(eb, path1->slots[0],
 791                                                          &ptr, &end);
 792                                if (ret)
 793                                        goto next;
 794                        }
 795                }
 796
 797                if (ptr < end) {
 798                        /* update key for inline back ref */
 799                        struct btrfs_extent_inline_ref *iref;
 800                        iref = (struct btrfs_extent_inline_ref *)ptr;
 801                        key.type = btrfs_extent_inline_ref_type(eb, iref);
 802                        key.offset = btrfs_extent_inline_ref_offset(eb, iref);
 803                        WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
 804                                key.type != BTRFS_SHARED_BLOCK_REF_KEY);
 805                }
 806
 807                if (exist &&
 808                    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
 809                      exist->owner == key.offset) ||
 810                     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
 811                      exist->bytenr == key.offset))) {
 812                        exist = NULL;
 813                        goto next;
 814                }
 815
 816#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 817                if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
 818                    key.type == BTRFS_EXTENT_REF_V0_KEY) {
 819                        if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
 820                                struct btrfs_extent_ref_v0 *ref0;
 821                                ref0 = btrfs_item_ptr(eb, path1->slots[0],
 822                                                struct btrfs_extent_ref_v0);
 823                                if (key.objectid == key.offset) {
 824                                        root = find_tree_root(rc, eb, ref0);
 825                                        if (root && !should_ignore_root(root))
 826                                                cur->root = root;
 827                                        else
 828                                                list_add(&cur->list, &useless);
 829                                        break;
 830                                }
 831                                if (is_cowonly_root(btrfs_ref_root_v0(eb,
 832                                                                      ref0)))
 833                                        cur->cowonly = 1;
 834                        }
 835#else
 836                ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY);
 837                if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
 838#endif
 839                        if (key.objectid == key.offset) {
 840                                /*
 841                                 * only root blocks of reloc trees use
 842                                 * backrefs of this type.
 843                                 */
 844                                root = find_reloc_root(rc, cur->bytenr);
 845                                ASSERT(root);
 846                                cur->root = root;
 847                                break;
 848                        }
 849
 850                        edge = alloc_backref_edge(cache);
 851                        if (!edge) {
 852                                err = -ENOMEM;
 853                                goto out;
 854                        }
 855                        rb_node = tree_search(&cache->rb_root, key.offset);
 856                        if (!rb_node) {
 857                                upper = alloc_backref_node(cache);
 858                                if (!upper) {
 859                                        free_backref_edge(cache, edge);
 860                                        err = -ENOMEM;
 861                                        goto out;
 862                                }
 863                                upper->bytenr = key.offset;
 864                                upper->level = cur->level + 1;
 865                                /*
 866                                 * backrefs for the upper level block aren't
 867                                 * cached, add the block to the pending list
 868                                 */
 869                                list_add_tail(&edge->list[UPPER], &list);
 870                        } else {
 871                                upper = rb_entry(rb_node, struct backref_node,
 872                                                 rb_node);
 873                                ASSERT(upper->checked);
 874                                INIT_LIST_HEAD(&edge->list[UPPER]);
 875                        }
 876                        list_add_tail(&edge->list[LOWER], &cur->upper);
 877                        edge->node[LOWER] = cur;
 878                        edge->node[UPPER] = upper;
 879
 880                        goto next;
 881                } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
 882                        goto next;
 883                }
 884
 885                /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
 886                root = read_fs_root(rc->extent_root->fs_info, key.offset);
 887                if (IS_ERR(root)) {
 888                        err = PTR_ERR(root);
 889                        goto out;
 890                }
 891
 892                if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
 893                        cur->cowonly = 1;
 894
 895                if (btrfs_root_level(&root->root_item) == cur->level) {
 896                        /* tree root */
 897                        ASSERT(btrfs_root_bytenr(&root->root_item) ==
 898                               cur->bytenr);
 899                        if (should_ignore_root(root))
 900                                list_add(&cur->list, &useless);
 901                        else
 902                                cur->root = root;
 903                        break;
 904                }
 905
 906                level = cur->level + 1;
 907
 908                /*
 909                 * search the tree to find upper level blocks that
 910                 * reference the block.
 911                 */
 912                path2->search_commit_root = 1;
 913                path2->skip_locking = 1;
 914                path2->lowest_level = level;
 915                ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
 916                path2->lowest_level = 0;
 917                if (ret < 0) {
 918                        err = ret;
 919                        goto out;
 920                }
 921                if (ret > 0 && path2->slots[level] > 0)
 922                        path2->slots[level]--;
 923
 924                eb = path2->nodes[level];
 925                WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
 926                        cur->bytenr);
 927
 928                lower = cur;
 929                need_check = true;
 930                for (; level < BTRFS_MAX_LEVEL; level++) {
 931                        if (!path2->nodes[level]) {
 932                                ASSERT(btrfs_root_bytenr(&root->root_item) ==
 933                                       lower->bytenr);
 934                                if (should_ignore_root(root))
 935                                        list_add(&lower->list, &useless);
 936                                else
 937                                        lower->root = root;
 938                                break;
 939                        }
 940
 941                        edge = alloc_backref_edge(cache);
 942                        if (!edge) {
 943                                err = -ENOMEM;
 944                                goto out;
 945                        }
 946
 947                        eb = path2->nodes[level];
 948                        rb_node = tree_search(&cache->rb_root, eb->start);
 949                        if (!rb_node) {
 950                                upper = alloc_backref_node(cache);
 951                                if (!upper) {
 952                                        free_backref_edge(cache, edge);
 953                                        err = -ENOMEM;
 954                                        goto out;
 955                                }
 956                                upper->bytenr = eb->start;
 957                                upper->owner = btrfs_header_owner(eb);
 958                                upper->level = lower->level + 1;
 959                                if (!test_bit(BTRFS_ROOT_REF_COWS,
 960                                              &root->state))
 961                                        upper->cowonly = 1;
 962
 963                                /*
 964                                 * if we know the block isn't shared
 965                                 * we can avoid checking its backrefs.
 966                                 */
 967                                if (btrfs_block_can_be_shared(root, eb))
 968                                        upper->checked = 0;
 969                                else
 970                                        upper->checked = 1;
 971
 972                                /*
 973                                 * add the block to the pending list if we
 974                                 * need to check its backrefs. we only do this once
 975                                 * while walking up a tree as we will catch
 976                                 * anything else later on.
 977                                 */
 978                                if (!upper->checked && need_check) {
 979                                        need_check = false;
 980                                        list_add_tail(&edge->list[UPPER],
 981                                                      &list);
 982                                } else {
 983                                        if (upper->checked)
 984                                                need_check = true;
 985                                        INIT_LIST_HEAD(&edge->list[UPPER]);
 986                                }
 987                        } else {
 988                                upper = rb_entry(rb_node, struct backref_node,
 989                                                 rb_node);
 990                                ASSERT(upper->checked);
 991                                INIT_LIST_HEAD(&edge->list[UPPER]);
 992                                if (!upper->owner)
 993                                        upper->owner = btrfs_header_owner(eb);
 994                        }
 995                        list_add_tail(&edge->list[LOWER], &lower->upper);
 996                        edge->node[LOWER] = lower;
 997                        edge->node[UPPER] = upper;
 998
 999                        if (rb_node)
1000                                break;
1001                        lower = upper;
1002                        upper = NULL;
1003                }
1004                btrfs_release_path(path2);
1005next:
1006                if (ptr < end) {
1007                        ptr += btrfs_extent_inline_ref_size(key.type);
1008                        if (ptr >= end) {
1009                                WARN_ON(ptr > end);
1010                                ptr = 0;
1011                                end = 0;
1012                        }
1013                }
1014                if (ptr >= end)
1015                        path1->slots[0]++;
1016        }
1017        btrfs_release_path(path1);
1018
1019        cur->checked = 1;
1020        WARN_ON(exist);
1021
1022        /* the pending list isn't empty, take the first block to process */
1023        if (!list_empty(&list)) {
1024                edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1025                list_del_init(&edge->list[UPPER]);
1026                cur = edge->node[UPPER];
1027                goto again;
1028        }
1029
1030        /*
1031         * everything went well; connect the backref nodes and insert them
1032         * into the cache.
1033         */
1034        ASSERT(node->checked);
1035        cowonly = node->cowonly;
1036        if (!cowonly) {
1037                rb_node = tree_insert(&cache->rb_root, node->bytenr,
1038                                      &node->rb_node);
1039                if (rb_node)
1040                        backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1041                list_add_tail(&node->lower, &cache->leaves);
1042        }
1043
1044        list_for_each_entry(edge, &node->upper, list[LOWER])
1045                list_add_tail(&edge->list[UPPER], &list);
1046
1047        while (!list_empty(&list)) {
1048                edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1049                list_del_init(&edge->list[UPPER]);
1050                upper = edge->node[UPPER];
1051                if (upper->detached) {
1052                        list_del(&edge->list[LOWER]);
1053                        lower = edge->node[LOWER];
1054                        free_backref_edge(cache, edge);
1055                        if (list_empty(&lower->upper))
1056                                list_add(&lower->list, &useless);
1057                        continue;
1058                }
1059
1060                if (!RB_EMPTY_NODE(&upper->rb_node)) {
1061                        if (upper->lowest) {
1062                                list_del_init(&upper->lower);
1063                                upper->lowest = 0;
1064                        }
1065
1066                        list_add_tail(&edge->list[UPPER], &upper->lower);
1067                        continue;
1068                }
1069
1070                if (!upper->checked) {
1071                        /*
1072                         * Still want to blow up for developers since this is a
1073                         * logic bug.
1074                         */
1075                        ASSERT(0);
1076                        err = -EINVAL;
1077                        goto out;
1078                }
1079                if (cowonly != upper->cowonly) {
1080                        ASSERT(0);
1081                        err = -EINVAL;
1082                        goto out;
1083                }
1084
1085                if (!cowonly) {
1086                        rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1087                                              &upper->rb_node);
1088                        if (rb_node)
1089                                backref_tree_panic(rb_node, -EEXIST,
1090                                                   upper->bytenr);
1091                }
1092
1093                list_add_tail(&edge->list[UPPER], &upper->lower);
1094
1095                list_for_each_entry(edge, &upper->upper, list[LOWER])
1096                        list_add_tail(&edge->list[UPPER], &list);
1097        }
1098        /*
1099         * process useless backref nodes. backref nodes for tree leaves
1100         * are deleted from the cache. backref nodes for upper level
1101         * tree blocks are left in the cache to avoid unnecessary backref
1102         * lookup.
1103         */
1104        while (!list_empty(&useless)) {
1105                upper = list_entry(useless.next, struct backref_node, list);
1106                list_del_init(&upper->list);
1107                ASSERT(list_empty(&upper->upper));
1108                if (upper == node)
1109                        node = NULL;
1110                if (upper->lowest) {
1111                        list_del_init(&upper->lower);
1112                        upper->lowest = 0;
1113                }
1114                while (!list_empty(&upper->lower)) {
1115                        edge = list_entry(upper->lower.next,
1116                                          struct backref_edge, list[UPPER]);
1117                        list_del(&edge->list[UPPER]);
1118                        list_del(&edge->list[LOWER]);
1119                        lower = edge->node[LOWER];
1120                        free_backref_edge(cache, edge);
1121
1122                        if (list_empty(&lower->upper))
1123                                list_add(&lower->list, &useless);
1124                }
1125                __mark_block_processed(rc, upper);
1126                if (upper->level > 0) {
1127                        list_add(&upper->list, &cache->detached);
1128                        upper->detached = 1;
1129                } else {
1130                        rb_erase(&upper->rb_node, &cache->rb_root);
1131                        free_backref_node(cache, upper);
1132                }
1133        }
1134out:
1135        btrfs_free_path(path1);
1136        btrfs_free_path(path2);
1137        if (err) {
1138                while (!list_empty(&useless)) {
1139                        lower = list_entry(useless.next,
1140                                           struct backref_node, list);
1141                        list_del_init(&lower->list);
1142                }
1143                while (!list_empty(&list)) {
1144                        edge = list_first_entry(&list, struct backref_edge,
1145                                                list[UPPER]);
1146                        list_del(&edge->list[UPPER]);
1147                        list_del(&edge->list[LOWER]);
1148                        lower = edge->node[LOWER];
1149                        upper = edge->node[UPPER];
1150                        free_backref_edge(cache, edge);
1151
1152                        /*
1153                         * Lower is no longer linked to any upper backref nodes
1154                         * and isn't in the cache, so we can free it ourselves.
1155                         */
1156                        if (list_empty(&lower->upper) &&
1157                            RB_EMPTY_NODE(&lower->rb_node))
1158                                list_add(&lower->list, &useless);
1159
1160                        if (!RB_EMPTY_NODE(&upper->rb_node))
1161                                continue;
1162
1163                        /* Add this guy's upper edges to the list to process */
1164                        list_for_each_entry(edge, &upper->upper, list[LOWER])
1165                                list_add_tail(&edge->list[UPPER], &list);
1166                        if (list_empty(&upper->upper))
1167                                list_add(&upper->list, &useless);
1168                }
1169
1170                while (!list_empty(&useless)) {
1171                        lower = list_entry(useless.next,
1172                                           struct backref_node, list);
1173                        list_del_init(&lower->list);
1174                        free_backref_node(cache, lower);
1175                }
1176                return ERR_PTR(err);
1177        }
1178        ASSERT(!node || !node->detached);
1179        return node;
1180}
1181
1182/*
1183 * helper to add a backref node for the newly created snapshot.
1184 * the backref node is created by cloning the backref node that
1185 * corresponds to the root of the source tree
1186 */
1187static int clone_backref_node(struct btrfs_trans_handle *trans,
1188                              struct reloc_control *rc,
1189                              struct btrfs_root *src,
1190                              struct btrfs_root *dest)
1191{
1192        struct btrfs_root *reloc_root = src->reloc_root;
1193        struct backref_cache *cache = &rc->backref_cache;
1194        struct backref_node *node = NULL;
1195        struct backref_node *new_node;
1196        struct backref_edge *edge;
1197        struct backref_edge *new_edge;
1198        struct rb_node *rb_node;
1199
1200        if (cache->last_trans > 0)
1201                update_backref_cache(trans, cache);
1202
1203        rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1204        if (rb_node) {
1205                node = rb_entry(rb_node, struct backref_node, rb_node);
1206                if (node->detached)
1207                        node = NULL;
1208                else
1209                        BUG_ON(node->new_bytenr != reloc_root->node->start);
1210        }
1211
1212        if (!node) {
1213                rb_node = tree_search(&cache->rb_root,
1214                                      reloc_root->commit_root->start);
1215                if (rb_node) {
1216                        node = rb_entry(rb_node, struct backref_node,
1217                                        rb_node);
1218                        BUG_ON(node->detached);
1219                }
1220        }
1221
1222        if (!node)
1223                return 0;
1224
1225        new_node = alloc_backref_node(cache);
1226        if (!new_node)
1227                return -ENOMEM;
1228
1229        new_node->bytenr = dest->node->start;
1230        new_node->level = node->level;
1231        new_node->lowest = node->lowest;
1232        new_node->checked = 1;
1233        new_node->root = dest;
1234
1235        if (!node->lowest) {
1236                list_for_each_entry(edge, &node->lower, list[UPPER]) {
1237                        new_edge = alloc_backref_edge(cache);
1238                        if (!new_edge)
1239                                goto fail;
1240
1241                        new_edge->node[UPPER] = new_node;
1242                        new_edge->node[LOWER] = edge->node[LOWER];
1243                        list_add_tail(&new_edge->list[UPPER],
1244                                      &new_node->lower);
1245                }
1246        } else {
1247                list_add_tail(&new_node->lower, &cache->leaves);
1248        }
1249
1250        rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1251                              &new_node->rb_node);
1252        if (rb_node)
1253                backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
1254
1255        if (!new_node->lowest) {
1256                list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1257                        list_add_tail(&new_edge->list[LOWER],
1258                                      &new_edge->node[LOWER]->upper);
1259                }
1260        }
1261        return 0;
1262fail:
1263        while (!list_empty(&new_node->lower)) {
1264                new_edge = list_entry(new_node->lower.next,
1265                                      struct backref_edge, list[UPPER]);
1266                list_del(&new_edge->list[UPPER]);
1267                free_backref_edge(cache, new_edge);
1268        }
1269        free_backref_node(cache, new_node);
1270        return -ENOMEM;
1271}
1272
1273/*
1274 * helper to add 'address of tree root -> reloc tree' mapping
1275 */
1276static int __must_check __add_reloc_root(struct btrfs_root *root)
1277{
1278        struct rb_node *rb_node;
1279        struct mapping_node *node;
1280        struct reloc_control *rc = root->fs_info->reloc_ctl;
1281
1282        node = kmalloc(sizeof(*node), GFP_NOFS);
1283        if (!node)
1284                return -ENOMEM;
1285
1286        node->bytenr = root->node->start;
1287        node->data = root;
1288
1289        spin_lock(&rc->reloc_root_tree.lock);
1290        rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1291                              node->bytenr, &node->rb_node);
1292        spin_unlock(&rc->reloc_root_tree.lock);
1293        if (rb_node) {
1294                btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
1295                            "for start=%llu while inserting into relocation "
1296                            "tree", node->bytenr);
1297                kfree(node);
1298                return -EEXIST;
1299        }
1300
1301        list_add_tail(&root->root_list, &rc->reloc_roots);
1302        return 0;
1303}
1304
1305/*
1306 * helper to delete the 'address of tree root -> reloc tree'
1307 * mapping
1308 */
1309static void __del_reloc_root(struct btrfs_root *root)
1310{
1311        struct rb_node *rb_node;
1312        struct mapping_node *node = NULL;
1313        struct reloc_control *rc = root->fs_info->reloc_ctl;
1314
1315        spin_lock(&rc->reloc_root_tree.lock);
1316        rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1317                              root->node->start);
1318        if (rb_node) {
1319                node = rb_entry(rb_node, struct mapping_node, rb_node);
1320                rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1321        }
1322        spin_unlock(&rc->reloc_root_tree.lock);
1323
1324        if (!node)
1325                return;
1326        BUG_ON((struct btrfs_root *)node->data != root);
1327
1328        spin_lock(&root->fs_info->trans_lock);
1329        list_del_init(&root->root_list);
1330        spin_unlock(&root->fs_info->trans_lock);
1331        kfree(node);
1332}
1333
1334/*
1335 * helper to update the 'address of tree root -> reloc tree'
1336 * mapping
1337 */
1338static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
1339{
1340        struct rb_node *rb_node;
1341        struct mapping_node *node = NULL;
1342        struct reloc_control *rc = root->fs_info->reloc_ctl;
1343
1344        spin_lock(&rc->reloc_root_tree.lock);
1345        rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1346                              root->node->start);
1347        if (rb_node) {
1348                node = rb_entry(rb_node, struct mapping_node, rb_node);
1349                rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1350        }
1351        spin_unlock(&rc->reloc_root_tree.lock);
1352
1353        if (!node)
1354                return 0;
1355        BUG_ON((struct btrfs_root *)node->data != root);
1356
1357        spin_lock(&rc->reloc_root_tree.lock);
1358        node->bytenr = new_bytenr;
1359        rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1360                              node->bytenr, &node->rb_node);
1361        spin_unlock(&rc->reloc_root_tree.lock);
1362        if (rb_node)
1363                backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1364        return 0;
1365}
1366
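    /*
     * create the root item for a new reloc tree: COW the source root's node
     * (or its commit root when called from btrfs_init_reloc_root), insert a
     * root item keyed (BTRFS_TREE_RELOC_OBJECTID, objectid) into the tree
     * of tree roots, and read the resulting reloc root back in.
     */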
1367static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1368                                        struct btrfs_root *root, u64 objectid)
1369{
1370        struct btrfs_root *reloc_root;
1371        struct extent_buffer *eb;
1372        struct btrfs_root_item *root_item;
1373        struct btrfs_key root_key;
1374        u64 last_snap = 0;
1375        int ret;
1376
1377        root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
1378        BUG_ON(!root_item);
1379
1380        root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
1381        root_key.type = BTRFS_ROOT_ITEM_KEY;
1382        root_key.offset = objectid;
1383
1384        if (root->root_key.objectid == objectid) {
1385                /* called by btrfs_init_reloc_root */
1386                ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1387                                      BTRFS_TREE_RELOC_OBJECTID);
1388                BUG_ON(ret);
1389
1390                last_snap = btrfs_root_last_snapshot(&root->root_item);
1391                btrfs_set_root_last_snapshot(&root->root_item,
1392                                             trans->transid - 1);
1393        } else {
1394                /*
1395                 * called by btrfs_reloc_post_snapshot_hook.
1396                 * the source tree is a reloc tree; all tree blocks
1397                 * modified after it was created have the RELOC flag
1398                 * set in their headers, so it's OK not to update
1399                 * 'last_snapshot'.
1400                 */
1401                ret = btrfs_copy_root(trans, root, root->node, &eb,
1402                                      BTRFS_TREE_RELOC_OBJECTID);
1403                BUG_ON(ret);
1404        }
1405
1406        memcpy(root_item, &root->root_item, sizeof(*root_item));
1407        btrfs_set_root_bytenr(root_item, eb->start);
1408        btrfs_set_root_level(root_item, btrfs_header_level(eb));
1409        btrfs_set_root_generation(root_item, trans->transid);
1410
1411        if (root->root_key.objectid == objectid) {
1412                btrfs_set_root_refs(root_item, 0);
1413                memset(&root_item->drop_progress, 0,
1414                       sizeof(struct btrfs_disk_key));
1415                root_item->drop_level = 0;
1416                /*
1417                 * abuse rtransid; this is safe because it is impossible to
1418                 * receive data into a relocation tree.
1419                 */
1420                btrfs_set_root_rtransid(root_item, last_snap);
1421                btrfs_set_root_otransid(root_item, trans->transid);
1422        }
1423
1424        btrfs_tree_unlock(eb);
1425        free_extent_buffer(eb);
1426
1427        ret = btrfs_insert_root(trans, root->fs_info->tree_root,
1428                                &root_key, root_item);
1429        BUG_ON(ret);
1430        kfree(root_item);
1431
1432        reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
1433        BUG_ON(IS_ERR(reloc_root));
1434        reloc_root->last_trans = trans->transid;
1435        return reloc_root;
1436}
1437
1438/*
1439 * create a reloc tree for a given fs tree. the reloc tree is just a
1440 * snapshot of the fs tree with a special root objectid.
1441 */
1442int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1443                          struct btrfs_root *root)
1444{
1445        struct btrfs_root *reloc_root;
1446        struct reloc_control *rc = root->fs_info->reloc_ctl;
1447        struct btrfs_block_rsv *rsv;
1448        int clear_rsv = 0;
1449        int ret;
1450
1451        if (root->reloc_root) {
1452                reloc_root = root->reloc_root;
1453                reloc_root->last_trans = trans->transid;
1454                return 0;
1455        }
1456
1457        if (!rc || !rc->create_reloc_tree ||
1458            root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1459                return 0;
1460
1461        if (!trans->reloc_reserved) {
1462                rsv = trans->block_rsv;
1463                trans->block_rsv = rc->block_rsv;
1464                clear_rsv = 1;
1465        }
1466        reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1467        if (clear_rsv)
1468                trans->block_rsv = rsv;
1469
1470        ret = __add_reloc_root(reloc_root);
1471        BUG_ON(ret < 0);
1472        root->reloc_root = reloc_root;
1473        return 0;
1474}
1475
1476/*
1477 * update root item of reloc tree
1478 */
1479int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1480                            struct btrfs_root *root)
1481{
1482        struct btrfs_root *reloc_root;
1483        struct btrfs_root_item *root_item;
1484        int ret;
1485
1486        if (!root->reloc_root)
1487                goto out;
1488
1489        reloc_root = root->reloc_root;
1490        root_item = &reloc_root->root_item;
1491
1492        if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1493            btrfs_root_refs(root_item) == 0) {
1494                root->reloc_root = NULL;
1495                __del_reloc_root(reloc_root);
1496        }
1497
1498        if (reloc_root->commit_root != reloc_root->node) {
1499                btrfs_set_root_node(root_item, reloc_root->node);
1500                free_extent_buffer(reloc_root->commit_root);
1501                reloc_root->commit_root = btrfs_root_node(reloc_root);
1502        }
1503
1504        ret = btrfs_update_root(trans, root->fs_info->tree_root,
1505                                &reloc_root->root_key, root_item);
1506        BUG_ON(ret);
1507
1508out:
1509        return 0;
1510}
1511
1512/*
1513 * helper to find the first cached inode with inode number >= objectid
1514 * in a subvolume
1515 */
1516static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1517{
1518        struct rb_node *node;
1519        struct rb_node *prev;
1520        struct btrfs_inode *entry;
1521        struct inode *inode;
1522
1523        spin_lock(&root->inode_lock);
1524again:
1525        node = root->inode_tree.rb_node;
1526        prev = NULL;
1527        while (node) {
1528                prev = node;
1529                entry = rb_entry(node, struct btrfs_inode, rb_node);
1530
1531                if (objectid < btrfs_ino(&entry->vfs_inode))
1532                        node = node->rb_left;
1533                else if (objectid > btrfs_ino(&entry->vfs_inode))
1534                        node = node->rb_right;
1535                else
1536                        break;
1537        }
1538        if (!node) {
1539                while (prev) {
1540                        entry = rb_entry(prev, struct btrfs_inode, rb_node);
1541                        if (objectid <= btrfs_ino(&entry->vfs_inode)) {
1542                                node = prev;
1543                                break;
1544                        }
1545                        prev = rb_next(prev);
1546                }
1547        }
1548        while (node) {
1549                entry = rb_entry(node, struct btrfs_inode, rb_node);
1550                inode = igrab(&entry->vfs_inode);
1551                if (inode) {
1552                        spin_unlock(&root->inode_lock);
1553                        return inode;
1554                }
1555
1556                objectid = btrfs_ino(&entry->vfs_inode) + 1;
1557                if (cond_resched_lock(&root->inode_lock))
1558                        goto again;
1559
1560                node = rb_next(node);
1561        }
1562        spin_unlock(&root->inode_lock);
1563        return NULL;
1564}
1565
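    /*
     * helper to check whether a bytenr falls inside the given block group
     */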
1566static int in_block_group(u64 bytenr,
1567                          struct btrfs_block_group_cache *block_group)
1568{
1569        if (bytenr >= block_group->key.objectid &&
1570            bytenr < block_group->key.objectid + block_group->key.offset)
1571                return 1;
1572        return 0;
1573}
1574
1575/*
1576 * get new location of data
1577 */
1578static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1579                            u64 bytenr, u64 num_bytes)
1580{
1581        struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1582        struct btrfs_path *path;
1583        struct btrfs_file_extent_item *fi;
1584        struct extent_buffer *leaf;
1585        int ret;
1586
1587        path = btrfs_alloc_path();
1588        if (!path)
1589                return -ENOMEM;
1590
1591        bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1592        ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode),
1593                                       bytenr, 0);
1594        if (ret < 0)
1595                goto out;
1596        if (ret > 0) {
1597                ret = -ENOENT;
1598                goto out;
1599        }
1600
1601        leaf = path->nodes[0];
1602        fi = btrfs_item_ptr(leaf, path->slots[0],
1603                            struct btrfs_file_extent_item);
1604
1605        BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1606               btrfs_file_extent_compression(leaf, fi) ||
1607               btrfs_file_extent_encryption(leaf, fi) ||
1608               btrfs_file_extent_other_encoding(leaf, fi));
1609
1610        if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1611                ret = -EINVAL;
1612                goto out;
1613        }
1614
1615        *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1616        ret = 0;
1617out:
1618        btrfs_free_path(path);
1619        return ret;
1620}
1621
1622/*
1623 * update file extent items in the tree leaf to point to
1624 * the new locations.
1625 */
1626static noinline_for_stack
1627int replace_file_extents(struct btrfs_trans_handle *trans,
1628                         struct reloc_control *rc,
1629                         struct btrfs_root *root,
1630                         struct extent_buffer *leaf)
1631{
1632        struct btrfs_key key;
1633        struct btrfs_file_extent_item *fi;
1634        struct inode *inode = NULL;
1635        u64 parent;
1636        u64 bytenr;
1637        u64 new_bytenr = 0;
1638        u64 num_bytes;
1639        u64 end;
1640        u32 nritems;
1641        u32 i;
1642        int ret = 0;
1643        int first = 1;
1644        int dirty = 0;
1645
1646        if (rc->stage != UPDATE_DATA_PTRS)
1647                return 0;
1648
1649        /* reloc trees always use full backref */
1650        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1651                parent = leaf->start;
1652        else
1653                parent = 0;
1654
1655        nritems = btrfs_header_nritems(leaf);
1656        for (i = 0; i < nritems; i++) {
1657                cond_resched();
1658                btrfs_item_key_to_cpu(leaf, &key, i);
1659                if (key.type != BTRFS_EXTENT_DATA_KEY)
1660                        continue;
1661                fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1662                if (btrfs_file_extent_type(leaf, fi) ==
1663                    BTRFS_FILE_EXTENT_INLINE)
1664                        continue;
1665                bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1666                num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1667                if (bytenr == 0)
1668                        continue;
1669                if (!in_block_group(bytenr, rc->block_group))
1670                        continue;
1671
1672                /*
1673                 * if we are modifying a block in the fs tree, wait for readpage
1674                 * to complete and drop the extent cache
1675                 */
1676                if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1677                        if (first) {
1678                                inode = find_next_inode(root, key.objectid);
1679                                first = 0;
1680                        } else if (inode && btrfs_ino(inode) < key.objectid) {
1681                                btrfs_add_delayed_iput(inode);
1682                                inode = find_next_inode(root, key.objectid);
1683                        }
1684                        if (inode && btrfs_ino(inode) == key.objectid) {
1685                                end = key.offset +
1686                                      btrfs_file_extent_num_bytes(leaf, fi);
1687                                WARN_ON(!IS_ALIGNED(key.offset,
1688                                                    root->sectorsize));
1689                                WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1690                                end--;
1691                                ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1692                                                      key.offset, end);
1693                                if (!ret)
1694                                        continue;
1695
1696                                btrfs_drop_extent_cache(inode, key.offset, end,
1697                                                        1);
1698                                unlock_extent(&BTRFS_I(inode)->io_tree,
1699                                              key.offset, end);
1700                        }
1701                }
1702
1703                ret = get_new_location(rc->data_inode, &new_bytenr,
1704                                       bytenr, num_bytes);
1705                if (ret) {
1706                        /*
1707                         * Don't have to abort since we've not changed anything
1708                         * in the file extent yet.
1709                         */
1710                        break;
1711                }
1712
1713                btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1714                dirty = 1;
1715
1716                key.offset -= btrfs_file_extent_offset(leaf, fi);
1717                ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1718                                           num_bytes, parent,
1719                                           btrfs_header_owner(leaf),
1720                                           key.objectid, key.offset);
1721                if (ret) {
1722                        btrfs_abort_transaction(trans, root, ret);
1723                        break;
1724                }
1725
1726                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1727                                        parent, btrfs_header_owner(leaf),
1728                                        key.objectid, key.offset);
1729                if (ret) {
1730                        btrfs_abort_transaction(trans, root, ret);
1731                        break;
1732                }
1733        }
1734        if (dirty)
1735                btrfs_mark_buffer_dirty(leaf);
1736        if (inode)
1737                btrfs_add_delayed_iput(inode);
1738        return ret;
1739}
1740
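    /*
     * compare the key of the pointer at @slot in @eb with the key of the
     * pointer at path->slots[level] in path->nodes[level]. returns 0 if
     * the two keys are identical.
     */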
1741static noinline_for_stack
1742int memcmp_node_keys(struct extent_buffer *eb, int slot,
1743                     struct btrfs_path *path, int level)
1744{
1745        struct btrfs_disk_key key1;
1746        struct btrfs_disk_key key2;
1747        btrfs_node_key(eb, &key1, slot);
1748        btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1749        return memcmp(&key1, &key2, sizeof(key1));
1750}
1751
1752/*
1753 * try to replace tree blocks in the fs tree with the new blocks
1754 * in the reloc tree. tree blocks that haven't been modified since
1755 * the reloc tree was created can be replaced.
1756 *
1757 * if a block was replaced, the level of the block + 1 is returned.
1758 * if no block was replaced, 0 is returned. if there are other
1759 * errors, a negative error number is returned.
1760 */
1761static noinline_for_stack
1762int replace_path(struct btrfs_trans_handle *trans,
1763                 struct btrfs_root *dest, struct btrfs_root *src,
1764                 struct btrfs_path *path, struct btrfs_key *next_key,
1765                 int lowest_level, int max_level)
1766{
1767        struct extent_buffer *eb;
1768        struct extent_buffer *parent;
1769        struct btrfs_key key;
1770        u64 old_bytenr;
1771        u64 new_bytenr;
1772        u64 old_ptr_gen;
1773        u64 new_ptr_gen;
1774        u64 last_snapshot;
1775        u32 blocksize;
1776        int cow = 0;
1777        int level;
1778        int ret;
1779        int slot;
1780
1781        BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1782        BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1783
1784        last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1785again:
1786        slot = path->slots[lowest_level];
1787        btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1788
1789        eb = btrfs_lock_root_node(dest);
1790        btrfs_set_lock_blocking(eb);
1791        level = btrfs_header_level(eb);
1792
1793        if (level < lowest_level) {
1794                btrfs_tree_unlock(eb);
1795                free_extent_buffer(eb);
1796                return 0;
1797        }
1798
1799        if (cow) {
1800                ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1801                BUG_ON(ret);
1802        }
1803        btrfs_set_lock_blocking(eb);
1804
1805        if (next_key) {
1806                next_key->objectid = (u64)-1;
1807                next_key->type = (u8)-1;
1808                next_key->offset = (u64)-1;
1809        }
1810
1811        parent = eb;
1812        while (1) {
1813                level = btrfs_header_level(parent);
1814                BUG_ON(level < lowest_level);
1815
1816                ret = btrfs_bin_search(parent, &key, level, &slot);
1817                if (ret && slot > 0)
1818                        slot--;
1819
1820                if (next_key && slot + 1 < btrfs_header_nritems(parent))
1821                        btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1822
1823                old_bytenr = btrfs_node_blockptr(parent, slot);
1824                blocksize = dest->nodesize;
1825                old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1826
1827                if (level <= max_level) {
1828                        eb = path->nodes[level];
1829                        new_bytenr = btrfs_node_blockptr(eb,
1830                                                        path->slots[level]);
1831                        new_ptr_gen = btrfs_node_ptr_generation(eb,
1832                                                        path->slots[level]);
1833                } else {
1834                        new_bytenr = 0;
1835                        new_ptr_gen = 0;
1836                }
1837
1838                if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
1839                        ret = level;
1840                        break;
1841                }
1842
1843                if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1844                    memcmp_node_keys(parent, slot, path, level)) {
1845                        if (level <= lowest_level) {
1846                                ret = 0;
1847                                break;
1848                        }
1849
1850                        eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
1851                        if (IS_ERR(eb)) {
1852                                ret = PTR_ERR(eb);
1853                                break;
1854                        } else if (!extent_buffer_uptodate(eb)) {
1855                                ret = -EIO;
1856                                free_extent_buffer(eb);
1857                                break;
1858                        }
1859                        btrfs_tree_lock(eb);
1860                        if (cow) {
1861                                ret = btrfs_cow_block(trans, dest, eb, parent,
1862                                                      slot, &eb);
1863                                BUG_ON(ret);
1864                        }
1865                        btrfs_set_lock_blocking(eb);
1866
1867                        btrfs_tree_unlock(parent);
1868                        free_extent_buffer(parent);
1869
1870                        parent = eb;
1871                        continue;
1872                }
1873
1874                if (!cow) {
1875                        btrfs_tree_unlock(parent);
1876                        free_extent_buffer(parent);
1877                        cow = 1;
1878                        goto again;
1879                }
1880
1881                btrfs_node_key_to_cpu(path->nodes[level], &key,
1882                                      path->slots[level]);
1883                btrfs_release_path(path);
1884
1885                path->lowest_level = level;
1886                ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1887                path->lowest_level = 0;
1888                BUG_ON(ret);
1889
1890                /*
1891                 * swap blocks in fs tree and reloc tree.
1892                 */
1893                btrfs_set_node_blockptr(parent, slot, new_bytenr);
1894                btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1895                btrfs_mark_buffer_dirty(parent);
1896
1897                btrfs_set_node_blockptr(path->nodes[level],
1898                                        path->slots[level], old_bytenr);
1899                btrfs_set_node_ptr_generation(path->nodes[level],
1900                                              path->slots[level], old_ptr_gen);
1901                btrfs_mark_buffer_dirty(path->nodes[level]);
1902
1903                ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1904                                        path->nodes[level]->start,
1905                                        src->root_key.objectid, level - 1, 0);
1906                BUG_ON(ret);
1907                ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1908                                        0, dest->root_key.objectid, level - 1,
1909                                        0);
1910                BUG_ON(ret);
1911
1912                ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1913                                        path->nodes[level]->start,
1914                                        src->root_key.objectid, level - 1, 0);
1915                BUG_ON(ret);
1916
1917                ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1918                                        0, dest->root_key.objectid, level - 1,
1919                                        0);
1920                BUG_ON(ret);
1921
1922                btrfs_unlock_up_safe(path, 0);
1923
1924                ret = level;
1925                break;
1926        }
1927        btrfs_tree_unlock(parent);
1928        free_extent_buffer(parent);
1929        return ret;
1930}
1931
1932/*
1933 * helper to find next relocated block in reloc tree
1934 */
1935static noinline_for_stack
1936int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1937                       int *level)
1938{
1939        struct extent_buffer *eb;
1940        int i;
1941        u64 last_snapshot;
1942        u32 nritems;
1943
1944        last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1945
1946        for (i = 0; i < *level; i++) {
1947                free_extent_buffer(path->nodes[i]);
1948                path->nodes[i] = NULL;
1949        }
1950
1951        for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1952                eb = path->nodes[i];
1953                nritems = btrfs_header_nritems(eb);
1954                while (path->slots[i] + 1 < nritems) {
1955                        path->slots[i]++;
1956                        if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1957                            last_snapshot)
1958                                continue;
1959
1960                        *level = i;
1961                        return 0;
1962                }
1963                free_extent_buffer(path->nodes[i]);
1964                path->nodes[i] = NULL;
1965        }
1966        return 1;
1967}
1968
1969/*
1970 * walk down reloc tree to find relocated block of lowest level
1971 */
1972static noinline_for_stack
1973int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1974                         int *level)
1975{
1976        struct extent_buffer *eb = NULL;
1977        int i;
1978        u64 bytenr;
1979        u64 ptr_gen = 0;
1980        u64 last_snapshot;
1981        u32 nritems;
1982
1983        last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1984
1985        for (i = *level; i > 0; i--) {
1986                eb = path->nodes[i];
1987                nritems = btrfs_header_nritems(eb);
1988                while (path->slots[i] < nritems) {
1989                        ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1990                        if (ptr_gen > last_snapshot)
1991                                break;
1992                        path->slots[i]++;
1993                }
1994                if (path->slots[i] >= nritems) {
1995                        if (i == *level)
1996                                break;
1997                        *level = i + 1;
1998                        return 0;
1999                }
2000                if (i == 1) {
2001                        *level = i;
2002                        return 0;
2003                }
2004
2005                bytenr = btrfs_node_blockptr(eb, path->slots[i]);
2006                eb = read_tree_block(root, bytenr, ptr_gen);
2007                if (IS_ERR(eb)) {
2008                        return PTR_ERR(eb);
2009                } else if (!extent_buffer_uptodate(eb)) {
2010                        free_extent_buffer(eb);
2011                        return -EIO;
2012                }
2013                BUG_ON(btrfs_header_level(eb) != i - 1);
2014                path->nodes[i - 1] = eb;
2015                path->slots[i - 1] = 0;
2016        }
2017        return 1;
2018}
2019
2020/*
2021 * invalidate the extent cache for file extents whose keys are in the range
2022 * [min_key, max_key)
2023 */
2024static int invalidate_extent_cache(struct btrfs_root *root,
2025                                   struct btrfs_key *min_key,
2026                                   struct btrfs_key *max_key)
2027{
2028        struct inode *inode = NULL;
2029        u64 objectid;
2030        u64 start, end;
2031        u64 ino;
2032
2033        objectid = min_key->objectid;
2034        while (1) {
2035                cond_resched();
2036                iput(inode);
2037
2038                if (objectid > max_key->objectid)
2039                        break;
2040
2041                inode = find_next_inode(root, objectid);
2042                if (!inode)
2043                        break;
2044                ino = btrfs_ino(inode);
2045
2046                if (ino > max_key->objectid) {
2047                        iput(inode);
2048                        break;
2049                }
2050
2051                objectid = ino + 1;
2052                if (!S_ISREG(inode->i_mode))
2053                        continue;
2054
2055                if (unlikely(min_key->objectid == ino)) {
2056                        if (min_key->type > BTRFS_EXTENT_DATA_KEY)
2057                                continue;
2058                        if (min_key->type < BTRFS_EXTENT_DATA_KEY)
2059                                start = 0;
2060                        else {
2061                                start = min_key->offset;
2062                                WARN_ON(!IS_ALIGNED(start, root->sectorsize));
2063                        }
2064                } else {
2065                        start = 0;
2066                }
2067
2068                if (unlikely(max_key->objectid == ino)) {
2069                        if (max_key->type < BTRFS_EXTENT_DATA_KEY)
2070                                continue;
2071                        if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
2072                                end = (u64)-1;
2073                        } else {
2074                                if (max_key->offset == 0)
2075                                        continue;
2076                                end = max_key->offset;
2077                                WARN_ON(!IS_ALIGNED(end, root->sectorsize));
2078                                end--;
2079                        }
2080                } else {
2081                        end = (u64)-1;
2082                }
2083
2084                /* the lock_extent waits for readpage to complete */
2085                lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2086                btrfs_drop_extent_cache(inode, start, end, 1);
2087                unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2088        }
2089        return 0;
2090}
2091
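    /*
     * helper to get the key of the next slot at or above @level in the path.
     * returns 0 and fills @key on success, 1 if there is no next key.
     */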
2092static int find_next_key(struct btrfs_path *path, int level,
2093                         struct btrfs_key *key)
2094
2095{
2096        while (level < BTRFS_MAX_LEVEL) {
2097                if (!path->nodes[level])
2098                        break;
2099                if (path->slots[level] + 1 <
2100                    btrfs_header_nritems(path->nodes[level])) {
2101                        btrfs_node_key_to_cpu(path->nodes[level], key,
2102                                              path->slots[level] + 1);
2103                        return 0;
2104                }
2105                level++;
2106        }
2107        return 1;
2108}
2109
2110/*
2111 * merge the relocated tree blocks in reloc tree with corresponding
2112 * fs tree.
2113 */
2114static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2115                                               struct btrfs_root *root)
2116{
2117        LIST_HEAD(inode_list);
2118        struct btrfs_key key;
2119        struct btrfs_key next_key;
2120        struct btrfs_trans_handle *trans = NULL;
2121        struct btrfs_root *reloc_root;
2122        struct btrfs_root_item *root_item;
2123        struct btrfs_path *path;
2124        struct extent_buffer *leaf;
2125        int level;
2126        int max_level;
2127        int replaced = 0;
2128        int ret;
2129        int err = 0;
2130        u32 min_reserved;
2131
2132        path = btrfs_alloc_path();
2133        if (!path)
2134                return -ENOMEM;
2135        path->reada = READA_FORWARD;
2136
2137        reloc_root = root->reloc_root;
2138        root_item = &reloc_root->root_item;
2139
2140        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2141                level = btrfs_root_level(root_item);
2142                extent_buffer_get(reloc_root->node);
2143                path->nodes[level] = reloc_root->node;
2144                path->slots[level] = 0;
2145        } else {
2146                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2147
2148                level = root_item->drop_level;
2149                BUG_ON(level == 0);
2150                path->lowest_level = level;
2151                ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
2152                path->lowest_level = 0;
2153                if (ret < 0) {
2154                        btrfs_free_path(path);
2155                        return ret;
2156                }
2157
2158                btrfs_node_key_to_cpu(path->nodes[level], &next_key,
2159                                      path->slots[level]);
2160                WARN_ON(memcmp(&key, &next_key, sizeof(key)));
2161
2162                btrfs_unlock_up_safe(path, 0);
2163        }
2164
2165        min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2166        memset(&next_key, 0, sizeof(next_key));
2167
2168        while (1) {
2169                ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2170                                             BTRFS_RESERVE_FLUSH_ALL);
2171                if (ret) {
2172                        err = ret;
2173                        goto out;
2174                }
2175                trans = btrfs_start_transaction(root, 0);
2176                if (IS_ERR(trans)) {
2177                        err = PTR_ERR(trans);
2178                        trans = NULL;
2179                        goto out;
2180                }
2181                trans->block_rsv = rc->block_rsv;
2182
2183                replaced = 0;
2184                max_level = level;
2185
2186                ret = walk_down_reloc_tree(reloc_root, path, &level);
2187                if (ret < 0) {
2188                        err = ret;
2189                        goto out;
2190                }
2191                if (ret > 0)
2192                        break;
2193
2194                if (!find_next_key(path, level, &key) &&
2195                    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
2196                        ret = 0;
2197                } else {
2198                        ret = replace_path(trans, root, reloc_root, path,
2199                                           &next_key, level, max_level);
2200                }
2201                if (ret < 0) {
2202                        err = ret;
2203                        goto out;
2204                }
2205
2206                if (ret > 0) {
2207                        level = ret;
2208                        btrfs_node_key_to_cpu(path->nodes[level], &key,
2209                                              path->slots[level]);
2210                        replaced = 1;
2211                }
2212
2213                ret = walk_up_reloc_tree(reloc_root, path, &level);
2214                if (ret > 0)
2215                        break;
2216
2217                BUG_ON(level == 0);
2218                /*
2219                 * save the merging progress in the drop_progress.
2220                 * this is OK since root refs == 1 in this case.
2221                 */
2222                btrfs_node_key(path->nodes[level], &root_item->drop_progress,
2223                               path->slots[level]);
2224                root_item->drop_level = level;
2225
2226                btrfs_end_transaction_throttle(trans, root);
2227                trans = NULL;
2228
2229                btrfs_btree_balance_dirty(root);
2230
2231                if (replaced && rc->stage == UPDATE_DATA_PTRS)
2232                        invalidate_extent_cache(root, &key, &next_key);
2233        }
2234
2235        /*
2236         * handle the case where only one block in the fs tree needs to be
2237         * relocated and that block is the tree root.
2238         */
2239        leaf = btrfs_lock_root_node(root);
2240        ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
2241        btrfs_tree_unlock(leaf);
2242        free_extent_buffer(leaf);
2243        if (ret < 0)
2244                err = ret;
2245out:
2246        btrfs_free_path(path);
2247
2248        if (err == 0) {
2249                memset(&root_item->drop_progress, 0,
2250                       sizeof(root_item->drop_progress));
2251                root_item->drop_level = 0;
2252                btrfs_set_root_refs(root_item, 0);
2253                btrfs_update_reloc_root(trans, root);
2254        }
2255
2256        if (trans)
2257                btrfs_end_transaction_throttle(trans, root);
2258
2259        btrfs_btree_balance_dirty(root);
2260
2261        if (replaced && rc->stage == UPDATE_DATA_PTRS)
2262                invalidate_extent_cache(root, &key, &next_key);
2263
2264        return err;
2265}
2266
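    /*
     * prepare to merge the reloc trees: reserve space for the merge, set
     * the reference count of every reloc root to 1 so that
     * btrfs_recover_relocation knows it should resume merging, and commit
     * the transaction to get the updated root items on disk.
     */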
2267static noinline_for_stack
2268int prepare_to_merge(struct reloc_control *rc, int err)
2269{
2270        struct btrfs_root *root = rc->extent_root;
2271        struct btrfs_root *reloc_root;
2272        struct btrfs_trans_handle *trans;
2273        LIST_HEAD(reloc_roots);
2274        u64 num_bytes = 0;
2275        int ret;
2276
2277        mutex_lock(&root->fs_info->reloc_mutex);
2278        rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2279        rc->merging_rsv_size += rc->nodes_relocated * 2;
2280        mutex_unlock(&root->fs_info->reloc_mutex);
2281
2282again:
2283        if (!err) {
2284                num_bytes = rc->merging_rsv_size;
2285                ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2286                                          BTRFS_RESERVE_FLUSH_ALL);
2287                if (ret)
2288                        err = ret;
2289        }
2290
2291        trans = btrfs_join_transaction(rc->extent_root);
2292        if (IS_ERR(trans)) {
2293                if (!err)
2294                        btrfs_block_rsv_release(rc->extent_root,
2295                                                rc->block_rsv, num_bytes);
2296                return PTR_ERR(trans);
2297        }
2298
2299        if (!err) {
2300                if (num_bytes != rc->merging_rsv_size) {
2301                        btrfs_end_transaction(trans, rc->extent_root);
2302                        btrfs_block_rsv_release(rc->extent_root,
2303                                                rc->block_rsv, num_bytes);
2304                        goto again;
2305                }
2306        }
2307
2308        rc->merge_reloc_tree = 1;
2309
2310        while (!list_empty(&rc->reloc_roots)) {
2311                reloc_root = list_entry(rc->reloc_roots.next,
2312                                        struct btrfs_root, root_list);
2313                list_del_init(&reloc_root->root_list);
2314
2315                root = read_fs_root(reloc_root->fs_info,
2316                                    reloc_root->root_key.offset);
2317                BUG_ON(IS_ERR(root));
2318                BUG_ON(root->reloc_root != reloc_root);
2319
2320                /*
2321                 * set the reference count to 1, so btrfs_recover_relocation
2322                 * knows it should resume merging
2323                 */
2324                if (!err)
2325                        btrfs_set_root_refs(&reloc_root->root_item, 1);
2326                btrfs_update_reloc_root(trans, root);
2327
2328                list_add(&reloc_root->root_list, &reloc_roots);
2329        }
2330
2331        list_splice(&reloc_roots, &rc->reloc_roots);
2332
2333        if (!err)
2334                btrfs_commit_transaction(trans, rc->extent_root);
2335        else
2336                btrfs_end_transaction(trans, rc->extent_root);
2337        return err;
2338}
2339
2340static noinline_for_stack
2341void free_reloc_roots(struct list_head *list)
2342{
2343        struct btrfs_root *reloc_root;
2344
2345        while (!list_empty(list)) {
2346                reloc_root = list_entry(list->next, struct btrfs_root,
2347                                        root_list);
2348                __del_reloc_root(reloc_root);
2349        }
2350}
2351
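    /*
     * merge all reloc trees with their corresponding fs trees, then drop
     * the reloc trees.
     */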
2352static noinline_for_stack
2353void merge_reloc_roots(struct reloc_control *rc)
2354{
2355        struct btrfs_root *root;
2356        struct btrfs_root *reloc_root;
2357        u64 last_snap;
2358        u64 otransid;
2359        u64 objectid;
2360        LIST_HEAD(reloc_roots);
2361        int found = 0;
2362        int ret = 0;
2363again:
2364        root = rc->extent_root;
2365
2366        /*
2367         * this serializes us with btrfs_record_root_in_transaction;
2368         * we have to make sure nobody is in the middle of
2369         * adding their roots to the list while we are
2370         * doing this splice
2371         */
2372        mutex_lock(&root->fs_info->reloc_mutex);
2373        list_splice_init(&rc->reloc_roots, &reloc_roots);
2374        mutex_unlock(&root->fs_info->reloc_mutex);
2375
2376        while (!list_empty(&reloc_roots)) {
2377                found = 1;
2378                reloc_root = list_entry(reloc_roots.next,
2379                                        struct btrfs_root, root_list);
2380
2381                if (btrfs_root_refs(&reloc_root->root_item) > 0) {
2382                        root = read_fs_root(reloc_root->fs_info,
2383                                            reloc_root->root_key.offset);
2384                        BUG_ON(IS_ERR(root));
2385                        BUG_ON(root->reloc_root != reloc_root);
2386
2387                        ret = merge_reloc_root(rc, root);
2388                        if (ret) {
2389                                if (list_empty(&reloc_root->root_list))
2390                                        list_add_tail(&reloc_root->root_list,
2391                                                      &reloc_roots);
2392                                goto out;
2393                        }
2394                } else {
2395                        list_del_init(&reloc_root->root_list);
2396                }
2397
2398                /*
2399                 * we keep the old last snapshot transid in rtransid when we
2400                 * created the relocation tree.
2401                 */
2402                last_snap = btrfs_root_rtransid(&reloc_root->root_item);
2403                otransid = btrfs_root_otransid(&reloc_root->root_item);
2404                objectid = reloc_root->root_key.offset;
2405
2406                ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2407                if (ret < 0) {
2408                        if (list_empty(&reloc_root->root_list))
2409                                list_add_tail(&reloc_root->root_list,
2410                                              &reloc_roots);
2411                        goto out;
2412                }
2413        }
2414
2415        if (found) {
2416                found = 0;
2417                goto again;
2418        }
2419out:
2420        if (ret) {
2421                btrfs_std_error(root->fs_info, ret, NULL);
2422                if (!list_empty(&reloc_roots))
2423                        free_reloc_roots(&reloc_roots);
2424
2425                /* new reloc root may be added */
2426                mutex_lock(&root->fs_info->reloc_mutex);
2427                list_splice_init(&rc->reloc_roots, &reloc_roots);
2428                mutex_unlock(&root->fs_info->reloc_mutex);
2429                if (!list_empty(&reloc_roots))
2430                        free_reloc_roots(&reloc_roots);
2431        }
2432
2433        BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2434}
2435
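    /*
     * free all tree_block entries in the rb tree and leave it empty
     */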
2436static void free_block_list(struct rb_root *blocks)
2437{
2438        struct tree_block *block;
2439        struct rb_node *rb_node;
2440        while ((rb_node = rb_first(blocks))) {
2441                block = rb_entry(rb_node, struct tree_block, rb_node);
2442                rb_erase(rb_node, blocks);
2443                kfree(block);
2444        }
2445}
2446
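    /*
     * if @reloc_root hasn't been seen in this transaction yet, record the
     * fs tree it belongs to in the current transaction.
     */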
2447static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2448                                      struct btrfs_root *reloc_root)
2449{
2450        struct btrfs_root *root;
2451
2452        if (reloc_root->last_trans == trans->transid)
2453                return 0;
2454
2455        root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
2456        BUG_ON(IS_ERR(root));
2457        BUG_ON(root->reloc_root != reloc_root);
2458
2459        return btrfs_record_root_in_trans(trans, root);
2460}
2461
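    /*
     * walk up the backref cache from @node to find the tree root that owns
     * the block, record that root in the transaction and return the reloc
     * root that should be used to COW the block. the backref path is also
     * cached for btrfs_reloc_cow_block.
     */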
2462static noinline_for_stack
2463struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2464                                     struct reloc_control *rc,
2465                                     struct backref_node *node,
2466                                     struct backref_edge *edges[])
2467{
2468        struct backref_node *next;
2469        struct btrfs_root *root;
2470        int index = 0;
2471
2472        next = node;
2473        while (1) {
2474                cond_resched();
2475                next = walk_up_backref(next, edges, &index);
2476                root = next->root;
2477                BUG_ON(!root);
2478                BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
2479
2480                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2481                        record_reloc_root_in_trans(trans, root);
2482                        break;
2483                }
2484
2485                btrfs_record_root_in_trans(trans, root);
2486                root = root->reloc_root;
2487
2488                if (next->new_bytenr != root->node->start) {
2489                        BUG_ON(next->new_bytenr);
2490                        BUG_ON(!list_empty(&next->list));
2491                        next->new_bytenr = root->node->start;
2492                        next->root = root;
2493                        list_add_tail(&next->list,
2494                                      &rc->backref_cache.changed);
2495                        __mark_block_processed(rc, next);
2496                        break;
2497                }
2498
2499                WARN_ON(1);
2500                root = NULL;
2501                next = walk_down_backref(edges, &index);
2502                if (!next || next->level <= node->level)
2503                        break;
2504        }
2505        if (!root)
2506                return NULL;
2507
2508        next = node;
2509        /* set up the backref node path for btrfs_reloc_cow_block */
2510        while (1) {
2511                rc->backref_cache.path[next->level] = next;
2512                if (--index < 0)
2513                        break;
2514                next = edges[index]->node[UPPER];
2515        }
2516        return root;
2517}
2518
2519/*
2520 * select a tree root for relocation. return NULL if the block
2521 * is reference counted. we should use do_relocation() in this
2522 * case. return a tree root pointer if the block isn't reference
2523 * counted. return -ENOENT if the block is the root of a reloc tree.
2524 */
2525static noinline_for_stack
2526struct btrfs_root *select_one_root(struct backref_node *node)
2527{
2528        struct backref_node *next;
2529        struct btrfs_root *root;
2530        struct btrfs_root *fs_root = NULL;
2531        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2532        int index = 0;
2533
2534        next = node;
2535        while (1) {
2536                cond_resched();
2537                next = walk_up_backref(next, edges, &index);
2538                root = next->root;
2539                BUG_ON(!root);
2540
2541                /* no other choice for a non-reference counted tree */
2542                if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
2543                        return root;
2544
2545                if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2546                        fs_root = root;
2547
2548                if (next != node)
2549                        return NULL;
2550
2551                next = walk_down_backref(edges, &index);
2552                if (!next || next->level <= node->level)
2553                        break;
2554        }
2555
2556        if (!fs_root)
2557                return ERR_PTR(-ENOENT);
2558        return fs_root;
2559}
2560
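    /*
     * estimate how much metadata may need to be COWed in order to relocate
     * @node: one tree block for each not-yet-processed block on the backref
     * paths leading from @node up to the tree roots.
     */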
2561static noinline_for_stack
2562u64 calcu_metadata_size(struct reloc_control *rc,
2563                        struct backref_node *node, int reserve)
2564{
2565        struct backref_node *next = node;
2566        struct backref_edge *edge;
2567        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2568        u64 num_bytes = 0;
2569        int index = 0;
2570
2571        BUG_ON(reserve && node->processed);
2572
2573        while (next) {
2574                cond_resched();
2575                while (1) {
2576                        if (next->processed && (reserve || next != node))
2577                                break;
2578
2579                        num_bytes += rc->extent_root->nodesize;
2580
2581                        if (list_empty(&next->upper))
2582                                break;
2583
2584                        edge = list_entry(next->upper.next,
2585                                          struct backref_edge, list[LOWER]);
2586                        edges[index++] = edge;
2587                        next = edge->node[UPPER];
2588                }
2589                next = walk_down_backref(edges, &index);
2590        }
2591        return num_bytes;
2592}
2593
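    /*
     * reserve metadata space for relocating @node. the estimate is doubled
     * because both the reloc tree and the fs tree may need to be COWed. on
     * -EAGAIN the target size of the block reserve is grown before the
     * error is returned to the caller.
     */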
2594static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2595                                  struct reloc_control *rc,
2596                                  struct backref_node *node)
2597{
2598        struct btrfs_root *root = rc->extent_root;
2599        u64 num_bytes;
2600        int ret;
2601        u64 tmp;
2602
2603        num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2604
2605        trans->block_rsv = rc->block_rsv;
2606        rc->reserved_bytes += num_bytes;
2607        ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
2608                                BTRFS_RESERVE_FLUSH_ALL);
2609        if (ret) {
2610                if (ret == -EAGAIN) {
2611                        tmp = rc->extent_root->nodesize *
2612                                RELOCATION_RESERVED_NODES;
2613                        while (tmp <= rc->reserved_bytes)
2614                                tmp <<= 1;
2615                        /*
2616                         * only one thread can access block_rsv at this point,
2617                         * so we don't need to hold a lock to protect block_rsv.
2618                         * we expand the reservation size here to allow enough
2619                         * space for relocation, and we will return earlier in
2620                         * the enospc case.
2621                         */
2622                        rc->block_rsv->size = tmp + rc->extent_root->nodesize *
2623                                              RELOCATION_RESERVED_NODES;
2624                }
2625                return ret;
2626        }
2627
2628        return 0;
2629}
2630
2631/*
2632 * relocate a tree block, and then update pointers in upper level
2633 * blocks that reference the block to point to the new location.
2634 *
2635 * if called by link_to_upper, the block has already been relocated.
2636 * in that case this function just updates pointers.
2637 */
2638static int do_relocation(struct btrfs_trans_handle *trans,
2639                         struct reloc_control *rc,
2640                         struct backref_node *node,
2641                         struct btrfs_key *key,
2642                         struct btrfs_path *path, int lowest)
2643{
2644        struct backref_node *upper;
2645        struct backref_edge *edge;
2646        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2647        struct btrfs_root *root;
2648        struct extent_buffer *eb;
2649        u32 blocksize;
2650        u64 bytenr;
2651        u64 generation;
2652        int slot;
2653        int ret;
2654        int err = 0;
2655
2656        BUG_ON(lowest && node->eb);
2657
2658        path->lowest_level = node->level + 1;
2659        rc->backref_cache.path[node->level] = node;
2660        list_for_each_entry(edge, &node->upper, list[LOWER]) {
2661                cond_resched();
2662
2663                upper = edge->node[UPPER];
2664                root = select_reloc_root(trans, rc, upper, edges);
2665                BUG_ON(!root);
2666
2667                if (upper->eb && !upper->locked) {
2668                        if (!lowest) {
2669                                ret = btrfs_bin_search(upper->eb, key,
2670                                                       upper->level, &slot);
2671                                BUG_ON(ret);
2672                                bytenr = btrfs_node_blockptr(upper->eb, slot);
2673                                if (node->eb->start == bytenr)
2674                                        goto next;
2675                        }
2676                        drop_node_buffer(upper);
2677                }
2678
2679                if (!upper->eb) {
2680                        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2681                        if (ret < 0) {
2682                                err = ret;
2683                                break;
2684                        }
2685                        BUG_ON(ret > 0);
2686
2687                        if (!upper->eb) {
2688                                upper->eb = path->nodes[upper->level];
2689                                path->nodes[upper->level] = NULL;
2690                        } else {
2691                                BUG_ON(upper->eb != path->nodes[upper->level]);
2692                        }
2693
2694                        upper->locked = 1;
2695                        path->locks[upper->level] = 0;
2696
2697                        slot = path->slots[upper->level];
2698                        btrfs_release_path(path);
2699                } else {
2700                        ret = btrfs_bin_search(upper->eb, key, upper->level,
2701                                               &slot);
2702                        BUG_ON(ret);
2703                }
2704
2705                bytenr = btrfs_node_blockptr(upper->eb, slot);
2706                if (lowest) {
2707                        BUG_ON(bytenr != node->bytenr);
2708                } else {
2709                        if (node->eb->start == bytenr)
2710                                goto next;
2711                }
2712
2713                blocksize = root->nodesize;
2714                generation = btrfs_node_ptr_generation(upper->eb, slot);
2715                eb = read_tree_block(root, bytenr, generation);
2716                if (IS_ERR(eb)) {
2717                        err = PTR_ERR(eb);
2718                        goto next;
2719                } else if (!extent_buffer_uptodate(eb)) {
2720                        free_extent_buffer(eb);
2721                        err = -EIO;
2722                        goto next;
2723                }
2724                btrfs_tree_lock(eb);
2725                btrfs_set_lock_blocking(eb);
2726
2727                if (!node->eb) {
2728                        ret = btrfs_cow_block(trans, root, eb, upper->eb,
2729                                              slot, &eb);
2730                        btrfs_tree_unlock(eb);
2731                        free_extent_buffer(eb);
2732                        if (ret < 0) {
2733                                err = ret;
2734                                goto next;
2735                        }
2736                        BUG_ON(node->eb != eb);
2737                } else {
2738                        btrfs_set_node_blockptr(upper->eb, slot,
2739                                                node->eb->start);
2740                        btrfs_set_node_ptr_generation(upper->eb, slot,
2741                                                      trans->transid);
2742                        btrfs_mark_buffer_dirty(upper->eb);
2743
2744                        ret = btrfs_inc_extent_ref(trans, root,
2745                                                node->eb->start, blocksize,
2746                                                upper->eb->start,
2747                                                btrfs_header_owner(upper->eb),
2748                                                node->level, 0);
2749                        BUG_ON(ret);
2750
2751                        ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2752                        BUG_ON(ret);
2753                }
2754next:
2755                if (!upper->pending)
2756                        drop_node_buffer(upper);
2757                else
2758                        unlock_node_buffer(upper);
2759                if (err)
2760                        break;
2761        }
2762
2763        if (!err && node->pending) {
2764                drop_node_buffer(node);
2765                list_move_tail(&node->list, &rc->backref_cache.changed);
2766                node->pending = 0;
2767        }
2768
2769        path->lowest_level = 0;
2770        BUG_ON(err == -ENOSPC);
2771        return err;
2772}
2773
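    /*
     * update the pointers in the upper level blocks that reference @node.
     * the block itself has already been relocated, so this only relinks it
     * to its parents via do_relocation().
     */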
2774static int link_to_upper(struct btrfs_trans_handle *trans,
2775                         struct reloc_control *rc,
2776                         struct backref_node *node,
2777                         struct btrfs_path *path)
2778{
2779        struct btrfs_key key;
2780
2781        btrfs_node_key_to_cpu(node->eb, &key, 0);
2782        return do_relocation(trans, rc, node, &key, path, 0);
2783}
2784
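    /*
     * link every backref node on the pending lists to its upper level
     * blocks, updating the parent pointers to the new block locations.
     */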
2785static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2786                                struct reloc_control *rc,
2787                                struct btrfs_path *path, int err)
2788{
2789        LIST_HEAD(list);
2790        struct backref_cache *cache = &rc->backref_cache;
2791        struct backref_node *node;
2792        int level;
2793        int ret;
2794
2795        for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2796                while (!list_empty(&cache->pending[level])) {
2797                        node = list_entry(cache->pending[level].next,
2798                                          struct backref_node, list);
2799                        list_move_tail(&node->list, &list);
2800                        BUG_ON(!node->pending);
2801
2802                        if (!err) {
2803                                ret = link_to_upper(trans, rc, node, path);
2804                                if (ret < 0)
2805                                        err = ret;
2806                        }
2807                }
2808                list_splice_init(&list, &cache->pending[level]);
2809        }
2810        return err;
2811}
2812
2813static void mark_block_processed(struct reloc_control *rc,
2814                                 u64 bytenr, u32 blocksize)
2815{
2816        set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2817                        EXTENT_DIRTY, GFP_NOFS);
2818}
2819
2820static void __mark_block_processed(struct reloc_control *rc,
2821                                   struct backref_node *node)
2822{
2823        u32 blocksize;
2824        if (node->level == 0 ||
2825            in_block_group(node->bytenr, rc->block_group)) {
2826                blocksize = rc->extent_root->nodesize;
2827                mark_block_processed(rc, node->bytenr, blocksize);
2828        }
2829        node->processed = 1;
2830}
2831
2832/*
2833 * mark a block and all blocks that directly/indirectly reference the
2834 * block as processed.
2835 */
2836static void update_processed_blocks(struct reloc_control *rc,
2837                                    struct backref_node *node)
2838{
2839        struct backref_node *next = node;
2840        struct backref_edge *edge;
2841        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2842        int index = 0;
2843
2844        while (next) {
2845                cond_resched();
2846                while (1) {
2847                        if (next->processed)
2848                                break;
2849
2850                        __mark_block_processed(rc, next);
2851
2852                        if (list_empty(&next->upper))
2853                                break;
2854
2855                        edge = list_entry(next->upper.next,
2856                                          struct backref_edge, list[LOWER]);
2857                        edges[index++] = edge;
2858                        next = edge->node[UPPER];
2859                }
2860                next = walk_down_backref(edges, &index);
2861        }
2862}
2863
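    /*
     * check whether the tree block at @bytenr has already been processed
     */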
2864static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
2865{
2866        u32 blocksize = rc->extent_root->nodesize;
2867
2868        if (test_range_bit(&rc->processed_blocks, bytenr,
2869                           bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2870                return 1;
2871        return 0;
2872}
2873
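    /*
     * read the tree block described by @block and cache its first key,
     * marking the block's key as ready.
     */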
2874static int get_tree_block_key(struct reloc_control *rc,
2875                              struct tree_block *block)
2876{
2877        struct extent_buffer *eb;
2878
2879        BUG_ON(block->key_ready);
2880        eb = read_tree_block(rc->extent_root, block->bytenr,
2881                             block->key.offset);
2882        if (IS_ERR(eb)) {
2883                return PTR_ERR(eb);
2884        } else if (!extent_buffer_uptodate(eb)) {
2885                free_extent_buffer(eb);
2886                return -EIO;
2887        }
2888        WARN_ON(btrfs_header_level(eb) != block->level);
2889        if (block->level == 0)
2890                btrfs_item_key_to_cpu(eb, &block->key, 0);
2891        else
2892                btrfs_node_key_to_cpu(eb, &block->key, 0);
2893        free_extent_buffer(eb);
2894        block->key_ready = 1;
2895        return 0;
2896}
2897
2898/*
2899 * helper function to relocate a tree block
2900 */
2901static int relocate_tree_block(struct btrfs_trans_handle *trans,
2902                                struct reloc_control *rc,
2903                                struct backref_node *node,
2904                                struct btrfs_key *key,
2905                                struct btrfs_path *path)
2906{
2907        struct btrfs_root *root;
2908        int ret = 0;
2909
2910        if (!node)
2911                return 0;
2912
2913        BUG_ON(node->processed);
2914        root = select_one_root(node);
2915        if (root == ERR_PTR(-ENOENT)) {
2916                update_processed_blocks(rc, node);
2917                goto out;
2918        }
2919
2920        if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
2921                ret = reserve_metadata_space(trans, rc, node);
2922                if (ret)
2923                        goto out;
2924        }
2925
2926        if (root) {
2927                if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
2928                        BUG_ON(node->new_bytenr);
2929                        BUG_ON(!list_empty(&node->list));
2930                        btrfs_record_root_in_trans(trans, root);
2931                        root = root->reloc_root;
2932                        node->new_bytenr = root->node->start;
2933                        node->root = root;
2934                        list_add_tail(&node->list, &rc->backref_cache.changed);
2935                } else {
2936                        path->lowest_level = node->level;
2937                        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2938                        btrfs_release_path(path);
2939                        if (ret > 0)
2940                                ret = 0;
2941                }
2942                if (!ret)
2943                        update_processed_blocks(rc, node);
2944        } else {
2945                ret = do_relocation(trans, rc, node, key, path, 1);
2946        }
2947out:
2948        if (ret || node->level == 0 || node->cowonly)
2949                remove_backref_node(&rc->backref_cache, node);
2950        return ret;
2951}
2952
2953/*
2954 * relocate a list of blocks
2955 */
2956static noinline_for_stack
2957int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2958                         struct reloc_control *rc, struct rb_root *blocks)
2959{
2960        struct backref_node *node;
2961        struct btrfs_path *path;
2962        struct tree_block *block;
2963        struct rb_node *rb_node;
2964        int ret;
2965        int err = 0;
2966
2967        path = btrfs_alloc_path();
2968        if (!path) {
2969                err = -ENOMEM;
2970                goto out_free_blocks;
2971        }
2972
2973        rb_node = rb_first(blocks);
2974        while (rb_node) {
2975                block = rb_entry(rb_node, struct tree_block, rb_node);
2976                if (!block->key_ready)
2977                        readahead_tree_block(rc->extent_root, block->bytenr);
2978                rb_node = rb_next(rb_node);
2979        }
2980
2981        rb_node = rb_first(blocks);
2982        while (rb_node) {
2983                block = rb_entry(rb_node, struct tree_block, rb_node);
2984                if (!block->key_ready) {
2985                        err = get_tree_block_key(rc, block);
2986                        if (err)
2987                                goto out_free_path;
2988                }
2989                rb_node = rb_next(rb_node);
2990        }
2991
2992        rb_node = rb_first(blocks);
2993        while (rb_node) {
2994                block = rb_entry(rb_node, struct tree_block, rb_node);
2995
2996                node = build_backref_tree(rc, &block->key,
2997                                          block->level, block->bytenr);
2998                if (IS_ERR(node)) {
2999                        err = PTR_ERR(node);
3000                        goto out;
3001                }
3002
3003                ret = relocate_tree_block(trans, rc, node, &block->key,
3004                                          path);
3005                if (ret < 0) {
3006                        if (ret != -EAGAIN || rb_node == rb_first(blocks))
3007                                err = ret;
3008                        goto out;
3009                }
3010                rb_node = rb_next(rb_node);
3011        }
3012out:
3013        err = finish_pending_nodes(trans, rc, path, err);
3014
3015out_free_path:
3016        btrfs_free_path(path);
3017out_free_blocks:
3018        free_block_list(blocks);
3019        return err;
3020}
3021
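    /*
     * preallocate file extents in the data relocation inode covering the
     * whole cluster.  cluster->boundary[] holds the start bytenr of each
     * original extent and index_cnt is the start of the block group, so
     * subtracting the offset maps disk bytenrs to file offsets.
     */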
3022static noinline_for_stack
3023int prealloc_file_extent_cluster(struct inode *inode,
3024                                 struct file_extent_cluster *cluster)
3025{
3026        u64 alloc_hint = 0;
3027        u64 start;
3028        u64 end;
3029        u64 offset = BTRFS_I(inode)->index_cnt;
3030        u64 num_bytes;
3031        int nr = 0;
3032        int ret = 0;
3033
3034        BUG_ON(cluster->start != cluster->boundary[0]);
3035        inode_lock(inode);
3036
3037        ret = btrfs_check_data_free_space(inode, cluster->start,
3038                                          cluster->end + 1 - cluster->start);
3039        if (ret)
3040                goto out;
3041
3042        while (nr < cluster->nr) {
3043                start = cluster->boundary[nr] - offset;
3044                if (nr + 1 < cluster->nr)
3045                        end = cluster->boundary[nr + 1] - 1 - offset;
3046                else
3047                        end = cluster->end - offset;
3048
3049                lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3050                num_bytes = end + 1 - start;
3051                ret = btrfs_prealloc_file_range(inode, 0, start,
3052                                                num_bytes, num_bytes,
3053                                                end + 1, &alloc_hint);
3054                unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3055                if (ret)
3056                        break;
3057                nr++;
3058        }
3059        btrfs_free_reserved_data_space(inode, cluster->start,
3060                                       cluster->end + 1 - cluster->start);
3061out:
3062        inode_unlock(inode);
3063        return ret;
3064}
3065
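    /*
     * insert a pinned extent map for the cluster range pointing at the old
     * data location, dropping any cached extents that collide.  with the
     * mapping pinned, reading the cluster pages pulls the existing data,
     * which can then be written back into the preallocated space.
     */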
3066static noinline_for_stack
3067int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
3068                         u64 block_start)
3069{
3070        struct btrfs_root *root = BTRFS_I(inode)->root;
3071        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3072        struct extent_map *em;
3073        int ret = 0;
3074
3075        em = alloc_extent_map();
3076        if (!em)
3077                return -ENOMEM;
3078
3079        em->start = start;
3080        em->len = end + 1 - start;
3081        em->block_len = em->len;
3082        em->block_start = block_start;
3083        em->bdev = root->fs_info->fs_devices->latest_bdev;
3084        set_bit(EXTENT_FLAG_PINNED, &em->flags);
3085
3086        lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3087        while (1) {
3088                write_lock(&em_tree->lock);
3089                ret = add_extent_mapping(em_tree, em, 0);
3090                write_unlock(&em_tree->lock);
3091                if (ret != -EEXIST) {
3092                        free_extent_map(em);
3093                        break;
3094                }
3095                btrfs_drop_extent_cache(inode, start, end, 0);
3096        }
3097        unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3098        return ret;
3099}
3100
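    /*
     * copy the data of a cluster through the page cache: read every page
     * backed by the pinned mapping from setup_extent_mapping(), mark it
     * delalloc and dirty, and let writeback place the data at its new
     * location.  EXTENT_BOUNDARY is set at each cluster->boundary[] so the
     * original extent boundaries are kept when the pages are written out.
     */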
3101static int relocate_file_extent_cluster(struct inode *inode,
3102                                        struct file_extent_cluster *cluster)
3103{
3104        u64 page_start;
3105        u64 page_end;
3106        u64 offset = BTRFS_I(inode)->index_cnt;
3107        unsigned long index;
3108        unsigned long last_index;
3109        struct page *page;
3110        struct file_ra_state *ra;
3111        gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
3112        int nr = 0;
3113        int ret = 0;
3114
3115        if (!cluster->nr)
3116                return 0;
3117
3118        ra = kzalloc(sizeof(*ra), GFP_NOFS);
3119        if (!ra)
3120                return -ENOMEM;
3121
3122        ret = prealloc_file_extent_cluster(inode, cluster);
3123        if (ret)
3124                goto out;
3125
3126        file_ra_state_init(ra, inode->i_mapping);
3127
3128        ret = setup_extent_mapping(inode, cluster->start - offset,
3129                                   cluster->end - offset, cluster->start);
3130        if (ret)
3131                goto out;
3132
3133        index = (cluster->start - offset) >> PAGE_SHIFT;
3134        last_index = (cluster->end - offset) >> PAGE_SHIFT;
3135        while (index <= last_index) {
3136                ret = btrfs_delalloc_reserve_metadata(inode, PAGE_SIZE);
3137                if (ret)
3138                        goto out;
3139
3140                page = find_lock_page(inode->i_mapping, index);
3141                if (!page) {
3142                        page_cache_sync_readahead(inode->i_mapping,
3143                                                  ra, NULL, index,
3144                                                  last_index + 1 - index);
3145                        page = find_or_create_page(inode->i_mapping, index,
3146                                                   mask);
3147                        if (!page) {
3148                                btrfs_delalloc_release_metadata(inode,
3149                                                        PAGE_SIZE);
3150                                ret = -ENOMEM;
3151                                goto out;
3152                        }
3153                }
3154
3155                if (PageReadahead(page)) {
3156                        page_cache_async_readahead(inode->i_mapping,
3157                                                   ra, NULL, page, index,
3158                                                   last_index + 1 - index);
3159                }
3160
3161                if (!PageUptodate(page)) {
3162                        btrfs_readpage(NULL, page);
3163                        lock_page(page);
3164                        if (!PageUptodate(page)) {
3165                                unlock_page(page);
3166                                put_page(page);
3167                                btrfs_delalloc_release_metadata(inode,
3168                                                        PAGE_SIZE);
3169                                ret = -EIO;
3170                                goto out;
3171                        }
3172                }
3173
3174                page_start = page_offset(page);
3175                page_end = page_start + PAGE_SIZE - 1;
3176
3177                lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
3178
3179                set_page_extent_mapped(page);
3180
3181                if (nr < cluster->nr &&
3182                    page_start + offset == cluster->boundary[nr]) {
3183                        set_extent_bits(&BTRFS_I(inode)->io_tree,
3184                                        page_start, page_end,
3185                                        EXTENT_BOUNDARY, GFP_NOFS);
3186                        nr++;
3187                }
3188
3189                btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
3190                set_page_dirty(page);
3191
3192                unlock_extent(&BTRFS_I(inode)->io_tree,
3193                              page_start, page_end);
3194                unlock_page(page);
3195                put_page(page);
3196
3197                index++;
3198                balance_dirty_pages_ratelimited(inode->i_mapping);
3199                btrfs_throttle(BTRFS_I(inode)->root);
3200        }
3201        WARN_ON(nr != cluster->nr);
3202out:
3203        kfree(ra);
3204        return ret;
3205}
3206
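    /*
     * accumulate contiguous data extents into the file extent cluster.
     * the cluster is flushed through relocate_file_extent_cluster()
     * whenever the next extent is not adjacent to the previous one or
     * MAX_EXTENTS boundaries have been collected.
     */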
3207static noinline_for_stack
3208int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
3209                         struct file_extent_cluster *cluster)
3210{
3211        int ret;
3212
3213        if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3214                ret = relocate_file_extent_cluster(inode, cluster);
3215                if (ret)
3216                        return ret;
3217                cluster->nr = 0;
3218        }
3219
3220        if (!cluster->nr)
3221                cluster->start = extent_key->objectid;
3222        else
3223                BUG_ON(cluster->nr >= MAX_EXTENTS);
3224        cluster->end = extent_key->objectid + extent_key->offset - 1;
3225        cluster->boundary[cluster->nr] = extent_key->objectid;
3226        cluster->nr++;
3227
3228        if (cluster->nr >= MAX_EXTENTS) {
3229                ret = relocate_file_extent_cluster(inode, cluster);
3230                if (ret)
3231                        return ret;
3232                cluster->nr = 0;
3233        }
3234        return 0;
3235}
3236
3237#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3238static int get_ref_objectid_v0(struct reloc_control *rc,
3239                               struct btrfs_path *path,
3240                               struct btrfs_key *extent_key,
3241                               u64 *ref_objectid, int *path_change)
3242{
3243        struct btrfs_key key;
3244        struct extent_buffer *leaf;
3245        struct btrfs_extent_ref_v0 *ref0;
3246        int ret;
3247        int slot;
3248
3249        leaf = path->nodes[0];
3250        slot = path->slots[0];
3251        while (1) {
3252                if (slot >= btrfs_header_nritems(leaf)) {
3253                        ret = btrfs_next_leaf(rc->extent_root, path);
3254                        if (ret < 0)
3255                                return ret;
3256                        BUG_ON(ret > 0);
3257                        leaf = path->nodes[0];
3258                        slot = path->slots[0];
3259                        if (path_change)
3260                                *path_change = 1;
3261                }
3262                btrfs_item_key_to_cpu(leaf, &key, slot);
3263                if (key.objectid != extent_key->objectid)
3264                        return -ENOENT;
3265
3266                if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
3267                        slot++;
3268                        continue;
3269                }
3270                ref0 = btrfs_item_ptr(leaf, slot,
3271                                struct btrfs_extent_ref_v0);
3272                *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
3273                break;
3274        }
3275        return 0;
3276}
3277#endif
3278
3279/*
3280 * helper to add a tree block to the list.
3281 * the main work is finding the generation and level of the block
3282 */
3283static int add_tree_block(struct reloc_control *rc,
3284                          struct btrfs_key *extent_key,
3285                          struct btrfs_path *path,
3286                          struct rb_root *blocks)
3287{
3288        struct extent_buffer *eb;
3289        struct btrfs_extent_item *ei;
3290        struct btrfs_tree_block_info *bi;
3291        struct tree_block *block;
3292        struct rb_node *rb_node;
3293        u32 item_size;
3294        int level = -1;
3295        u64 generation;
3296
3297        eb =  path->nodes[0];
3298        item_size = btrfs_item_size_nr(eb, path->slots[0]);
3299
3300        if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3301            item_size >= sizeof(*ei) + sizeof(*bi)) {
3302                ei = btrfs_item_ptr(eb, path->slots[0],
3303                                struct btrfs_extent_item);
3304                if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3305                        bi = (struct btrfs_tree_block_info *)(ei + 1);
3306                        level = btrfs_tree_block_level(eb, bi);
3307                } else {
3308                        level = (int)extent_key->offset;
3309                }
3310                generation = btrfs_extent_generation(eb, ei);
3311        } else {
3312#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3313                u64 ref_owner;
3314                int ret;
3315
3316                BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3317                ret = get_ref_objectid_v0(rc, path, extent_key,
3318                                          &ref_owner, NULL);
3319                if (ret < 0)
3320                        return ret;
3321                BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
3322                level = (int)ref_owner;
3323                /* FIXME: get real generation */
3324                generation = 0;
3325#else
3326                BUG();
3327#endif
3328        }
3329
3330        btrfs_release_path(path);
3331
3332        BUG_ON(level == -1);
3333
3334        block = kmalloc(sizeof(*block), GFP_NOFS);
3335        if (!block)
3336                return -ENOMEM;
3337
3338        block->bytenr = extent_key->objectid;
3339        block->key.objectid = rc->extent_root->nodesize;
3340        block->key.offset = generation;
3341        block->level = level;
3342        block->key_ready = 0;
3343
3344        rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
3345        if (rb_node)
3346                backref_tree_panic(rb_node, -EEXIST, block->bytenr);
3347
3348        return 0;
3349}
3350
3351/*
3352 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3353 */
3354static int __add_tree_block(struct reloc_control *rc,
3355                            u64 bytenr, u32 blocksize,
3356                            struct rb_root *blocks)
3357{
3358        struct btrfs_path *path;
3359        struct btrfs_key key;
3360        int ret;
3361        bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
3362                                        SKINNY_METADATA);
3363
3364        if (tree_block_processed(bytenr, rc))
3365                return 0;
3366
3367        if (tree_search(blocks, bytenr))
3368                return 0;
3369
3370        path = btrfs_alloc_path();
3371        if (!path)
3372                return -ENOMEM;
3373again:
3374        key.objectid = bytenr;
3375        if (skinny) {
3376                key.type = BTRFS_METADATA_ITEM_KEY;
3377                key.offset = (u64)-1;
3378        } else {
3379                key.type = BTRFS_EXTENT_ITEM_KEY;
3380                key.offset = blocksize;
3381        }
3382
3383        path->search_commit_root = 1;
3384        path->skip_locking = 1;
3385        ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3386        if (ret < 0)
3387                goto out;
3388
3389        if (ret > 0 && skinny) {
3390                if (path->slots[0]) {
3391                        path->slots[0]--;
3392                        btrfs_item_key_to_cpu(path->nodes[0], &key,
3393                                              path->slots[0]);
3394                        if (key.objectid == bytenr &&
3395                            (key.type == BTRFS_METADATA_ITEM_KEY ||
3396                             (key.type == BTRFS_EXTENT_ITEM_KEY &&
3397                              key.offset == blocksize)))
3398                                ret = 0;
3399                }
3400
3401                if (ret) {
3402                        skinny = false;
3403                        btrfs_release_path(path);
3404                        goto again;
3405                }
3406        }
3407        BUG_ON(ret);
3408
3409        ret = add_tree_block(rc, &key, path, blocks);
3410out:
3411        btrfs_free_path(path);
3412        return ret;
3413}
3414
3415/*
3416 * helper to check if the block uses full backrefs for the pointers in it
3417 */
3418static int block_use_full_backref(struct reloc_control *rc,
3419                                  struct extent_buffer *eb)
3420{
3421        u64 flags;
3422        int ret;
3423
3424        if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
3425            btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
3426                return 1;
3427
3428        ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
3429                                       eb->start, btrfs_header_level(eb), 1,
3430                                       NULL, &flags);
3431        BUG_ON(ret);
3432
3433        if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
3434                ret = 1;
3435        else
3436                ret = 0;
3437        return ret;
3438}
3439
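    /*
     * truncate a free space cache inode (looked up by ino if @inode is
     * NULL).  used to drop the cache of the block group being relocated
     * and to get rid of cache extents found while scanning data refs.
     */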
3440static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3441                                    struct btrfs_block_group_cache *block_group,
3442                                    struct inode *inode,
3443                                    u64 ino)
3444{
3445        struct btrfs_key key;
3446        struct btrfs_root *root = fs_info->tree_root;
3447        struct btrfs_trans_handle *trans;
3448        int ret = 0;
3449
3450        if (inode)
3451                goto truncate;
3452
3453        key.objectid = ino;
3454        key.type = BTRFS_INODE_ITEM_KEY;
3455        key.offset = 0;
3456
3457        inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3458        if (IS_ERR(inode) || is_bad_inode(inode)) {
3459                if (!IS_ERR(inode))
3460                        iput(inode);
3461                return -ENOENT;
3462        }
3463
3464truncate:
3465        ret = btrfs_check_trunc_cache_free_space(root,
3466                                                 &fs_info->global_block_rsv);
3467        if (ret)
3468                goto out;
3469
3470        trans = btrfs_join_transaction(root);
3471        if (IS_ERR(trans)) {
3472                ret = PTR_ERR(trans);
3473                goto out;
3474        }
3475
3476        ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode);
3477
3478        btrfs_end_transaction(trans, root);
3479        btrfs_btree_balance_dirty(root);
3480out:
3481        iput(inode);
3482        return ret;
3483}
3484
3485/*
3486 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
3487 * this function scans the fs tree to find blocks that reference the data extent
3488 */
3489static int find_data_references(struct reloc_control *rc,
3490                                struct btrfs_key *extent_key,
3491                                struct extent_buffer *leaf,
3492                                struct btrfs_extent_data_ref *ref,
3493                                struct rb_root *blocks)
3494{
3495        struct btrfs_path *path;
3496        struct tree_block *block;
3497        struct btrfs_root *root;
3498        struct btrfs_file_extent_item *fi;
3499        struct rb_node *rb_node;
3500        struct btrfs_key key;
3501        u64 ref_root;
3502        u64 ref_objectid;
3503        u64 ref_offset;
3504        u32 ref_count;
3505        u32 nritems;
3506        int err = 0;
3507        int added = 0;
3508        int counted;
3509        int ret;
3510
3511        ref_root = btrfs_extent_data_ref_root(leaf, ref);
3512        ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
3513        ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
3514        ref_count = btrfs_extent_data_ref_count(leaf, ref);
3515
3516        /*
3517         * This is an extent belonging to the free space cache, let's just delete
3518         * it and redo the search.
3519         */
3520        if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3521                ret = delete_block_group_cache(rc->extent_root->fs_info,
3522                                               rc->block_group,
3523                                               NULL, ref_objectid);
3524                if (ret != -ENOENT)
3525                        return ret;
3526                ret = 0;
3527        }
3528
3529        path = btrfs_alloc_path();
3530        if (!path)
3531                return -ENOMEM;
3532        path->reada = READA_FORWARD;
3533
3534        root = read_fs_root(rc->extent_root->fs_info, ref_root);
3535        if (IS_ERR(root)) {
3536                err = PTR_ERR(root);
3537                goto out;
3538        }
3539
3540        key.objectid = ref_objectid;
3541        key.type = BTRFS_EXTENT_DATA_KEY;
3542        if (ref_offset > ((u64)-1 << 32))
3543                key.offset = 0;
3544        else
3545                key.offset = ref_offset;
3546
3547        path->search_commit_root = 1;
3548        path->skip_locking = 1;
3549        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3550        if (ret < 0) {
3551                err = ret;
3552                goto out;
3553        }
3554
3555        leaf = path->nodes[0];
3556        nritems = btrfs_header_nritems(leaf);
3557        /*
3558         * references in tree blocks that use full backrefs are not
3559         * counted here
3560         */
3561        if (block_use_full_backref(rc, leaf))
3562                counted = 0;
3563        else
3564                counted = 1;
3565        rb_node = tree_search(blocks, leaf->start);
3566        if (rb_node) {
3567                if (counted)
3568                        added = 1;
3569                else
3570                        path->slots[0] = nritems;
3571        }
3572
3573        while (ref_count > 0) {
3574                while (path->slots[0] >= nritems) {
3575                        ret = btrfs_next_leaf(root, path);
3576                        if (ret < 0) {
3577                                err = ret;
3578                                goto out;
3579                        }
3580                        if (WARN_ON(ret > 0))
3581                                goto out;
3582
3583                        leaf = path->nodes[0];
3584                        nritems = btrfs_header_nritems(leaf);
3585                        added = 0;
3586
3587                        if (block_use_full_backref(rc, leaf))
3588                                counted = 0;
3589                        else
3590                                counted = 1;
3591                        rb_node = tree_search(blocks, leaf->start);
3592                        if (rb_node) {
3593                                if (counted)
3594                                        added = 1;
3595                                else
3596                                        path->slots[0] = nritems;
3597                        }
3598                }
3599
3600                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3601                if (WARN_ON(key.objectid != ref_objectid ||
3602                    key.type != BTRFS_EXTENT_DATA_KEY))
3603                        break;
3604
3605                fi = btrfs_item_ptr(leaf, path->slots[0],
3606                                    struct btrfs_file_extent_item);
3607
3608                if (btrfs_file_extent_type(leaf, fi) ==
3609                    BTRFS_FILE_EXTENT_INLINE)
3610                        goto next;
3611
3612                if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3613                    extent_key->objectid)
3614                        goto next;
3615
3616                key.offset -= btrfs_file_extent_offset(leaf, fi);
3617                if (key.offset != ref_offset)
3618                        goto next;
3619
3620                if (counted)
3621                        ref_count--;
3622                if (added)
3623                        goto next;
3624
3625                if (!tree_block_processed(leaf->start, rc)) {
3626                        block = kmalloc(sizeof(*block), GFP_NOFS);
3627                        if (!block) {
3628                                err = -ENOMEM;
3629                                break;
3630                        }
3631                        block->bytenr = leaf->start;
3632                        btrfs_item_key_to_cpu(leaf, &block->key, 0);
3633                        block->level = 0;
3634                        block->key_ready = 1;
3635                        rb_node = tree_insert(blocks, block->bytenr,
3636                                              &block->rb_node);
3637                        if (rb_node)
3638                                backref_tree_panic(rb_node, -EEXIST,
3639                                                   block->bytenr);
3640                }
3641                if (counted)
3642                        added = 1;
3643                else
3644                        path->slots[0] = nritems;
3645next:
3646                path->slots[0]++;
3647
3648        }
3649out:
3650        btrfs_free_path(path);
3651        return err;
3652}
3653
3654/*
3655 * helper to find all tree blocks that reference a given data extent
3656 */
3657static noinline_for_stack
3658int add_data_references(struct reloc_control *rc,
3659                        struct btrfs_key *extent_key,
3660                        struct btrfs_path *path,
3661                        struct rb_root *blocks)
3662{
3663        struct btrfs_key key;
3664        struct extent_buffer *eb;
3665        struct btrfs_extent_data_ref *dref;
3666        struct btrfs_extent_inline_ref *iref;
3667        unsigned long ptr;
3668        unsigned long end;
3669        u32 blocksize = rc->extent_root->nodesize;
3670        int ret = 0;
3671        int err = 0;
3672
3673        eb = path->nodes[0];
3674        ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3675        end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3676#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3677        if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3678                ptr = end;
3679        else
3680#endif
3681                ptr += sizeof(struct btrfs_extent_item);
3682
3683        while (ptr < end) {
3684                iref = (struct btrfs_extent_inline_ref *)ptr;
3685                key.type = btrfs_extent_inline_ref_type(eb, iref);
3686                if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3687                        key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3688                        ret = __add_tree_block(rc, key.offset, blocksize,
3689                                               blocks);
3690                } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3691                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3692                        ret = find_data_references(rc, extent_key,
3693                                                   eb, dref, blocks);
3694                } else {
3695                        BUG();
3696                }
3697                if (ret) {
3698                        err = ret;
3699                        goto out;
3700                }
3701                ptr += btrfs_extent_inline_ref_size(key.type);
3702        }
3703        WARN_ON(ptr > end);
3704
3705        while (1) {
3706                cond_resched();
3707                eb = path->nodes[0];
3708                if (path->slots[0] >= btrfs_header_nritems(eb)) {
3709                        ret = btrfs_next_leaf(rc->extent_root, path);
3710                        if (ret < 0) {
3711                                err = ret;
3712                                break;
3713                        }
3714                        if (ret > 0)
3715                                break;
3716                        eb = path->nodes[0];
3717                }
3718
3719                btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3720                if (key.objectid != extent_key->objectid)
3721                        break;
3722
3723#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3724                if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3725                    key.type == BTRFS_EXTENT_REF_V0_KEY) {
3726#else
3727                BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3728                if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3729#endif
3730                        ret = __add_tree_block(rc, key.offset, blocksize,
3731                                               blocks);
3732                } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3733                        dref = btrfs_item_ptr(eb, path->slots[0],
3734                                              struct btrfs_extent_data_ref);
3735                        ret = find_data_references(rc, extent_key,
3736                                                   eb, dref, blocks);
3737                } else {
3738                        ret = 0;
3739                }
3740                if (ret) {
3741                        err = ret;
3742                        break;
3743                }
3744                path->slots[0]++;
3745        }
3746out:
3747        btrfs_release_path(path);
3748        if (err)
3749                free_block_list(blocks);
3750        return err;
3751}
3752
3753/*
3754 * helper to find next unprocessed extent
3755 */
3756static noinline_for_stack
3757int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3758                     struct btrfs_key *extent_key)
3759{
3760        struct btrfs_key key;
3761        struct extent_buffer *leaf;
3762        u64 start, end, last;
3763        int ret;
3764
3765        last = rc->block_group->key.objectid + rc->block_group->key.offset;
3766        while (1) {
3767                cond_resched();
3768                if (rc->search_start >= last) {
3769                        ret = 1;
3770                        break;
3771                }
3772
3773                key.objectid = rc->search_start;
3774                key.type = BTRFS_EXTENT_ITEM_KEY;
3775                key.offset = 0;
3776
3777                path->search_commit_root = 1;
3778                path->skip_locking = 1;
3779                ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3780                                        0, 0);
3781                if (ret < 0)
3782                        break;
3783next:
3784                leaf = path->nodes[0];
3785                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3786                        ret = btrfs_next_leaf(rc->extent_root, path);
3787                        if (ret != 0)
3788                                break;
3789                        leaf = path->nodes[0];
3790                }
3791
3792                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3793                if (key.objectid >= last) {
3794                        ret = 1;
3795                        break;
3796                }
3797
3798                if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3799                    key.type != BTRFS_METADATA_ITEM_KEY) {
3800                        path->slots[0]++;
3801                        goto next;
3802                }
3803
3804                if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3805                    key.objectid + key.offset <= rc->search_start) {
3806                        path->slots[0]++;
3807                        goto next;
3808                }
3809
3810                if (key.type == BTRFS_METADATA_ITEM_KEY &&
3811                    key.objectid + rc->extent_root->nodesize <=
3812                    rc->search_start) {
3813                        path->slots[0]++;
3814                        goto next;
3815                }
3816
3817                ret = find_first_extent_bit(&rc->processed_blocks,
3818                                            key.objectid, &start, &end,
3819                                            EXTENT_DIRTY, NULL);
3820
3821                if (ret == 0 && start <= key.objectid) {
3822                        btrfs_release_path(path);
3823                        rc->search_start = end + 1;
3824                } else {
3825                        if (key.type == BTRFS_EXTENT_ITEM_KEY)
3826                                rc->search_start = key.objectid + key.offset;
3827                        else
3828                                rc->search_start = key.objectid +
3829                                        rc->extent_root->nodesize;
3830                        memcpy(extent_key, &key, sizeof(key));
3831                        return 0;
3832                }
3833        }
3834        btrfs_release_path(path);
3835        return ret;
3836}
3837
3838static void set_reloc_control(struct reloc_control *rc)
3839{
3840        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3841
3842        mutex_lock(&fs_info->reloc_mutex);
3843        fs_info->reloc_ctl = rc;
3844        mutex_unlock(&fs_info->reloc_mutex);
3845}
3846
3847static void unset_reloc_control(struct reloc_control *rc)
3848{
3849        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3850
3851        mutex_lock(&fs_info->reloc_mutex);
3852        fs_info->reloc_ctl = NULL;
3853        mutex_unlock(&fs_info->reloc_mutex);
3854}
3855
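    /*
     * sanity check extent flags: an extent must be either data or a tree
     * block, and only tree blocks may carry FULL_BACKREF.  returns 1 for
     * an invalid combination.
     */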
3856static int check_extent_flags(u64 flags)
3857{
3858        if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3859            (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3860                return 1;
3861        if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3862            !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3863                return 1;
3864        if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3865            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3866                return 1;
3867        return 0;
3868}
3869
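    /*
     * set up for a relocation pass: allocate a temporary block reservation
     * sized for RELOCATION_RESERVED_NODES tree blocks, reset the per-pass
     * state, publish rc through set_reloc_control() and commit a
     * transaction before the extent scan starts.
     */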
3870static noinline_for_stack
3871int prepare_to_relocate(struct reloc_control *rc)
3872{
3873        struct btrfs_trans_handle *trans;
3874
3875        rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
3876                                              BTRFS_BLOCK_RSV_TEMP);
3877        if (!rc->block_rsv)
3878                return -ENOMEM;
3879
3880        memset(&rc->cluster, 0, sizeof(rc->cluster));
3881        rc->search_start = rc->block_group->key.objectid;
3882        rc->extents_found = 0;
3883        rc->nodes_relocated = 0;
3884        rc->merging_rsv_size = 0;
3885        rc->reserved_bytes = 0;
3886        rc->block_rsv->size = rc->extent_root->nodesize *
3887                              RELOCATION_RESERVED_NODES;
3888
3889        rc->create_reloc_tree = 1;
3890        set_reloc_control(rc);
3891
3892        trans = btrfs_join_transaction(rc->extent_root);
3893        if (IS_ERR(trans)) {
3894                unset_reloc_control(rc);
3895                /*
3896                 * the extent tree is not a ref_cow tree and has no reloc_root to
3897                 * clean up.  Callers are responsible for freeing the block rsv
3898                 * allocated above.
3899                 */
3900                return PTR_ERR(trans);
3901        }
3902        btrfs_commit_transaction(trans, rc->extent_root);
3903        return 0;
3904}
3905
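    /*
     * main relocation loop: refill the block reservation, start a
     * transaction, find the next extent in the block group, collect the
     * tree blocks (or data references) that have to move and relocate
     * them.  at the MOVE_DATA_EXTENTS stage data extents are gathered
     * into the file extent cluster.  on ENOSPC after some progress a
     * chunk allocation is forced and the scan restarts.
     */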
3906static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3907{
3908        struct rb_root blocks = RB_ROOT;
3909        struct btrfs_key key;
3910        struct btrfs_trans_handle *trans = NULL;
3911        struct btrfs_path *path;
3912        struct btrfs_extent_item *ei;
3913        u64 flags;
3914        u32 item_size;
3915        int ret;
3916        int err = 0;
3917        int progress = 0;
3918
3919        path = btrfs_alloc_path();
3920        if (!path)
3921                return -ENOMEM;
3922        path->reada = READA_FORWARD;
3923
3924        ret = prepare_to_relocate(rc);
3925        if (ret) {
3926                err = ret;
3927                goto out_free;
3928        }
3929
3930        while (1) {
3931                rc->reserved_bytes = 0;
3932                ret = btrfs_block_rsv_refill(rc->extent_root,
3933                                        rc->block_rsv, rc->block_rsv->size,
3934                                        BTRFS_RESERVE_FLUSH_ALL);
3935                if (ret) {
3936                        err = ret;
3937                        break;
3938                }
3939                progress++;
3940                trans = btrfs_start_transaction(rc->extent_root, 0);
3941                if (IS_ERR(trans)) {
3942                        err = PTR_ERR(trans);
3943                        trans = NULL;
3944                        break;
3945                }
3946restart:
3947                if (update_backref_cache(trans, &rc->backref_cache)) {
3948                        btrfs_end_transaction(trans, rc->extent_root);
3949                        continue;
3950                }
3951
3952                ret = find_next_extent(rc, path, &key);
3953                if (ret < 0)
3954                        err = ret;
3955                if (ret != 0)
3956                        break;
3957
3958                rc->extents_found++;
3959
3960                ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3961                                    struct btrfs_extent_item);
3962                item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
3963                if (item_size >= sizeof(*ei)) {
3964                        flags = btrfs_extent_flags(path->nodes[0], ei);
3965                        ret = check_extent_flags(flags);
3966                        BUG_ON(ret);
3967
3968                } else {
3969#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3970                        u64 ref_owner;
3971                        int path_change = 0;
3972
3973                        BUG_ON(item_size !=
3974                               sizeof(struct btrfs_extent_item_v0));
3975                        ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
3976                                                  &path_change);
3977                        if (ret < 0) {
3978                                err = ret;
3979                                break;
3980                        }
3981                        if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
3982                                flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
3983                        else
3984                                flags = BTRFS_EXTENT_FLAG_DATA;
3985
3986                        if (path_change) {
3987                                btrfs_release_path(path);
3988
3989                                path->search_commit_root = 1;
3990                                path->skip_locking = 1;
3991                                ret = btrfs_search_slot(NULL, rc->extent_root,
3992                                                        &key, path, 0, 0);
3993                                if (ret < 0) {
3994                                        err = ret;
3995                                        break;
3996                                }
3997                                BUG_ON(ret > 0);
3998                        }
3999#else
4000                        BUG();
4001#endif
4002                }
4003
4004                if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
4005                        ret = add_tree_block(rc, &key, path, &blocks);
4006                } else if (rc->stage == UPDATE_DATA_PTRS &&
4007                           (flags & BTRFS_EXTENT_FLAG_DATA)) {
4008                        ret = add_data_references(rc, &key, path, &blocks);
4009                } else {
4010                        btrfs_release_path(path);
4011                        ret = 0;
4012                }
4013                if (ret < 0) {
4014                        err = ret;
4015                        break;
4016                }
4017
4018                if (!RB_EMPTY_ROOT(&blocks)) {
4019                        ret = relocate_tree_blocks(trans, rc, &blocks);
4020                        if (ret < 0) {
4021                                /*
4022                                 * if we fail to relocate tree blocks, force an update
4023                                 * of the backref cache when committing the transaction.
4024                                 */
4025                                rc->backref_cache.last_trans = trans->transid - 1;
4026
4027                                if (ret != -EAGAIN) {
4028                                        err = ret;
4029                                        break;
4030                                }
4031                                rc->extents_found--;
4032                                rc->search_start = key.objectid;
4033                        }
4034                }
4035
4036                btrfs_end_transaction_throttle(trans, rc->extent_root);
4037                btrfs_btree_balance_dirty(rc->extent_root);
4038                trans = NULL;
4039
4040                if (rc->stage == MOVE_DATA_EXTENTS &&
4041                    (flags & BTRFS_EXTENT_FLAG_DATA)) {
4042                        rc->found_file_extent = 1;
4043                        ret = relocate_data_extent(rc->data_inode,
4044                                                   &key, &rc->cluster);
4045                        if (ret < 0) {
4046                                err = ret;
4047                                break;
4048                        }
4049                }
4050        }
4051        if (trans && progress && err == -ENOSPC) {
4052                ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
4053                                              rc->block_group->flags);
4054                if (ret == 1) {
4055                        err = 0;
4056                        progress = 0;
4057                        goto restart;
4058                }
4059        }
4060
4061        btrfs_release_path(path);
4062        clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
4063                          GFP_NOFS);
4064
4065        if (trans) {
4066                btrfs_end_transaction_throttle(trans, rc->extent_root);
4067                btrfs_btree_balance_dirty(rc->extent_root);
4068        }
4069
4070        if (!err) {
4071                ret = relocate_file_extent_cluster(rc->data_inode,
4072                                                   &rc->cluster);
4073                if (ret < 0)
4074                        err = ret;
4075        }
4076
4077        rc->create_reloc_tree = 0;
4078        set_reloc_control(rc);
4079
4080        backref_cache_cleanup(&rc->backref_cache);
4081        btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
4082
4083        err = prepare_to_merge(rc, err);
4084
4085        merge_reloc_roots(rc);
4086
4087        rc->merge_reloc_tree = 0;
4088        unset_reloc_control(rc);
4089        btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
4090
4091        /* get rid of pinned extents */
4092        trans = btrfs_join_transaction(rc->extent_root);
4093        if (IS_ERR(trans))
4094                err = PTR_ERR(trans);
4095        else
4096                btrfs_commit_transaction(trans, rc->extent_root);
4097out_free:
4098        btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
4099        btrfs_free_path(path);
4100        return err;
4101}
4102
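    /*
     * insert a bare inode item for the data relocation inode: a regular
     * file with mode 0600, the NOCOMPRESS and PREALLOC flags set and a
     * link count of zero.
     */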
4103static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
4104                                 struct btrfs_root *root, u64 objectid)
4105{
4106        struct btrfs_path *path;
4107        struct btrfs_inode_item *item;
4108        struct extent_buffer *leaf;
4109        int ret;
4110
4111        path = btrfs_alloc_path();
4112        if (!path)
4113                return -ENOMEM;
4114
4115        ret = btrfs_insert_empty_inode(trans, root, path, objectid);
4116        if (ret)
4117                goto out;
4118
4119        leaf = path->nodes[0];
4120        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
4121        memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
4122        btrfs_set_inode_generation(leaf, item, 1);
4123        btrfs_set_inode_size(leaf, item, 0);
4124        btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
4125        btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
4126                                          BTRFS_INODE_PREALLOC);
4127        btrfs_mark_buffer_dirty(leaf);
4128out:
4129        btrfs_free_path(path);
4130        return ret;
4131}
4132
4133/*
4134 * helper to create an inode for data relocation.
4135 * the inode lives in the data relocation tree and its link count is 0
4136 */
4137static noinline_for_stack
4138struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
4139                                 struct btrfs_block_group_cache *group)
4140{
4141        struct inode *inode = NULL;
4142        struct btrfs_trans_handle *trans;
4143        struct btrfs_root *root;
4144        struct btrfs_key key;
4145        u64 objectid;
4146        int err = 0;
4147
4148        root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4149        if (IS_ERR(root))
4150                return ERR_CAST(root);
4151
4152        trans = btrfs_start_transaction(root, 6);
4153        if (IS_ERR(trans))
4154                return ERR_CAST(trans);
4155
4156        err = btrfs_find_free_objectid(root, &objectid);
4157        if (err)
4158                goto out;
4159
4160        err = __insert_orphan_inode(trans, root, objectid);
4161        BUG_ON(err);
4162
4163        key.objectid = objectid;
4164        key.type = BTRFS_INODE_ITEM_KEY;
4165        key.offset = 0;
4166        inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
4167        BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
4168        BTRFS_I(inode)->index_cnt = group->key.objectid;
4169
4170        err = btrfs_orphan_add(trans, inode);
4171out:
4172        btrfs_end_transaction(trans, root);
4173        btrfs_btree_balance_dirty(root);
4174        if (err) {
4175                if (inode)
4176                        iput(inode);
4177                inode = ERR_PTR(err);
4178        }
4179        return inode;
4180}
4181
4182static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
4183{
4184        struct reloc_control *rc;
4185
4186        rc = kzalloc(sizeof(*rc), GFP_NOFS);
4187        if (!rc)
4188                return NULL;
4189
4190        INIT_LIST_HEAD(&rc->reloc_roots);
4191        backref_cache_init(&rc->backref_cache);
4192        mapping_tree_init(&rc->reloc_root_tree);
4193        extent_io_tree_init(&rc->processed_blocks,
4194                            fs_info->btree_inode->i_mapping);
4195        return rc;
4196}
4197
4198/*
4199 * function to relocate all extents in a block group.
4200 */
4201int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4202{
4203        struct btrfs_fs_info *fs_info = extent_root->fs_info;
4204        struct reloc_control *rc;
4205        struct inode *inode;
4206        struct btrfs_path *path;
4207        int ret;
4208        int rw = 0;
4209        int err = 0;
4210
4211        rc = alloc_reloc_control(fs_info);
4212        if (!rc)
4213                return -ENOMEM;
4214
4215        rc->extent_root = extent_root;
4216
4217        rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
4218        BUG_ON(!rc->block_group);
4219
4220        ret = btrfs_inc_block_group_ro(extent_root, rc->block_group);
4221        if (ret) {
4222                err = ret;
4223                goto out;
4224        }
4225        rw = 1;
4226
4227        path = btrfs_alloc_path();
4228        if (!path) {
4229                err = -ENOMEM;
4230                goto out;
4231        }
4232
4233        inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
4234                                        path);
4235        btrfs_free_path(path);
4236
4237        if (!IS_ERR(inode))
4238                ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
4239        else
4240                ret = PTR_ERR(inode);
4241
4242        if (ret && ret != -ENOENT) {
4243                err = ret;
4244                goto out;
4245        }
4246
4247        rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4248        if (IS_ERR(rc->data_inode)) {
4249                err = PTR_ERR(rc->data_inode);
4250                rc->data_inode = NULL;
4251                goto out;
4252        }
4253
4254        btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
4255               rc->block_group->key.objectid, rc->block_group->flags);
4256
4257        ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
4258        if (ret < 0) {
4259                err = ret;
4260                goto out;
4261        }
4262        btrfs_wait_ordered_roots(fs_info, -1);
4263
4264        while (1) {
4265                mutex_lock(&fs_info->cleaner_mutex);
4266                ret = relocate_block_group(rc);
4267                mutex_unlock(&fs_info->cleaner_mutex);
4268                if (ret < 0) {
4269                        err = ret;
4270                        goto out;
4271                }
4272
4273                if (rc->extents_found == 0)
4274                        break;
4275
4276                btrfs_info(extent_root->fs_info, "found %llu extents",
4277                        rc->extents_found);
4278
4279                if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
4280                        ret = btrfs_wait_ordered_range(rc->data_inode, 0,
4281                                                       (u64)-1);
4282                        if (ret) {
4283                                err = ret;
4284                                goto out;
4285                        }
4286                        invalidate_mapping_pages(rc->data_inode->i_mapping,
4287                                                 0, -1);
4288                        rc->stage = UPDATE_DATA_PTRS;
4289                }
4290        }
4291
4292        WARN_ON(rc->block_group->pinned > 0);
4293        WARN_ON(rc->block_group->reserved > 0);
4294        WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
4295out:
4296        if (err && rw)
4297                btrfs_dec_block_group_ro(extent_root, rc->block_group);
4298        iput(rc->data_inode);
4299        btrfs_put_block_group(rc->block_group);
4300        kfree(rc);
4301        return err;
4302}
4303
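    /*
     * used when the fs root a reloc root belongs to no longer exists:
     * reset drop_progress and set the root's refs to zero in the root
     * tree so recovery does not try to merge it.
     */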
4304static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4305{
4306        struct btrfs_trans_handle *trans;
4307        int ret, err;
4308
4309        trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
4310        if (IS_ERR(trans))
4311                return PTR_ERR(trans);
4312
4313        memset(&root->root_item.drop_progress, 0,
4314                sizeof(root->root_item.drop_progress));
4315        root->root_item.drop_level = 0;
4316        btrfs_set_root_refs(&root->root_item, 0);
4317        ret = btrfs_update_root(trans, root->fs_info->tree_root,
4318                                &root->root_key, &root->root_item);
4319
4320        err = btrfs_end_transaction(trans, root->fs_info->tree_root);
4321        if (err)
4322                return err;
4323        return ret;
4324}
4325
4326/*
4327 * recover relocation interrupted by a system crash.
4328 *
4329 * this function resumes merging reloc trees with the corresponding fs trees;
4330 * this is important for preserving the sharing of tree blocks.
4331 */
4332int btrfs_recover_relocation(struct btrfs_root *root)
4333{
4334        LIST_HEAD(reloc_roots);
4335        struct btrfs_key key;
4336        struct btrfs_root *fs_root;
4337        struct btrfs_root *reloc_root;
4338        struct btrfs_path *path;
4339        struct extent_buffer *leaf;
4340        struct reloc_control *rc = NULL;
4341        struct btrfs_trans_handle *trans;
4342        int ret;
4343        int err = 0;
4344
4345        path = btrfs_alloc_path();
4346        if (!path)
4347                return -ENOMEM;
4348        path->reada = READA_BACK;
4349
4350        key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4351        key.type = BTRFS_ROOT_ITEM_KEY;
4352        key.offset = (u64)-1;
4353
4354        while (1) {
4355                ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
4356                                        path, 0, 0);
4357                if (ret < 0) {
4358                        err = ret;
4359                        goto out;
4360                }
4361                if (ret > 0) {
4362                        if (path->slots[0] == 0)
4363                                break;
4364                        path->slots[0]--;
4365                }
4366                leaf = path->nodes[0];
4367                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4368                btrfs_release_path(path);
4369
4370                if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4371                    key.type != BTRFS_ROOT_ITEM_KEY)
4372                        break;
4373
4374                reloc_root = btrfs_read_fs_root(root, &key);
4375                if (IS_ERR(reloc_root)) {
4376                        err = PTR_ERR(reloc_root);
4377                        goto out;
4378                }
4379
4380                list_add(&reloc_root->root_list, &reloc_roots);
4381
4382                if (btrfs_root_refs(&reloc_root->root_item) > 0) {
4383                        fs_root = read_fs_root(root->fs_info,
4384                                               reloc_root->root_key.offset);
4385                        if (IS_ERR(fs_root)) {
4386                                ret = PTR_ERR(fs_root);
4387                                if (ret != -ENOENT) {
4388                                        err = ret;
4389                                        goto out;
4390                                }
4391                                ret = mark_garbage_root(reloc_root);
4392                                if (ret < 0) {
4393                                        err = ret;
4394                                        goto out;
4395                                }
4396                        }
4397                }
4398
4399                if (key.offset == 0)
4400                        break;
4401
4402                key.offset--;
4403        }
4404        btrfs_release_path(path);
4405
4406        if (list_empty(&reloc_roots))
4407                goto out;
4408
4409        rc = alloc_reloc_control(root->fs_info);
4410        if (!rc) {
4411                err = -ENOMEM;
4412                goto out;
4413        }
4414
4415        rc->extent_root = root->fs_info->extent_root;
4416
4417        set_reloc_control(rc);
4418
4419        trans = btrfs_join_transaction(rc->extent_root);
4420        if (IS_ERR(trans)) {
4421                unset_reloc_control(rc);
4422                err = PTR_ERR(trans);
4423                goto out_free;
4424        }
4425
4426        rc->merge_reloc_tree = 1;
4427
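            /*
             * reconnect each live reloc tree to the fs tree it belongs
             * to so the interrupted merge can resume; reloc trees with
             * zero refs go straight onto rc->reloc_roots.
             */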
4428        while (!list_empty(&reloc_roots)) {
4429                reloc_root = list_entry(reloc_roots.next,
4430                                        struct btrfs_root, root_list);
4431                list_del(&reloc_root->root_list);
4432
4433                if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4434                        list_add_tail(&reloc_root->root_list,
4435                                      &rc->reloc_roots);
4436                        continue;
4437                }
4438
4439                fs_root = read_fs_root(root->fs_info,
4440                                       reloc_root->root_key.offset);
4441                if (IS_ERR(fs_root)) {
4442                        err = PTR_ERR(fs_root);
4443                        goto out_free;
4444                }
4445
4446                err = __add_reloc_root(reloc_root);
4447                BUG_ON(err < 0); /* -ENOMEM or logic error */
4448                fs_root->reloc_root = reloc_root;
4449        }
4450
4451        err = btrfs_commit_transaction(trans, rc->extent_root);
4452        if (err)
4453                goto out_free;
4454
4455        merge_reloc_roots(rc);
4456
4457        unset_reloc_control(rc);
4458
4459        trans = btrfs_join_transaction(rc->extent_root);
4460        if (IS_ERR(trans))
4461                err = PTR_ERR(trans);
4462        else
4463                err = btrfs_commit_transaction(trans, rc->extent_root);
4464out_free:
4465        kfree(rc);
4466out:
4467        if (!list_empty(&reloc_roots))
4468                free_reloc_roots(&reloc_roots);
4469
4470        btrfs_free_path(path);
4471
4472        if (err == 0) {
4473                /* cleanup orphan inode in data relocation tree */
4474                fs_root = read_fs_root(root->fs_info,
4475                                       BTRFS_DATA_RELOC_TREE_OBJECTID);
4476                if (IS_ERR(fs_root))
4477                        err = PTR_ERR(fs_root);
4478                else
4479                        err = btrfs_orphan_cleanup(fs_root);
4480        }
4481        return err;
4482}
4483
4484/*
4485 * helper to add ordered checksums for data relocation.
4486 *
4487 * cloning the checksums properly handles the nodatasum extents.
4488 * it also saves the CPU time of re-calculating the checksums.
4489 */
4490int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4491{
4492        struct btrfs_ordered_sum *sums;
4493        struct btrfs_ordered_extent *ordered;
4494        struct btrfs_root *root = BTRFS_I(inode)->root;
4495        int ret;
4496        u64 disk_bytenr;
4497        u64 new_bytenr;
4498        LIST_HEAD(list);
4499
4500        ordered = btrfs_lookup_ordered_extent(inode, file_pos);
4501        BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
4502
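            /*
             * index_cnt of the data reloc inode holds the start bytenr
             * of the block group being relocated, so adding it to the
             * file offset recovers the extent's original disk bytenr.
             */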
4503        disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
4504        ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
4505                                       disk_bytenr + len - 1, &list, 0);
4506        if (ret)
4507                goto out;
4508
4509        while (!list_empty(&list)) {
4510                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4511                list_del_init(&sums->list);
4512
4513                /*
4514                 * We need to offset the new_bytenr based on where the csum is.
4515                 * We have to do this because we read in entire prealloc
4516                 * extents, but we may have written to, say, the middle of a
4517                 * prealloc extent, so we must make sure the csum goes with
4518                 * the right disk offset.
4519                 *
4520                 * We can do this because the data reloc inode refers strictly
4521                 * to the on-disk bytes, so we don't have to worry about
4522                 * disk_len vs real len like with real inodes, since it's all
4523                 * disk length.
4524                 */
4525                new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
4526                sums->bytenr = new_bytenr;
4527
4528                btrfs_add_ordered_sum(inode, ordered, sums);
4529        }
4530out:
4531        btrfs_put_ordered_extent(ordered);
4532        return ret;
4533}
4534
4535int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4536                          struct btrfs_root *root, struct extent_buffer *buf,
4537                          struct extent_buffer *cow)
4538{
4539        struct reloc_control *rc;
4540        struct backref_node *node;
4541        int first_cow = 0;
4542        int level;
4543        int ret = 0;
4544
4545        rc = root->fs_info->reloc_ctl;
4546        if (!rc)
4547                return 0;
4548
4549        BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4550               root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4551
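            /* if the reloc tree's own root node is being COWed, record its new location */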
4552        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
4553                if (buf == root->node)
4554                        __update_reloc_root(root, cow->start);
4555        }
4556
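            /*
             * a generation at or before the last snapshot means the
             * block has not been COWed since that snapshot was taken,
             * so this is its first COW.
             */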
4557        level = btrfs_header_level(buf);
4558        if (btrfs_header_generation(buf) <=
4559            btrfs_root_last_snapshot(&root->root_item))
4560                first_cow = 1;
4561
4562        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4563            rc->create_reloc_tree) {
4564                WARN_ON(!first_cow && level == 0);
4565
4566                node = rc->backref_cache.path[level];
4567                BUG_ON(node->bytenr != buf->start &&
4568                       node->new_bytenr != buf->start);
4569
4570                drop_node_buffer(node);
4571                extent_buffer_get(cow);
4572                node->eb = cow;
4573                node->new_bytenr = cow->start;
4574
4575                if (!node->pending) {
4576                        list_move_tail(&node->list,
4577                                       &rc->backref_cache.pending[level]);
4578                        node->pending = 1;
4579                }
4580
4581                if (first_cow)
4582                        __mark_block_processed(rc, node);
4583
4584                if (first_cow && level > 0)
4585                        rc->nodes_relocated += buf->len;
4586        }
4587
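            /*
             * on the first COW of a leaf during the pointer update
             * stage, rewrite its file extent items so they point at
             * the relocated data extents.
             */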
4588        if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
4589                ret = replace_file_extents(trans, rc, root, cow);
4590        return ret;
4591}
4592
4593/*
4594 * called before creating a snapshot. it calculates the metadata reservation
4595 * required for relocating tree blocks in the snapshot.
4596 */
4597void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
4598                              u64 *bytes_to_reserve)
4599{
4600        struct btrfs_root *root;
4601        struct reloc_control *rc;
4602
4603        root = pending->root;
4604        if (!root->reloc_root)
4605                return;
4606
4607        rc = root->fs_info->reloc_ctl;
4608        if (!rc->merge_reloc_tree)
4609                return;
4610
4611        root = root->reloc_root;
4612        BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4613        /*
4614         * relocation is in the stage of merging trees. in the worst
4615         * case the space used by merging a reloc tree is twice the
4616         * size of the relocated tree nodes: half for cowing the
4617         * reloc tree, half for cowing the fs tree. the space used by
4618         * cowing the reloc tree will be freed after the tree is
4619         * dropped. if we create a snapshot, cowing the fs tree may
4620         * use more space than it frees, so we need to reserve extra
4621         * space.
4622         */
4623        *bytes_to_reserve += rc->nodes_relocated;
4624}
4625
4626/*
4627 * called after a snapshot is created. migrate the block reservation
4628 * and create a reloc root for the newly created snapshot.
4629 */
4630int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4631                               struct btrfs_pending_snapshot *pending)
4632{
4633        struct btrfs_root *root = pending->root;
4634        struct btrfs_root *reloc_root;
4635        struct btrfs_root *new_root;
4636        struct reloc_control *rc;
4637        int ret;
4638
4639        if (!root->reloc_root)
4640                return 0;
4641
4642        rc = root->fs_info->reloc_ctl;
4643        rc->merging_rsv_size += rc->nodes_relocated;
4644
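            /*
             * while merging, move the space reserved by
             * btrfs_reloc_pre_snapshot from the snapshot's block rsv
             * into the relocation block rsv.
             */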
4645        if (rc->merge_reloc_tree) {
4646                ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4647                                              rc->block_rsv,
4648                                              rc->nodes_relocated);
4649                if (ret)
4650                        return ret;
4651        }
4652
4653        new_root = pending->snap;
4654        reloc_root = create_reloc_root(trans, root->reloc_root,
4655                                       new_root->root_key.objectid);
4656        if (IS_ERR(reloc_root))
4657                return PTR_ERR(reloc_root);
4658
4659        ret = __add_reloc_root(reloc_root);
4660        BUG_ON(ret < 0);
4661        new_root->reloc_root = reloc_root;
4662
4663        if (rc->create_reloc_tree)
4664                ret = clone_backref_node(trans, rc, root, reloc_root);
4665        return ret;
4666}
4667