linux/fs/btrfs/relocation.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2009 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 021110-1307, USA.
  17 */
  18
  19#include <linux/sched.h>
  20#include <linux/pagemap.h>
  21#include <linux/writeback.h>
  22#include <linux/blkdev.h>
  23#include <linux/rbtree.h>
  24#include <linux/slab.h>
  25#include "ctree.h"
  26#include "disk-io.h"
  27#include "transaction.h"
  28#include "volumes.h"
  29#include "locking.h"
  30#include "btrfs_inode.h"
  31#include "async-thread.h"
  32#include "free-space-cache.h"
  33#include "inode-map.h"
  34
  35/*
  36 * backref_node, mapping_node and tree_block start with this
  37 */
  38struct tree_entry {
  39        struct rb_node rb_node;
  40        u64 bytenr;
  41};
  42
  43/*
  44 * present a tree block in the backref cache
  45 */
  46struct backref_node {
  47        struct rb_node rb_node;
  48        u64 bytenr;
  49
  50        u64 new_bytenr;
  51        /* objectid of tree block owner, can be not uptodate */
  52        u64 owner;
  53        /* link to pending, changed or detached list */
  54        struct list_head list;
  55        /* list of upper level blocks reference this block */
  56        struct list_head upper;
  57        /* list of child blocks in the cache */
  58        struct list_head lower;
  59        /* NULL if this node is not tree root */
  60        struct btrfs_root *root;
  61        /* extent buffer got by COW the block */
  62        struct extent_buffer *eb;
  63        /* level of tree block */
  64        unsigned int level:8;
  65        /* is the block in non-reference counted tree */
  66        unsigned int cowonly:1;
  67        /* 1 if no child node in the cache */
  68        unsigned int lowest:1;
  69        /* is the extent buffer locked */
  70        unsigned int locked:1;
  71        /* has the block been processed */
  72        unsigned int processed:1;
  73        /* have backrefs of this block been checked */
  74        unsigned int checked:1;
  75        /*
  76         * 1 if corresponding block has been cowed but some upper
  77         * level block pointers may not point to the new location
  78         */
  79        unsigned int pending:1;
  80        /*
  81         * 1 if the backref node isn't connected to any other
  82         * backref node.
  83         */
  84        unsigned int detached:1;
  85};
  86
  87/*
  88 * present a block pointer in the backref cache
  89 */
  90struct backref_edge {
  91        struct list_head list[2];
  92        struct backref_node *node[2];
  93};
  94
  95#define LOWER   0
  96#define UPPER   1
  97#define RELOCATION_RESERVED_NODES       256
  98
  99struct backref_cache {
 100        /* red black tree of all backref nodes in the cache */
 101        struct rb_root rb_root;
 102        /* for passing backref nodes to btrfs_reloc_cow_block */
 103        struct backref_node *path[BTRFS_MAX_LEVEL];
 104        /*
 105         * list of blocks that have been cowed but some block
 106         * pointers in upper level blocks may not reflect the
 107         * new location
 108         */
 109        struct list_head pending[BTRFS_MAX_LEVEL];
 110        /* list of backref nodes with no child node */
 111        struct list_head leaves;
 112        /* list of blocks that have been cowed in current transaction */
 113        struct list_head changed;
 114        /* list of detached backref node. */
 115        struct list_head detached;
 116
 117        u64 last_trans;
 118
 119        int nr_nodes;
 120        int nr_edges;
 121};
 122
 123/*
 124 * map address of tree root to tree
 125 */
 126struct mapping_node {
 127        struct rb_node rb_node;
 128        u64 bytenr;
 129        void *data;
 130};
 131
 132struct mapping_tree {
 133        struct rb_root rb_root;
 134        spinlock_t lock;
 135};
 136
 137/*
 138 * present a tree block to process
 139 */
 140struct tree_block {
 141        struct rb_node rb_node;
 142        u64 bytenr;
 143        struct btrfs_key key;
 144        unsigned int level:8;
 145        unsigned int key_ready:1;
 146};
 147
 148#define MAX_EXTENTS 128
 149
 150struct file_extent_cluster {
 151        u64 start;
 152        u64 end;
 153        u64 boundary[MAX_EXTENTS];
 154        unsigned int nr;
 155};
 156
 157struct reloc_control {
 158        /* block group to relocate */
 159        struct btrfs_block_group_cache *block_group;
 160        /* extent tree */
 161        struct btrfs_root *extent_root;
 162        /* inode for moving data */
 163        struct inode *data_inode;
 164
 165        struct btrfs_block_rsv *block_rsv;
 166
 167        struct backref_cache backref_cache;
 168
 169        struct file_extent_cluster cluster;
 170        /* tree blocks have been processed */
 171        struct extent_io_tree processed_blocks;
 172        /* map start of tree root to corresponding reloc tree */
 173        struct mapping_tree reloc_root_tree;
 174        /* list of reloc trees */
 175        struct list_head reloc_roots;
 176        /* size of metadata reservation for merging reloc trees */
 177        u64 merging_rsv_size;
 178        /* size of relocated tree nodes */
 179        u64 nodes_relocated;
 180        /* reserved size for block group relocation*/
 181        u64 reserved_bytes;
 182
 183        u64 search_start;
 184        u64 extents_found;
 185
 186        unsigned int stage:8;
 187        unsigned int create_reloc_tree:1;
 188        unsigned int merge_reloc_tree:1;
 189        unsigned int found_file_extent:1;
 190};
 191
 192/* stages of data relocation */
 193#define MOVE_DATA_EXTENTS       0
 194#define UPDATE_DATA_PTRS        1
 195
 196static void remove_backref_node(struct backref_cache *cache,
 197                                struct backref_node *node);
 198static void __mark_block_processed(struct reloc_control *rc,
 199                                   struct backref_node *node);
 200
 201static void mapping_tree_init(struct mapping_tree *tree)
 202{
 203        tree->rb_root = RB_ROOT;
 204        spin_lock_init(&tree->lock);
 205}
 206
 207static void backref_cache_init(struct backref_cache *cache)
 208{
 209        int i;
 210        cache->rb_root = RB_ROOT;
 211        for (i = 0; i < BTRFS_MAX_LEVEL; i++)
 212                INIT_LIST_HEAD(&cache->pending[i]);
 213        INIT_LIST_HEAD(&cache->changed);
 214        INIT_LIST_HEAD(&cache->detached);
 215        INIT_LIST_HEAD(&cache->leaves);
 216}
 217
 218static void backref_cache_cleanup(struct backref_cache *cache)
 219{
 220        struct backref_node *node;
 221        int i;
 222
 223        while (!list_empty(&cache->detached)) {
 224                node = list_entry(cache->detached.next,
 225                                  struct backref_node, list);
 226                remove_backref_node(cache, node);
 227        }
 228
 229        while (!list_empty(&cache->leaves)) {
 230                node = list_entry(cache->leaves.next,
 231                                  struct backref_node, lower);
 232                remove_backref_node(cache, node);
 233        }
 234
 235        cache->last_trans = 0;
 236
 237        for (i = 0; i < BTRFS_MAX_LEVEL; i++)
 238                BUG_ON(!list_empty(&cache->pending[i]));
 239        BUG_ON(!list_empty(&cache->changed));
 240        BUG_ON(!list_empty(&cache->detached));
 241        BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
 242        BUG_ON(cache->nr_nodes);
 243        BUG_ON(cache->nr_edges);
 244}
 245
 246static struct backref_node *alloc_backref_node(struct backref_cache *cache)
 247{
 248        struct backref_node *node;
 249
 250        node = kzalloc(sizeof(*node), GFP_NOFS);
 251        if (node) {
 252                INIT_LIST_HEAD(&node->list);
 253                INIT_LIST_HEAD(&node->upper);
 254                INIT_LIST_HEAD(&node->lower);
 255                RB_CLEAR_NODE(&node->rb_node);
 256                cache->nr_nodes++;
 257        }
 258        return node;
 259}
 260
 261static void free_backref_node(struct backref_cache *cache,
 262                              struct backref_node *node)
 263{
 264        if (node) {
 265                cache->nr_nodes--;
 266                kfree(node);
 267        }
 268}
 269
 270static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
 271{
 272        struct backref_edge *edge;
 273
 274        edge = kzalloc(sizeof(*edge), GFP_NOFS);
 275        if (edge)
 276                cache->nr_edges++;
 277        return edge;
 278}
 279
 280static void free_backref_edge(struct backref_cache *cache,
 281                              struct backref_edge *edge)
 282{
 283        if (edge) {
 284                cache->nr_edges--;
 285                kfree(edge);
 286        }
 287}
 288
 289static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
 290                                   struct rb_node *node)
 291{
 292        struct rb_node **p = &root->rb_node;
 293        struct rb_node *parent = NULL;
 294        struct tree_entry *entry;
 295
 296        while (*p) {
 297                parent = *p;
 298                entry = rb_entry(parent, struct tree_entry, rb_node);
 299
 300                if (bytenr < entry->bytenr)
 301                        p = &(*p)->rb_left;
 302                else if (bytenr > entry->bytenr)
 303                        p = &(*p)->rb_right;
 304                else
 305                        return parent;
 306        }
 307
 308        rb_link_node(node, parent, p);
 309        rb_insert_color(node, root);
 310        return NULL;
 311}
 312
 313static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
 314{
 315        struct rb_node *n = root->rb_node;
 316        struct tree_entry *entry;
 317
 318        while (n) {
 319                entry = rb_entry(n, struct tree_entry, rb_node);
 320
 321                if (bytenr < entry->bytenr)
 322                        n = n->rb_left;
 323                else if (bytenr > entry->bytenr)
 324                        n = n->rb_right;
 325                else
 326                        return n;
 327        }
 328        return NULL;
 329}
 330
 331static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
 332{
 333
 334        struct btrfs_fs_info *fs_info = NULL;
 335        struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
 336                                              rb_node);
 337        if (bnode->root)
 338                fs_info = bnode->root->fs_info;
 339        btrfs_panic(fs_info, errno, "Inconsistency in backref cache "
 340                    "found at offset %llu\n", bytenr);
 341}
 342
 343/*
 344 * walk up backref nodes until reach node presents tree root
 345 */
 346static struct backref_node *walk_up_backref(struct backref_node *node,
 347                                            struct backref_edge *edges[],
 348                                            int *index)
 349{
 350        struct backref_edge *edge;
 351        int idx = *index;
 352
 353        while (!list_empty(&node->upper)) {
 354                edge = list_entry(node->upper.next,
 355                                  struct backref_edge, list[LOWER]);
 356                edges[idx++] = edge;
 357                node = edge->node[UPPER];
 358        }
 359        BUG_ON(node->detached);
 360        *index = idx;
 361        return node;
 362}
 363
 364/*
 365 * walk down backref nodes to find start of next reference path
 366 */
 367static struct backref_node *walk_down_backref(struct backref_edge *edges[],
 368                                              int *index)
 369{
 370        struct backref_edge *edge;
 371        struct backref_node *lower;
 372        int idx = *index;
 373
 374        while (idx > 0) {
 375                edge = edges[idx - 1];
 376                lower = edge->node[LOWER];
 377                if (list_is_last(&edge->list[LOWER], &lower->upper)) {
 378                        idx--;
 379                        continue;
 380                }
 381                edge = list_entry(edge->list[LOWER].next,
 382                                  struct backref_edge, list[LOWER]);
 383                edges[idx - 1] = edge;
 384                *index = idx;
 385                return edge->node[UPPER];
 386        }
 387        *index = 0;
 388        return NULL;
 389}
 390
 391static void unlock_node_buffer(struct backref_node *node)
 392{
 393        if (node->locked) {
 394                btrfs_tree_unlock(node->eb);
 395                node->locked = 0;
 396        }
 397}
 398
 399static void drop_node_buffer(struct backref_node *node)
 400{
 401        if (node->eb) {
 402                unlock_node_buffer(node);
 403                free_extent_buffer(node->eb);
 404                node->eb = NULL;
 405        }
 406}
 407
 408static void drop_backref_node(struct backref_cache *tree,
 409                              struct backref_node *node)
 410{
 411        BUG_ON(!list_empty(&node->upper));
 412
 413        drop_node_buffer(node);
 414        list_del(&node->list);
 415        list_del(&node->lower);
 416        if (!RB_EMPTY_NODE(&node->rb_node))
 417                rb_erase(&node->rb_node, &tree->rb_root);
 418        free_backref_node(tree, node);
 419}
 420
 421/*
 422 * remove a backref node from the backref cache
 423 */
 424static void remove_backref_node(struct backref_cache *cache,
 425                                struct backref_node *node)
 426{
 427        struct backref_node *upper;
 428        struct backref_edge *edge;
 429
 430        if (!node)
 431                return;
 432
 433        BUG_ON(!node->lowest && !node->detached);
 434        while (!list_empty(&node->upper)) {
 435                edge = list_entry(node->upper.next, struct backref_edge,
 436                                  list[LOWER]);
 437                upper = edge->node[UPPER];
 438                list_del(&edge->list[LOWER]);
 439                list_del(&edge->list[UPPER]);
 440                free_backref_edge(cache, edge);
 441
 442                if (RB_EMPTY_NODE(&upper->rb_node)) {
 443                        BUG_ON(!list_empty(&node->upper));
 444                        drop_backref_node(cache, node);
 445                        node = upper;
 446                        node->lowest = 1;
 447                        continue;
 448                }
 449                /*
 450                 * add the node to leaf node list if no other
 451                 * child block cached.
 452                 */
 453                if (list_empty(&upper->lower)) {
 454                        list_add_tail(&upper->lower, &cache->leaves);
 455                        upper->lowest = 1;
 456                }
 457        }
 458
 459        drop_backref_node(cache, node);
 460}
 461
 462static void update_backref_node(struct backref_cache *cache,
 463                                struct backref_node *node, u64 bytenr)
 464{
 465        struct rb_node *rb_node;
 466        rb_erase(&node->rb_node, &cache->rb_root);
 467        node->bytenr = bytenr;
 468        rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
 469        if (rb_node)
 470                backref_tree_panic(rb_node, -EEXIST, bytenr);
 471}
 472
 473/*
 474 * update backref cache after a transaction commit
 475 */
 476static int update_backref_cache(struct btrfs_trans_handle *trans,
 477                                struct backref_cache *cache)
 478{
 479        struct backref_node *node;
 480        int level = 0;
 481
 482        if (cache->last_trans == 0) {
 483                cache->last_trans = trans->transid;
 484                return 0;
 485        }
 486
 487        if (cache->last_trans == trans->transid)
 488                return 0;
 489
 490        /*
 491         * detached nodes are used to avoid unnecessary backref
 492         * lookup. transaction commit changes the extent tree.
 493         * so the detached nodes are no longer useful.
 494         */
 495        while (!list_empty(&cache->detached)) {
 496                node = list_entry(cache->detached.next,
 497                                  struct backref_node, list);
 498                remove_backref_node(cache, node);
 499        }
 500
 501        while (!list_empty(&cache->changed)) {
 502                node = list_entry(cache->changed.next,
 503                                  struct backref_node, list);
 504                list_del_init(&node->list);
 505                BUG_ON(node->pending);
 506                update_backref_node(cache, node, node->new_bytenr);
 507        }
 508
 509        /*
 510         * some nodes can be left in the pending list if there were
 511         * errors during processing the pending nodes.
 512         */
 513        for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
 514                list_for_each_entry(node, &cache->pending[level], list) {
 515                        BUG_ON(!node->pending);
 516                        if (node->bytenr == node->new_bytenr)
 517                                continue;
 518                        update_backref_node(cache, node, node->new_bytenr);
 519                }
 520        }
 521
 522        cache->last_trans = 0;
 523        return 1;
 524}
 525
 526
 527static int should_ignore_root(struct btrfs_root *root)
 528{
 529        struct btrfs_root *reloc_root;
 530
 531        if (!root->ref_cows)
 532                return 0;
 533
 534        reloc_root = root->reloc_root;
 535        if (!reloc_root)
 536                return 0;
 537
 538        if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
 539            root->fs_info->running_transaction->transid - 1)
 540                return 0;
 541        /*
 542         * if there is reloc tree and it was created in previous
 543         * transaction backref lookup can find the reloc tree,
 544         * so backref node for the fs tree root is useless for
 545         * relocation.
 546         */
 547        return 1;
 548}
 549/*
 550 * find reloc tree by address of tree root
 551 */
 552static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
 553                                          u64 bytenr)
 554{
 555        struct rb_node *rb_node;
 556        struct mapping_node *node;
 557        struct btrfs_root *root = NULL;
 558
 559        spin_lock(&rc->reloc_root_tree.lock);
 560        rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
 561        if (rb_node) {
 562                node = rb_entry(rb_node, struct mapping_node, rb_node);
 563                root = (struct btrfs_root *)node->data;
 564        }
 565        spin_unlock(&rc->reloc_root_tree.lock);
 566        return root;
 567}
 568
 569static int is_cowonly_root(u64 root_objectid)
 570{
 571        if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
 572            root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
 573            root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
 574            root_objectid == BTRFS_DEV_TREE_OBJECTID ||
 575            root_objectid == BTRFS_TREE_LOG_OBJECTID ||
 576            root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
 577            root_objectid == BTRFS_UUID_TREE_OBJECTID ||
 578            root_objectid == BTRFS_QUOTA_TREE_OBJECTID)
 579                return 1;
 580        return 0;
 581}
 582
 583static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
 584                                        u64 root_objectid)
 585{
 586        struct btrfs_key key;
 587
 588        key.objectid = root_objectid;
 589        key.type = BTRFS_ROOT_ITEM_KEY;
 590        if (is_cowonly_root(root_objectid))
 591                key.offset = 0;
 592        else
 593                key.offset = (u64)-1;
 594
 595        return btrfs_get_fs_root(fs_info, &key, false);
 596}
 597
 598#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 599static noinline_for_stack
 600struct btrfs_root *find_tree_root(struct reloc_control *rc,
 601                                  struct extent_buffer *leaf,
 602                                  struct btrfs_extent_ref_v0 *ref0)
 603{
 604        struct btrfs_root *root;
 605        u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
 606        u64 generation = btrfs_ref_generation_v0(leaf, ref0);
 607
 608        BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
 609
 610        root = read_fs_root(rc->extent_root->fs_info, root_objectid);
 611        BUG_ON(IS_ERR(root));
 612
 613        if (root->ref_cows &&
 614            generation != btrfs_root_generation(&root->root_item))
 615                return NULL;
 616
 617        return root;
 618}
 619#endif
 620
 621static noinline_for_stack
 622int find_inline_backref(struct extent_buffer *leaf, int slot,
 623                        unsigned long *ptr, unsigned long *end)
 624{
 625        struct btrfs_key key;
 626        struct btrfs_extent_item *ei;
 627        struct btrfs_tree_block_info *bi;
 628        u32 item_size;
 629
 630        btrfs_item_key_to_cpu(leaf, &key, slot);
 631
 632        item_size = btrfs_item_size_nr(leaf, slot);
 633#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 634        if (item_size < sizeof(*ei)) {
 635                WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
 636                return 1;
 637        }
 638#endif
 639        ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
 640        WARN_ON(!(btrfs_extent_flags(leaf, ei) &
 641                  BTRFS_EXTENT_FLAG_TREE_BLOCK));
 642
 643        if (key.type == BTRFS_EXTENT_ITEM_KEY &&
 644            item_size <= sizeof(*ei) + sizeof(*bi)) {
 645                WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
 646                return 1;
 647        }
 648        if (key.type == BTRFS_METADATA_ITEM_KEY &&
 649            item_size <= sizeof(*ei)) {
 650                WARN_ON(item_size < sizeof(*ei));
 651                return 1;
 652        }
 653
 654        if (key.type == BTRFS_EXTENT_ITEM_KEY) {
 655                bi = (struct btrfs_tree_block_info *)(ei + 1);
 656                *ptr = (unsigned long)(bi + 1);
 657        } else {
 658                *ptr = (unsigned long)(ei + 1);
 659        }
 660        *end = (unsigned long)ei + item_size;
 661        return 0;
 662}
 663
 664/*
 665 * build backref tree for a given tree block. root of the backref tree
 666 * corresponds the tree block, leaves of the backref tree correspond
 667 * roots of b-trees that reference the tree block.
 668 *
 669 * the basic idea of this function is check backrefs of a given block
 670 * to find upper level blocks that refernece the block, and then check
 671 * bakcrefs of these upper level blocks recursively. the recursion stop
 672 * when tree root is reached or backrefs for the block is cached.
 673 *
 674 * NOTE: if we find backrefs for a block are cached, we know backrefs
 675 * for all upper level blocks that directly/indirectly reference the
 676 * block are also cached.
 677 */
 678static noinline_for_stack
 679struct backref_node *build_backref_tree(struct reloc_control *rc,
 680                                        struct btrfs_key *node_key,
 681                                        int level, u64 bytenr)
 682{
 683        struct backref_cache *cache = &rc->backref_cache;
 684        struct btrfs_path *path1;
 685        struct btrfs_path *path2;
 686        struct extent_buffer *eb;
 687        struct btrfs_root *root;
 688        struct backref_node *cur;
 689        struct backref_node *upper;
 690        struct backref_node *lower;
 691        struct backref_node *node = NULL;
 692        struct backref_node *exist = NULL;
 693        struct backref_edge *edge;
 694        struct rb_node *rb_node;
 695        struct btrfs_key key;
 696        unsigned long end;
 697        unsigned long ptr;
 698        LIST_HEAD(list);
 699        LIST_HEAD(useless);
 700        int cowonly;
 701        int ret;
 702        int err = 0;
 703        bool need_check = true;
 704
 705        path1 = btrfs_alloc_path();
 706        path2 = btrfs_alloc_path();
 707        if (!path1 || !path2) {
 708                err = -ENOMEM;
 709                goto out;
 710        }
 711        path1->reada = 1;
 712        path2->reada = 2;
 713
 714        node = alloc_backref_node(cache);
 715        if (!node) {
 716                err = -ENOMEM;
 717                goto out;
 718        }
 719
 720        node->bytenr = bytenr;
 721        node->level = level;
 722        node->lowest = 1;
 723        cur = node;
 724again:
 725        end = 0;
 726        ptr = 0;
 727        key.objectid = cur->bytenr;
 728        key.type = BTRFS_METADATA_ITEM_KEY;
 729        key.offset = (u64)-1;
 730
 731        path1->search_commit_root = 1;
 732        path1->skip_locking = 1;
 733        ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
 734                                0, 0);
 735        if (ret < 0) {
 736                err = ret;
 737                goto out;
 738        }
 739        BUG_ON(!ret || !path1->slots[0]);
 740
 741        path1->slots[0]--;
 742
 743        WARN_ON(cur->checked);
 744        if (!list_empty(&cur->upper)) {
 745                /*
 746                 * the backref was added previously when processing
 747                 * backref of type BTRFS_TREE_BLOCK_REF_KEY
 748                 */
 749                BUG_ON(!list_is_singular(&cur->upper));
 750                edge = list_entry(cur->upper.next, struct backref_edge,
 751                                  list[LOWER]);
 752                BUG_ON(!list_empty(&edge->list[UPPER]));
 753                exist = edge->node[UPPER];
 754                /*
 755                 * add the upper level block to pending list if we need
 756                 * check its backrefs
 757                 */
 758                if (!exist->checked)
 759                        list_add_tail(&edge->list[UPPER], &list);
 760        } else {
 761                exist = NULL;
 762        }
 763
 764        while (1) {
 765                cond_resched();
 766                eb = path1->nodes[0];
 767
 768                if (ptr >= end) {
 769                        if (path1->slots[0] >= btrfs_header_nritems(eb)) {
 770                                ret = btrfs_next_leaf(rc->extent_root, path1);
 771                                if (ret < 0) {
 772                                        err = ret;
 773                                        goto out;
 774                                }
 775                                if (ret > 0)
 776                                        break;
 777                                eb = path1->nodes[0];
 778                        }
 779
 780                        btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
 781                        if (key.objectid != cur->bytenr) {
 782                                WARN_ON(exist);
 783                                break;
 784                        }
 785
 786                        if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 787                            key.type == BTRFS_METADATA_ITEM_KEY) {
 788                                ret = find_inline_backref(eb, path1->slots[0],
 789                                                          &ptr, &end);
 790                                if (ret)
 791                                        goto next;
 792                        }
 793                }
 794
 795                if (ptr < end) {
 796                        /* update key for inline back ref */
 797                        struct btrfs_extent_inline_ref *iref;
 798                        iref = (struct btrfs_extent_inline_ref *)ptr;
 799                        key.type = btrfs_extent_inline_ref_type(eb, iref);
 800                        key.offset = btrfs_extent_inline_ref_offset(eb, iref);
 801                        WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
 802                                key.type != BTRFS_SHARED_BLOCK_REF_KEY);
 803                }
 804
 805                if (exist &&
 806                    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
 807                      exist->owner == key.offset) ||
 808                     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
 809                      exist->bytenr == key.offset))) {
 810                        exist = NULL;
 811                        goto next;
 812                }
 813
 814#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 815                if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
 816                    key.type == BTRFS_EXTENT_REF_V0_KEY) {
 817                        if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
 818                                struct btrfs_extent_ref_v0 *ref0;
 819                                ref0 = btrfs_item_ptr(eb, path1->slots[0],
 820                                                struct btrfs_extent_ref_v0);
 821                                if (key.objectid == key.offset) {
 822                                        root = find_tree_root(rc, eb, ref0);
 823                                        if (root && !should_ignore_root(root))
 824                                                cur->root = root;
 825                                        else
 826                                                list_add(&cur->list, &useless);
 827                                        break;
 828                                }
 829                                if (is_cowonly_root(btrfs_ref_root_v0(eb,
 830                                                                      ref0)))
 831                                        cur->cowonly = 1;
 832                        }
 833#else
 834                BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
 835                if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
 836#endif
 837                        if (key.objectid == key.offset) {
 838                                /*
 839                                 * only root blocks of reloc trees use
 840                                 * backref of this type.
 841                                 */
 842                                root = find_reloc_root(rc, cur->bytenr);
 843                                BUG_ON(!root);
 844                                cur->root = root;
 845                                break;
 846                        }
 847
 848                        edge = alloc_backref_edge(cache);
 849                        if (!edge) {
 850                                err = -ENOMEM;
 851                                goto out;
 852                        }
 853                        rb_node = tree_search(&cache->rb_root, key.offset);
 854                        if (!rb_node) {
 855                                upper = alloc_backref_node(cache);
 856                                if (!upper) {
 857                                        free_backref_edge(cache, edge);
 858                                        err = -ENOMEM;
 859                                        goto out;
 860                                }
 861                                upper->bytenr = key.offset;
 862                                upper->level = cur->level + 1;
 863                                /*
 864                                 *  backrefs for the upper level block isn't
 865                                 *  cached, add the block to pending list
 866                                 */
 867                                list_add_tail(&edge->list[UPPER], &list);
 868                        } else {
 869                                upper = rb_entry(rb_node, struct backref_node,
 870                                                 rb_node);
 871                                BUG_ON(!upper->checked);
 872                                INIT_LIST_HEAD(&edge->list[UPPER]);
 873                        }
 874                        list_add_tail(&edge->list[LOWER], &cur->upper);
 875                        edge->node[LOWER] = cur;
 876                        edge->node[UPPER] = upper;
 877
 878                        goto next;
 879                } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
 880                        goto next;
 881                }
 882
 883                /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
 884                root = read_fs_root(rc->extent_root->fs_info, key.offset);
 885                if (IS_ERR(root)) {
 886                        err = PTR_ERR(root);
 887                        goto out;
 888                }
 889
 890                if (!root->ref_cows)
 891                        cur->cowonly = 1;
 892
 893                if (btrfs_root_level(&root->root_item) == cur->level) {
 894                        /* tree root */
 895                        BUG_ON(btrfs_root_bytenr(&root->root_item) !=
 896                               cur->bytenr);
 897                        if (should_ignore_root(root))
 898                                list_add(&cur->list, &useless);
 899                        else
 900                                cur->root = root;
 901                        break;
 902                }
 903
 904                level = cur->level + 1;
 905
 906                /*
 907                 * searching the tree to find upper level blocks
 908                 * reference the block.
 909                 */
 910                path2->search_commit_root = 1;
 911                path2->skip_locking = 1;
 912                path2->lowest_level = level;
 913                ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
 914                path2->lowest_level = 0;
 915                if (ret < 0) {
 916                        err = ret;
 917                        goto out;
 918                }
 919                if (ret > 0 && path2->slots[level] > 0)
 920                        path2->slots[level]--;
 921
 922                eb = path2->nodes[level];
 923                WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
 924                        cur->bytenr);
 925
 926                lower = cur;
 927                need_check = true;
 928                for (; level < BTRFS_MAX_LEVEL; level++) {
 929                        if (!path2->nodes[level]) {
 930                                BUG_ON(btrfs_root_bytenr(&root->root_item) !=
 931                                       lower->bytenr);
 932                                if (should_ignore_root(root))
 933                                        list_add(&lower->list, &useless);
 934                                else
 935                                        lower->root = root;
 936                                break;
 937                        }
 938
 939                        edge = alloc_backref_edge(cache);
 940                        if (!edge) {
 941                                err = -ENOMEM;
 942                                goto out;
 943                        }
 944
 945                        eb = path2->nodes[level];
 946                        rb_node = tree_search(&cache->rb_root, eb->start);
 947                        if (!rb_node) {
 948                                upper = alloc_backref_node(cache);
 949                                if (!upper) {
 950                                        free_backref_edge(cache, edge);
 951                                        err = -ENOMEM;
 952                                        goto out;
 953                                }
 954                                upper->bytenr = eb->start;
 955                                upper->owner = btrfs_header_owner(eb);
 956                                upper->level = lower->level + 1;
 957                                if (!root->ref_cows)
 958                                        upper->cowonly = 1;
 959
 960                                /*
 961                                 * if we know the block isn't shared
 962                                 * we can void checking its backrefs.
 963                                 */
 964                                if (btrfs_block_can_be_shared(root, eb))
 965                                        upper->checked = 0;
 966                                else
 967                                        upper->checked = 1;
 968
 969                                /*
 970                                 * add the block to pending list if we
 971                                 * need check its backrefs, we only do this once
 972                                 * while walking up a tree as we will catch
 973                                 * anything else later on.
 974                                 */
 975                                if (!upper->checked && need_check) {
 976                                        need_check = false;
 977                                        list_add_tail(&edge->list[UPPER],
 978                                                      &list);
 979                                } else
 980                                        INIT_LIST_HEAD(&edge->list[UPPER]);
 981                        } else {
 982                                upper = rb_entry(rb_node, struct backref_node,
 983                                                 rb_node);
 984                                BUG_ON(!upper->checked);
 985                                INIT_LIST_HEAD(&edge->list[UPPER]);
 986                                if (!upper->owner)
 987                                        upper->owner = btrfs_header_owner(eb);
 988                        }
 989                        list_add_tail(&edge->list[LOWER], &lower->upper);
 990                        edge->node[LOWER] = lower;
 991                        edge->node[UPPER] = upper;
 992
 993                        if (rb_node)
 994                                break;
 995                        lower = upper;
 996                        upper = NULL;
 997                }
 998                btrfs_release_path(path2);
 999next:
1000                if (ptr < end) {
1001                        ptr += btrfs_extent_inline_ref_size(key.type);
1002                        if (ptr >= end) {
1003                                WARN_ON(ptr > end);
1004                                ptr = 0;
1005                                end = 0;
1006                        }
1007                }
1008                if (ptr >= end)
1009                        path1->slots[0]++;
1010        }
1011        btrfs_release_path(path1);
1012
1013        cur->checked = 1;
1014        WARN_ON(exist);
1015
1016        /* the pending list isn't empty, take the first block to process */
1017        if (!list_empty(&list)) {
1018                edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1019                list_del_init(&edge->list[UPPER]);
1020                cur = edge->node[UPPER];
1021                goto again;
1022        }
1023
1024        /*
1025         * everything goes well, connect backref nodes and insert backref nodes
1026         * into the cache.
1027         */
1028        BUG_ON(!node->checked);
1029        cowonly = node->cowonly;
1030        if (!cowonly) {
1031                rb_node = tree_insert(&cache->rb_root, node->bytenr,
1032                                      &node->rb_node);
1033                if (rb_node)
1034                        backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1035                list_add_tail(&node->lower, &cache->leaves);
1036        }
1037
1038        list_for_each_entry(edge, &node->upper, list[LOWER])
1039                list_add_tail(&edge->list[UPPER], &list);
1040
1041        while (!list_empty(&list)) {
1042                edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1043                list_del_init(&edge->list[UPPER]);
1044                upper = edge->node[UPPER];
1045                if (upper->detached) {
1046                        list_del(&edge->list[LOWER]);
1047                        lower = edge->node[LOWER];
1048                        free_backref_edge(cache, edge);
1049                        if (list_empty(&lower->upper))
1050                                list_add(&lower->list, &useless);
1051                        continue;
1052                }
1053
1054                if (!RB_EMPTY_NODE(&upper->rb_node)) {
1055                        if (upper->lowest) {
1056                                list_del_init(&upper->lower);
1057                                upper->lowest = 0;
1058                        }
1059
1060                        list_add_tail(&edge->list[UPPER], &upper->lower);
1061                        continue;
1062                }
1063
1064                BUG_ON(!upper->checked);
1065                BUG_ON(cowonly != upper->cowonly);
1066                if (!cowonly) {
1067                        rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1068                                              &upper->rb_node);
1069                        if (rb_node)
1070                                backref_tree_panic(rb_node, -EEXIST,
1071                                                   upper->bytenr);
1072                }
1073
1074                list_add_tail(&edge->list[UPPER], &upper->lower);
1075
1076                list_for_each_entry(edge, &upper->upper, list[LOWER])
1077                        list_add_tail(&edge->list[UPPER], &list);
1078        }
1079        /*
1080         * process useless backref nodes. backref nodes for tree leaves
1081         * are deleted from the cache. backref nodes for upper level
1082         * tree blocks are left in the cache to avoid unnecessary backref
1083         * lookup.
1084         */
1085        while (!list_empty(&useless)) {
1086                upper = list_entry(useless.next, struct backref_node, list);
1087                list_del_init(&upper->list);
1088                BUG_ON(!list_empty(&upper->upper));
1089                if (upper == node)
1090                        node = NULL;
1091                if (upper->lowest) {
1092                        list_del_init(&upper->lower);
1093                        upper->lowest = 0;
1094                }
1095                while (!list_empty(&upper->lower)) {
1096                        edge = list_entry(upper->lower.next,
1097                                          struct backref_edge, list[UPPER]);
1098                        list_del(&edge->list[UPPER]);
1099                        list_del(&edge->list[LOWER]);
1100                        lower = edge->node[LOWER];
1101                        free_backref_edge(cache, edge);
1102
1103                        if (list_empty(&lower->upper))
1104                                list_add(&lower->list, &useless);
1105                }
1106                __mark_block_processed(rc, upper);
1107                if (upper->level > 0) {
1108                        list_add(&upper->list, &cache->detached);
1109                        upper->detached = 1;
1110                } else {
1111                        rb_erase(&upper->rb_node, &cache->rb_root);
1112                        free_backref_node(cache, upper);
1113                }
1114        }
1115out:
1116        btrfs_free_path(path1);
1117        btrfs_free_path(path2);
1118        if (err) {
1119                while (!list_empty(&useless)) {
1120                        lower = list_entry(useless.next,
1121                                           struct backref_node, upper);
1122                        list_del_init(&lower->upper);
1123                }
1124                upper = node;
1125                INIT_LIST_HEAD(&list);
1126                while (upper) {
1127                        if (RB_EMPTY_NODE(&upper->rb_node)) {
1128                                list_splice_tail(&upper->upper, &list);
1129                                free_backref_node(cache, upper);
1130                        }
1131
1132                        if (list_empty(&list))
1133                                break;
1134
1135                        edge = list_entry(list.next, struct backref_edge,
1136                                          list[LOWER]);
1137                        list_del(&edge->list[LOWER]);
1138                        upper = edge->node[UPPER];
1139                        free_backref_edge(cache, edge);
1140                }
1141                return ERR_PTR(err);
1142        }
1143        BUG_ON(node && node->detached);
1144        return node;
1145}
1146
1147/*
1148 * helper to add backref node for the newly created snapshot.
1149 * the backref node is created by cloning backref node that
1150 * corresponds to root of source tree
1151 */
1152static int clone_backref_node(struct btrfs_trans_handle *trans,
1153                              struct reloc_control *rc,
1154                              struct btrfs_root *src,
1155                              struct btrfs_root *dest)
1156{
1157        struct btrfs_root *reloc_root = src->reloc_root;
1158        struct backref_cache *cache = &rc->backref_cache;
1159        struct backref_node *node = NULL;
1160        struct backref_node *new_node;
1161        struct backref_edge *edge;
1162        struct backref_edge *new_edge;
1163        struct rb_node *rb_node;
1164
1165        if (cache->last_trans > 0)
1166                update_backref_cache(trans, cache);
1167
1168        rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1169        if (rb_node) {
1170                node = rb_entry(rb_node, struct backref_node, rb_node);
1171                if (node->detached)
1172                        node = NULL;
1173                else
1174                        BUG_ON(node->new_bytenr != reloc_root->node->start);
1175        }
1176
1177        if (!node) {
1178                rb_node = tree_search(&cache->rb_root,
1179                                      reloc_root->commit_root->start);
1180                if (rb_node) {
1181                        node = rb_entry(rb_node, struct backref_node,
1182                                        rb_node);
1183                        BUG_ON(node->detached);
1184                }
1185        }
1186
1187        if (!node)
1188                return 0;
1189
1190        new_node = alloc_backref_node(cache);
1191        if (!new_node)
1192                return -ENOMEM;
1193
1194        new_node->bytenr = dest->node->start;
1195        new_node->level = node->level;
1196        new_node->lowest = node->lowest;
1197        new_node->checked = 1;
1198        new_node->root = dest;
1199
1200        if (!node->lowest) {
1201                list_for_each_entry(edge, &node->lower, list[UPPER]) {
1202                        new_edge = alloc_backref_edge(cache);
1203                        if (!new_edge)
1204                                goto fail;
1205
1206                        new_edge->node[UPPER] = new_node;
1207                        new_edge->node[LOWER] = edge->node[LOWER];
1208                        list_add_tail(&new_edge->list[UPPER],
1209                                      &new_node->lower);
1210                }
1211        } else {
1212                list_add_tail(&new_node->lower, &cache->leaves);
1213        }
1214
1215        rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1216                              &new_node->rb_node);
1217        if (rb_node)
1218                backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
1219
1220        if (!new_node->lowest) {
1221                list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1222                        list_add_tail(&new_edge->list[LOWER],
1223                                      &new_edge->node[LOWER]->upper);
1224                }
1225        }
1226        return 0;
1227fail:
1228        while (!list_empty(&new_node->lower)) {
1229                new_edge = list_entry(new_node->lower.next,
1230                                      struct backref_edge, list[UPPER]);
1231                list_del(&new_edge->list[UPPER]);
1232                free_backref_edge(cache, new_edge);
1233        }
1234        free_backref_node(cache, new_node);
1235        return -ENOMEM;
1236}
1237
1238/*
1239 * helper to add 'address of tree root -> reloc tree' mapping
1240 */
1241static int __must_check __add_reloc_root(struct btrfs_root *root)
1242{
1243        struct rb_node *rb_node;
1244        struct mapping_node *node;
1245        struct reloc_control *rc = root->fs_info->reloc_ctl;
1246
1247        node = kmalloc(sizeof(*node), GFP_NOFS);
1248        if (!node)
1249                return -ENOMEM;
1250
1251        node->bytenr = root->node->start;
1252        node->data = root;
1253
1254        spin_lock(&rc->reloc_root_tree.lock);
1255        rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1256                              node->bytenr, &node->rb_node);
1257        spin_unlock(&rc->reloc_root_tree.lock);
1258        if (rb_node) {
1259                btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
1260                            "for start=%llu while inserting into relocation "
1261                            "tree\n", node->bytenr);
1262                kfree(node);
1263                return -EEXIST;
1264        }
1265
1266        list_add_tail(&root->root_list, &rc->reloc_roots);
1267        return 0;
1268}
1269
1270/*
1271 * helper to delete the 'address of tree root -> reloc tree'
1272 * mapping
1273 */
1274static void __del_reloc_root(struct btrfs_root *root)
1275{
1276        struct rb_node *rb_node;
1277        struct mapping_node *node = NULL;
1278        struct reloc_control *rc = root->fs_info->reloc_ctl;
1279
1280        spin_lock(&rc->reloc_root_tree.lock);
1281        rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1282                              root->node->start);
1283        if (rb_node) {
1284                node = rb_entry(rb_node, struct mapping_node, rb_node);
1285                rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1286        }
1287        spin_unlock(&rc->reloc_root_tree.lock);
1288
1289        if (!node)
1290                return;
1291        BUG_ON((struct btrfs_root *)node->data != root);
1292
1293        spin_lock(&root->fs_info->trans_lock);
1294        list_del_init(&root->root_list);
1295        spin_unlock(&root->fs_info->trans_lock);
1296        kfree(node);
1297}
1298
1299/*
1300 * helper to update the 'address of tree root -> reloc tree'
1301 * mapping
1302 */
1303static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
1304{
1305        struct rb_node *rb_node;
1306        struct mapping_node *node = NULL;
1307        struct reloc_control *rc = root->fs_info->reloc_ctl;
1308
1309        spin_lock(&rc->reloc_root_tree.lock);
1310        rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1311                              root->node->start);
1312        if (rb_node) {
1313                node = rb_entry(rb_node, struct mapping_node, rb_node);
1314                rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1315        }
1316        spin_unlock(&rc->reloc_root_tree.lock);
1317
1318        if (!node)
1319                return 0;
1320        BUG_ON((struct btrfs_root *)node->data != root);
1321
1322        spin_lock(&rc->reloc_root_tree.lock);
1323        node->bytenr = new_bytenr;
1324        rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1325                              node->bytenr, &node->rb_node);
1326        spin_unlock(&rc->reloc_root_tree.lock);
1327        if (rb_node)
1328                backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1329        return 0;
1330}
1331
1332static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1333                                        struct btrfs_root *root, u64 objectid)
1334{
1335        struct btrfs_root *reloc_root;
1336        struct extent_buffer *eb;
1337        struct btrfs_root_item *root_item;
1338        struct btrfs_key root_key;
1339        u64 last_snap = 0;
1340        int ret;
1341
1342        root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
1343        BUG_ON(!root_item);
1344
1345        root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
1346        root_key.type = BTRFS_ROOT_ITEM_KEY;
1347        root_key.offset = objectid;
1348
1349        if (root->root_key.objectid == objectid) {
1350                /* called by btrfs_init_reloc_root */
1351                ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1352                                      BTRFS_TREE_RELOC_OBJECTID);
1353                BUG_ON(ret);
1354
1355                last_snap = btrfs_root_last_snapshot(&root->root_item);
1356                btrfs_set_root_last_snapshot(&root->root_item,
1357                                             trans->transid - 1);
1358        } else {
1359                /*
1360                 * called by btrfs_reloc_post_snapshot_hook.
1361                 * the source tree is a reloc tree, all tree blocks
1362                 * modified after it was created have RELOC flag
1363                 * set in their headers. so it's OK to not update
1364                 * the 'last_snapshot'.
1365                 */
1366                ret = btrfs_copy_root(trans, root, root->node, &eb,
1367                                      BTRFS_TREE_RELOC_OBJECTID);
1368                BUG_ON(ret);
1369        }
1370
1371        memcpy(root_item, &root->root_item, sizeof(*root_item));
1372        btrfs_set_root_bytenr(root_item, eb->start);
1373        btrfs_set_root_level(root_item, btrfs_header_level(eb));
1374        btrfs_set_root_generation(root_item, trans->transid);
1375
1376        if (root->root_key.objectid == objectid) {
1377                btrfs_set_root_refs(root_item, 0);
1378                memset(&root_item->drop_progress, 0,
1379                       sizeof(struct btrfs_disk_key));
1380                root_item->drop_level = 0;
1381                /*
1382                 * abuse rtransid, it is safe because it is impossible to
1383                 * receive data into a relocation tree.
1384                 */
1385                btrfs_set_root_rtransid(root_item, last_snap);
1386                btrfs_set_root_otransid(root_item, trans->transid);
1387        }
1388
1389        btrfs_tree_unlock(eb);
1390        free_extent_buffer(eb);
1391
1392        ret = btrfs_insert_root(trans, root->fs_info->tree_root,
1393                                &root_key, root_item);
1394        BUG_ON(ret);
1395        kfree(root_item);
1396
1397        reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
1398        BUG_ON(IS_ERR(reloc_root));
1399        reloc_root->last_trans = trans->transid;
1400        return reloc_root;
1401}
1402
1403/*
1404 * create reloc tree for a given fs tree. reloc tree is just a
1405 * snapshot of the fs tree with special root objectid.
1406 */
1407int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1408                          struct btrfs_root *root)
1409{
1410        struct btrfs_root *reloc_root;
1411        struct reloc_control *rc = root->fs_info->reloc_ctl;
1412        struct btrfs_block_rsv *rsv;
1413        int clear_rsv = 0;
1414        int ret;
1415
1416        if (root->reloc_root) {
1417                reloc_root = root->reloc_root;
1418                reloc_root->last_trans = trans->transid;
1419                return 0;
1420        }
1421
1422        if (!rc || !rc->create_reloc_tree ||
1423            root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1424                return 0;
1425
1426        if (!trans->reloc_reserved) {
1427                rsv = trans->block_rsv;
1428                trans->block_rsv = rc->block_rsv;
1429                clear_rsv = 1;
1430        }
1431        reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1432        if (clear_rsv)
1433                trans->block_rsv = rsv;
1434
1435        ret = __add_reloc_root(reloc_root);
1436        BUG_ON(ret < 0);
1437        root->reloc_root = reloc_root;
1438        return 0;
1439}
1440
1441/*
1442 * update root item of reloc tree
1443 */
1444int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1445                            struct btrfs_root *root)
1446{
1447        struct btrfs_root *reloc_root;
1448        struct btrfs_root_item *root_item;
1449        int ret;
1450
1451        if (!root->reloc_root)
1452                goto out;
1453
1454        reloc_root = root->reloc_root;
1455        root_item = &reloc_root->root_item;
1456
1457        if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1458            btrfs_root_refs(root_item) == 0) {
1459                root->reloc_root = NULL;
1460                __del_reloc_root(reloc_root);
1461        }
1462
1463        if (reloc_root->commit_root != reloc_root->node) {
1464                btrfs_set_root_node(root_item, reloc_root->node);
1465                free_extent_buffer(reloc_root->commit_root);
1466                reloc_root->commit_root = btrfs_root_node(reloc_root);
1467        }
1468
1469        ret = btrfs_update_root(trans, root->fs_info->tree_root,
1470                                &reloc_root->root_key, root_item);
1471        BUG_ON(ret);
1472
1473out:
1474        return 0;
1475}
1476
1477/*
1478 * helper to find first cached inode with inode number >= objectid
1479 * in a subvolume
1480 */
1481static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1482{
1483        struct rb_node *node;
1484        struct rb_node *prev;
1485        struct btrfs_inode *entry;
1486        struct inode *inode;
1487
1488        spin_lock(&root->inode_lock);
1489again:
1490        node = root->inode_tree.rb_node;
1491        prev = NULL;
1492        while (node) {
1493                prev = node;
1494                entry = rb_entry(node, struct btrfs_inode, rb_node);
1495
1496                if (objectid < btrfs_ino(&entry->vfs_inode))
1497                        node = node->rb_left;
1498                else if (objectid > btrfs_ino(&entry->vfs_inode))
1499                        node = node->rb_right;
1500                else
1501                        break;
1502        }
1503        if (!node) {
1504                while (prev) {
1505                        entry = rb_entry(prev, struct btrfs_inode, rb_node);
1506                        if (objectid <= btrfs_ino(&entry->vfs_inode)) {
1507                                node = prev;
1508                                break;
1509                        }
1510                        prev = rb_next(prev);
1511                }
1512        }
1513        while (node) {
1514                entry = rb_entry(node, struct btrfs_inode, rb_node);
1515                inode = igrab(&entry->vfs_inode);
1516                if (inode) {
1517                        spin_unlock(&root->inode_lock);
1518                        return inode;
1519                }
1520
1521                objectid = btrfs_ino(&entry->vfs_inode) + 1;
1522                if (cond_resched_lock(&root->inode_lock))
1523                        goto again;
1524
1525                node = rb_next(node);
1526        }
1527        spin_unlock(&root->inode_lock);
1528        return NULL;
1529}
1530
1531static int in_block_group(u64 bytenr,
1532                          struct btrfs_block_group_cache *block_group)
1533{
1534        if (bytenr >= block_group->key.objectid &&
1535            bytenr < block_group->key.objectid + block_group->key.offset)
1536                return 1;
1537        return 0;
1538}
1539
1540/*
1541 * get new location of data
1542 */
1543static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1544                            u64 bytenr, u64 num_bytes)
1545{
1546        struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1547        struct btrfs_path *path;
1548        struct btrfs_file_extent_item *fi;
1549        struct extent_buffer *leaf;
1550        int ret;
1551
1552        path = btrfs_alloc_path();
1553        if (!path)
1554                return -ENOMEM;
1555
1556        bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1557        ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode),
1558                                       bytenr, 0);
1559        if (ret < 0)
1560                goto out;
1561        if (ret > 0) {
1562                ret = -ENOENT;
1563                goto out;
1564        }
1565
1566        leaf = path->nodes[0];
1567        fi = btrfs_item_ptr(leaf, path->slots[0],
1568                            struct btrfs_file_extent_item);
1569
1570        BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1571               btrfs_file_extent_compression(leaf, fi) ||
1572               btrfs_file_extent_encryption(leaf, fi) ||
1573               btrfs_file_extent_other_encoding(leaf, fi));
1574
1575        if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1576                ret = -EINVAL;
1577                goto out;
1578        }
1579
1580        *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1581        ret = 0;
1582out:
1583        btrfs_free_path(path);
1584        return ret;
1585}
1586
1587/*
1588 * update file extent items in the tree leaf to point to
1589 * the new locations.
1590 */
1591static noinline_for_stack
1592int replace_file_extents(struct btrfs_trans_handle *trans,
1593                         struct reloc_control *rc,
1594                         struct btrfs_root *root,
1595                         struct extent_buffer *leaf)
1596{
1597        struct btrfs_key key;
1598        struct btrfs_file_extent_item *fi;
1599        struct inode *inode = NULL;
1600        u64 parent;
1601        u64 bytenr;
1602        u64 new_bytenr = 0;
1603        u64 num_bytes;
1604        u64 end;
1605        u32 nritems;
1606        u32 i;
1607        int ret = 0;
1608        int first = 1;
1609        int dirty = 0;
1610
1611        if (rc->stage != UPDATE_DATA_PTRS)
1612                return 0;
1613
1614        /* reloc trees always use full backref */
1615        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1616                parent = leaf->start;
1617        else
1618                parent = 0;
1619
1620        nritems = btrfs_header_nritems(leaf);
1621        for (i = 0; i < nritems; i++) {
1622                cond_resched();
1623                btrfs_item_key_to_cpu(leaf, &key, i);
1624                if (key.type != BTRFS_EXTENT_DATA_KEY)
1625                        continue;
1626                fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1627                if (btrfs_file_extent_type(leaf, fi) ==
1628                    BTRFS_FILE_EXTENT_INLINE)
1629                        continue;
1630                bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1631                num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1632                if (bytenr == 0)
1633                        continue;
1634                if (!in_block_group(bytenr, rc->block_group))
1635                        continue;
1636
1637                /*
1638                 * if we are modifying block in fs tree, wait for readpage
1639                 * to complete and drop the extent cache
1640                 */
1641                if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1642                        if (first) {
1643                                inode = find_next_inode(root, key.objectid);
1644                                first = 0;
1645                        } else if (inode && btrfs_ino(inode) < key.objectid) {
1646                                btrfs_add_delayed_iput(inode);
1647                                inode = find_next_inode(root, key.objectid);
1648                        }
1649                        if (inode && btrfs_ino(inode) == key.objectid) {
1650                                end = key.offset +
1651                                      btrfs_file_extent_num_bytes(leaf, fi);
1652                                WARN_ON(!IS_ALIGNED(key.offset,
1653                                                    root->sectorsize));
1654                                WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1655                                end--;
1656                                ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1657                                                      key.offset, end);
1658                                if (!ret)
1659                                        continue;
1660
1661                                btrfs_drop_extent_cache(inode, key.offset, end,
1662                                                        1);
1663                                unlock_extent(&BTRFS_I(inode)->io_tree,
1664                                              key.offset, end);
1665                        }
1666                }
1667
1668                ret = get_new_location(rc->data_inode, &new_bytenr,
1669                                       bytenr, num_bytes);
1670                if (ret) {
1671                        /*
1672                         * Don't have to abort since we've not changed anything
1673                         * in the file extent yet.
1674                         */
1675                        break;
1676                }
1677
1678                btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1679                dirty = 1;
1680
1681                key.offset -= btrfs_file_extent_offset(leaf, fi);
1682                ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1683                                           num_bytes, parent,
1684                                           btrfs_header_owner(leaf),
1685                                           key.objectid, key.offset, 1);
1686                if (ret) {
1687                        btrfs_abort_transaction(trans, root, ret);
1688                        break;
1689                }
1690
1691                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1692                                        parent, btrfs_header_owner(leaf),
1693                                        key.objectid, key.offset, 1);
1694                if (ret) {
1695                        btrfs_abort_transaction(trans, root, ret);
1696                        break;
1697                }
1698        }
1699        if (dirty)
1700                btrfs_mark_buffer_dirty(leaf);
1701        if (inode)
1702                btrfs_add_delayed_iput(inode);
1703        return ret;
1704}
1705
1706static noinline_for_stack
1707int memcmp_node_keys(struct extent_buffer *eb, int slot,
1708                     struct btrfs_path *path, int level)
1709{
1710        struct btrfs_disk_key key1;
1711        struct btrfs_disk_key key2;
1712        btrfs_node_key(eb, &key1, slot);
1713        btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1714        return memcmp(&key1, &key2, sizeof(key1));
1715}
1716
1717/*
1718 * try to replace tree blocks in fs tree with the new blocks
1719 * in reloc tree. tree blocks haven't been modified since the
1720 * reloc tree was create can be replaced.
1721 *
1722 * if a block was replaced, level of the block + 1 is returned.
1723 * if no block got replaced, 0 is returned. if there are other
1724 * errors, a negative error number is returned.
1725 */
1726static noinline_for_stack
1727int replace_path(struct btrfs_trans_handle *trans,
1728                 struct btrfs_root *dest, struct btrfs_root *src,
1729                 struct btrfs_path *path, struct btrfs_key *next_key,
1730                 int lowest_level, int max_level)
1731{
1732        struct extent_buffer *eb;
1733        struct extent_buffer *parent;
1734        struct btrfs_key key;
1735        u64 old_bytenr;
1736        u64 new_bytenr;
1737        u64 old_ptr_gen;
1738        u64 new_ptr_gen;
1739        u64 last_snapshot;
1740        u32 blocksize;
1741        int cow = 0;
1742        int level;
1743        int ret;
1744        int slot;
1745
1746        BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1747        BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1748
1749        last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1750again:
1751        slot = path->slots[lowest_level];
1752        btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1753
1754        eb = btrfs_lock_root_node(dest);
1755        btrfs_set_lock_blocking(eb);
1756        level = btrfs_header_level(eb);
1757
1758        if (level < lowest_level) {
1759                btrfs_tree_unlock(eb);
1760                free_extent_buffer(eb);
1761                return 0;
1762        }
1763
1764        if (cow) {
1765                ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1766                BUG_ON(ret);
1767        }
1768        btrfs_set_lock_blocking(eb);
1769
1770        if (next_key) {
1771                next_key->objectid = (u64)-1;
1772                next_key->type = (u8)-1;
1773                next_key->offset = (u64)-1;
1774        }
1775
1776        parent = eb;
1777        while (1) {
1778                level = btrfs_header_level(parent);
1779                BUG_ON(level < lowest_level);
1780
1781                ret = btrfs_bin_search(parent, &key, level, &slot);
1782                if (ret && slot > 0)
1783                        slot--;
1784
1785                if (next_key && slot + 1 < btrfs_header_nritems(parent))
1786                        btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1787
1788                old_bytenr = btrfs_node_blockptr(parent, slot);
1789                blocksize = btrfs_level_size(dest, level - 1);
1790                old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1791
1792                if (level <= max_level) {
1793                        eb = path->nodes[level];
1794                        new_bytenr = btrfs_node_blockptr(eb,
1795                                                        path->slots[level]);
1796                        new_ptr_gen = btrfs_node_ptr_generation(eb,
1797                                                        path->slots[level]);
1798                } else {
1799                        new_bytenr = 0;
1800                        new_ptr_gen = 0;
1801                }
1802
1803                if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
1804                        ret = level;
1805                        break;
1806                }
1807
1808                if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1809                    memcmp_node_keys(parent, slot, path, level)) {
1810                        if (level <= lowest_level) {
1811                                ret = 0;
1812                                break;
1813                        }
1814
1815                        eb = read_tree_block(dest, old_bytenr, blocksize,
1816                                             old_ptr_gen);
1817                        if (!eb || !extent_buffer_uptodate(eb)) {
1818                                ret = (!eb) ? -ENOMEM : -EIO;
1819                                free_extent_buffer(eb);
1820                                break;
1821                        }
1822                        btrfs_tree_lock(eb);
1823                        if (cow) {
1824                                ret = btrfs_cow_block(trans, dest, eb, parent,
1825                                                      slot, &eb);
1826                                BUG_ON(ret);
1827                        }
1828                        btrfs_set_lock_blocking(eb);
1829
1830                        btrfs_tree_unlock(parent);
1831                        free_extent_buffer(parent);
1832
1833                        parent = eb;
1834                        continue;
1835                }
1836
1837                if (!cow) {
1838                        btrfs_tree_unlock(parent);
1839                        free_extent_buffer(parent);
1840                        cow = 1;
1841                        goto again;
1842                }
1843
1844                btrfs_node_key_to_cpu(path->nodes[level], &key,
1845                                      path->slots[level]);
1846                btrfs_release_path(path);
1847
1848                path->lowest_level = level;
1849                ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1850                path->lowest_level = 0;
1851                BUG_ON(ret);
1852
1853                /*
1854                 * swap blocks in fs tree and reloc tree.
1855                 */
1856                btrfs_set_node_blockptr(parent, slot, new_bytenr);
1857                btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1858                btrfs_mark_buffer_dirty(parent);
1859
1860                btrfs_set_node_blockptr(path->nodes[level],
1861                                        path->slots[level], old_bytenr);
1862                btrfs_set_node_ptr_generation(path->nodes[level],
1863                                              path->slots[level], old_ptr_gen);
1864                btrfs_mark_buffer_dirty(path->nodes[level]);
1865
1866                ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1867                                        path->nodes[level]->start,
1868                                        src->root_key.objectid, level - 1, 0,
1869                                        1);
1870                BUG_ON(ret);
1871                ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1872                                        0, dest->root_key.objectid, level - 1,
1873                                        0, 1);
1874                BUG_ON(ret);
1875
1876                ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1877                                        path->nodes[level]->start,
1878                                        src->root_key.objectid, level - 1, 0,
1879                                        1);
1880                BUG_ON(ret);
1881
1882                ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1883                                        0, dest->root_key.objectid, level - 1,
1884                                        0, 1);
1885                BUG_ON(ret);
1886
1887                btrfs_unlock_up_safe(path, 0);
1888
1889                ret = level;
1890                break;
1891        }
1892        btrfs_tree_unlock(parent);
1893        free_extent_buffer(parent);
1894        return ret;
1895}
1896
1897/*
1898 * helper to find next relocated block in reloc tree
1899 */
1900static noinline_for_stack
1901int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1902                       int *level)
1903{
1904        struct extent_buffer *eb;
1905        int i;
1906        u64 last_snapshot;
1907        u32 nritems;
1908
1909        last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1910
1911        for (i = 0; i < *level; i++) {
1912                free_extent_buffer(path->nodes[i]);
1913                path->nodes[i] = NULL;
1914        }
1915
1916        for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1917                eb = path->nodes[i];
1918                nritems = btrfs_header_nritems(eb);
1919                while (path->slots[i] + 1 < nritems) {
1920                        path->slots[i]++;
1921                        if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1922                            last_snapshot)
1923                                continue;
1924
1925                        *level = i;
1926                        return 0;
1927                }
1928                free_extent_buffer(path->nodes[i]);
1929                path->nodes[i] = NULL;
1930        }
1931        return 1;
1932}
1933
1934/*
1935 * walk down reloc tree to find relocated block of lowest level
1936 */
1937static noinline_for_stack
1938int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1939                         int *level)
1940{
1941        struct extent_buffer *eb = NULL;
1942        int i;
1943        u64 bytenr;
1944        u64 ptr_gen = 0;
1945        u64 last_snapshot;
1946        u32 blocksize;
1947        u32 nritems;
1948
1949        last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1950
1951        for (i = *level; i > 0; i--) {
1952                eb = path->nodes[i];
1953                nritems = btrfs_header_nritems(eb);
1954                while (path->slots[i] < nritems) {
1955                        ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1956                        if (ptr_gen > last_snapshot)
1957                                break;
1958                        path->slots[i]++;
1959                }
1960                if (path->slots[i] >= nritems) {
1961                        if (i == *level)
1962                                break;
1963                        *level = i + 1;
1964                        return 0;
1965                }
1966                if (i == 1) {
1967                        *level = i;
1968                        return 0;
1969                }
1970
1971                bytenr = btrfs_node_blockptr(eb, path->slots[i]);
1972                blocksize = btrfs_level_size(root, i - 1);
1973                eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
1974                if (!eb || !extent_buffer_uptodate(eb)) {
1975                        free_extent_buffer(eb);
1976                        return -EIO;
1977                }
1978                BUG_ON(btrfs_header_level(eb) != i - 1);
1979                path->nodes[i - 1] = eb;
1980                path->slots[i - 1] = 0;
1981        }
1982        return 1;
1983}
1984
1985/*
1986 * invalidate extent cache for file extents whose key in range of
1987 * [min_key, max_key)
1988 */
1989static int invalidate_extent_cache(struct btrfs_root *root,
1990                                   struct btrfs_key *min_key,
1991                                   struct btrfs_key *max_key)
1992{
1993        struct inode *inode = NULL;
1994        u64 objectid;
1995        u64 start, end;
1996        u64 ino;
1997
1998        objectid = min_key->objectid;
1999        while (1) {
2000                cond_resched();
2001                iput(inode);
2002
2003                if (objectid > max_key->objectid)
2004                        break;
2005
2006                inode = find_next_inode(root, objectid);
2007                if (!inode)
2008                        break;
2009                ino = btrfs_ino(inode);
2010
2011                if (ino > max_key->objectid) {
2012                        iput(inode);
2013                        break;
2014                }
2015
2016                objectid = ino + 1;
2017                if (!S_ISREG(inode->i_mode))
2018                        continue;
2019
2020                if (unlikely(min_key->objectid == ino)) {
2021                        if (min_key->type > BTRFS_EXTENT_DATA_KEY)
2022                                continue;
2023                        if (min_key->type < BTRFS_EXTENT_DATA_KEY)
2024                                start = 0;
2025                        else {
2026                                start = min_key->offset;
2027                                WARN_ON(!IS_ALIGNED(start, root->sectorsize));
2028                        }
2029                } else {
2030                        start = 0;
2031                }
2032
2033                if (unlikely(max_key->objectid == ino)) {
2034                        if (max_key->type < BTRFS_EXTENT_DATA_KEY)
2035                                continue;
2036                        if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
2037                                end = (u64)-1;
2038                        } else {
2039                                if (max_key->offset == 0)
2040                                        continue;
2041                                end = max_key->offset;
2042                                WARN_ON(!IS_ALIGNED(end, root->sectorsize));
2043                                end--;
2044                        }
2045                } else {
2046                        end = (u64)-1;
2047                }
2048
2049                /* the lock_extent waits for readpage to complete */
2050                lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2051                btrfs_drop_extent_cache(inode, start, end, 1);
2052                unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2053        }
2054        return 0;
2055}
2056
2057static int find_next_key(struct btrfs_path *path, int level,
2058                         struct btrfs_key *key)
2059
2060{
2061        while (level < BTRFS_MAX_LEVEL) {
2062                if (!path->nodes[level])
2063                        break;
2064                if (path->slots[level] + 1 <
2065                    btrfs_header_nritems(path->nodes[level])) {
2066                        btrfs_node_key_to_cpu(path->nodes[level], key,
2067                                              path->slots[level] + 1);
2068                        return 0;
2069                }
2070                level++;
2071        }
2072        return 1;
2073}
2074
2075/*
2076 * merge the relocated tree blocks in reloc tree with corresponding
2077 * fs tree.
2078 */
2079static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2080                                               struct btrfs_root *root)
2081{
2082        LIST_HEAD(inode_list);
2083        struct btrfs_key key;
2084        struct btrfs_key next_key;
2085        struct btrfs_trans_handle *trans = NULL;
2086        struct btrfs_root *reloc_root;
2087        struct btrfs_root_item *root_item;
2088        struct btrfs_path *path;
2089        struct extent_buffer *leaf;
2090        int level;
2091        int max_level;
2092        int replaced = 0;
2093        int ret;
2094        int err = 0;
2095        u32 min_reserved;
2096
2097        path = btrfs_alloc_path();
2098        if (!path)
2099                return -ENOMEM;
2100        path->reada = 1;
2101
2102        reloc_root = root->reloc_root;
2103        root_item = &reloc_root->root_item;
2104
2105        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2106                level = btrfs_root_level(root_item);
2107                extent_buffer_get(reloc_root->node);
2108                path->nodes[level] = reloc_root->node;
2109                path->slots[level] = 0;
2110        } else {
2111                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2112
2113                level = root_item->drop_level;
2114                BUG_ON(level == 0);
2115                path->lowest_level = level;
2116                ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
2117                path->lowest_level = 0;
2118                if (ret < 0) {
2119                        btrfs_free_path(path);
2120                        return ret;
2121                }
2122
2123                btrfs_node_key_to_cpu(path->nodes[level], &next_key,
2124                                      path->slots[level]);
2125                WARN_ON(memcmp(&key, &next_key, sizeof(key)));
2126
2127                btrfs_unlock_up_safe(path, 0);
2128        }
2129
2130        min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2131        memset(&next_key, 0, sizeof(next_key));
2132
2133        while (1) {
2134                ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2135                                             BTRFS_RESERVE_FLUSH_ALL);
2136                if (ret) {
2137                        err = ret;
2138                        goto out;
2139                }
2140                trans = btrfs_start_transaction(root, 0);
2141                if (IS_ERR(trans)) {
2142                        err = PTR_ERR(trans);
2143                        trans = NULL;
2144                        goto out;
2145                }
2146                trans->block_rsv = rc->block_rsv;
2147
2148                replaced = 0;
2149                max_level = level;
2150
2151                ret = walk_down_reloc_tree(reloc_root, path, &level);
2152                if (ret < 0) {
2153                        err = ret;
2154                        goto out;
2155                }
2156                if (ret > 0)
2157                        break;
2158
2159                if (!find_next_key(path, level, &key) &&
2160                    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
2161                        ret = 0;
2162                } else {
2163                        ret = replace_path(trans, root, reloc_root, path,
2164                                           &next_key, level, max_level);
2165                }
2166                if (ret < 0) {
2167                        err = ret;
2168                        goto out;
2169                }
2170
2171                if (ret > 0) {
2172                        level = ret;
2173                        btrfs_node_key_to_cpu(path->nodes[level], &key,
2174                                              path->slots[level]);
2175                        replaced = 1;
2176                }
2177
2178                ret = walk_up_reloc_tree(reloc_root, path, &level);
2179                if (ret > 0)
2180                        break;
2181
2182                BUG_ON(level == 0);
2183                /*
2184                 * save the merging progress in the drop_progress.
2185                 * this is OK since root refs == 1 in this case.
2186                 */
2187                btrfs_node_key(path->nodes[level], &root_item->drop_progress,
2188                               path->slots[level]);
2189                root_item->drop_level = level;
2190
2191                btrfs_end_transaction_throttle(trans, root);
2192                trans = NULL;
2193
2194                btrfs_btree_balance_dirty(root);
2195
2196                if (replaced && rc->stage == UPDATE_DATA_PTRS)
2197                        invalidate_extent_cache(root, &key, &next_key);
2198        }
2199
2200        /*
2201         * handle the case only one block in the fs tree need to be
2202         * relocated and the block is tree root.
2203         */
2204        leaf = btrfs_lock_root_node(root);
2205        ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
2206        btrfs_tree_unlock(leaf);
2207        free_extent_buffer(leaf);
2208        if (ret < 0)
2209                err = ret;
2210out:
2211        btrfs_free_path(path);
2212
2213        if (err == 0) {
2214                memset(&root_item->drop_progress, 0,
2215                       sizeof(root_item->drop_progress));
2216                root_item->drop_level = 0;
2217                btrfs_set_root_refs(root_item, 0);
2218                btrfs_update_reloc_root(trans, root);
2219        }
2220
2221        if (trans)
2222                btrfs_end_transaction_throttle(trans, root);
2223
2224        btrfs_btree_balance_dirty(root);
2225
2226        if (replaced && rc->stage == UPDATE_DATA_PTRS)
2227                invalidate_extent_cache(root, &key, &next_key);
2228
2229        return err;
2230}
2231
2232static noinline_for_stack
2233int prepare_to_merge(struct reloc_control *rc, int err)
2234{
2235        struct btrfs_root *root = rc->extent_root;
2236        struct btrfs_root *reloc_root;
2237        struct btrfs_trans_handle *trans;
2238        LIST_HEAD(reloc_roots);
2239        u64 num_bytes = 0;
2240        int ret;
2241
2242        mutex_lock(&root->fs_info->reloc_mutex);
2243        rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2244        rc->merging_rsv_size += rc->nodes_relocated * 2;
2245        mutex_unlock(&root->fs_info->reloc_mutex);
2246
2247again:
2248        if (!err) {
2249                num_bytes = rc->merging_rsv_size;
2250                ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2251                                          BTRFS_RESERVE_FLUSH_ALL);
2252                if (ret)
2253                        err = ret;
2254        }
2255
2256        trans = btrfs_join_transaction(rc->extent_root);
2257        if (IS_ERR(trans)) {
2258                if (!err)
2259                        btrfs_block_rsv_release(rc->extent_root,
2260                                                rc->block_rsv, num_bytes);
2261                return PTR_ERR(trans);
2262        }
2263
2264        if (!err) {
2265                if (num_bytes != rc->merging_rsv_size) {
2266                        btrfs_end_transaction(trans, rc->extent_root);
2267                        btrfs_block_rsv_release(rc->extent_root,
2268                                                rc->block_rsv, num_bytes);
2269                        goto again;
2270                }
2271        }
2272
2273        rc->merge_reloc_tree = 1;
2274
2275        while (!list_empty(&rc->reloc_roots)) {
2276                reloc_root = list_entry(rc->reloc_roots.next,
2277                                        struct btrfs_root, root_list);
2278                list_del_init(&reloc_root->root_list);
2279
2280                root = read_fs_root(reloc_root->fs_info,
2281                                    reloc_root->root_key.offset);
2282                BUG_ON(IS_ERR(root));
2283                BUG_ON(root->reloc_root != reloc_root);
2284
2285                /*
2286                 * set reference count to 1, so btrfs_recover_relocation
2287                 * knows it should resumes merging
2288                 */
2289                if (!err)
2290                        btrfs_set_root_refs(&reloc_root->root_item, 1);
2291                btrfs_update_reloc_root(trans, root);
2292
2293                list_add(&reloc_root->root_list, &reloc_roots);
2294        }
2295
2296        list_splice(&reloc_roots, &rc->reloc_roots);
2297
2298        if (!err)
2299                btrfs_commit_transaction(trans, rc->extent_root);
2300        else
2301                btrfs_end_transaction(trans, rc->extent_root);
2302        return err;
2303}
2304
2305static noinline_for_stack
2306void free_reloc_roots(struct list_head *list)
2307{
2308        struct btrfs_root *reloc_root;
2309
2310        while (!list_empty(list)) {
2311                reloc_root = list_entry(list->next, struct btrfs_root,
2312                                        root_list);
2313                __del_reloc_root(reloc_root);
2314        }
2315}
2316
2317static noinline_for_stack
2318int merge_reloc_roots(struct reloc_control *rc)
2319{
2320        struct btrfs_root *root;
2321        struct btrfs_root *reloc_root;
2322        u64 last_snap;
2323        u64 otransid;
2324        u64 objectid;
2325        LIST_HEAD(reloc_roots);
2326        int found = 0;
2327        int ret = 0;
2328again:
2329        root = rc->extent_root;
2330
2331        /*
2332         * this serializes us with btrfs_record_root_in_transaction,
2333         * we have to make sure nobody is in the middle of
2334         * adding their roots to the list while we are
2335         * doing this splice
2336         */
2337        mutex_lock(&root->fs_info->reloc_mutex);
2338        list_splice_init(&rc->reloc_roots, &reloc_roots);
2339        mutex_unlock(&root->fs_info->reloc_mutex);
2340
2341        while (!list_empty(&reloc_roots)) {
2342                found = 1;
2343                reloc_root = list_entry(reloc_roots.next,
2344                                        struct btrfs_root, root_list);
2345
2346                if (btrfs_root_refs(&reloc_root->root_item) > 0) {
2347                        root = read_fs_root(reloc_root->fs_info,
2348                                            reloc_root->root_key.offset);
2349                        BUG_ON(IS_ERR(root));
2350                        BUG_ON(root->reloc_root != reloc_root);
2351
2352                        ret = merge_reloc_root(rc, root);
2353                        if (ret) {
2354                                if (list_empty(&reloc_root->root_list))
2355                                        list_add_tail(&reloc_root->root_list,
2356                                                      &reloc_roots);
2357                                goto out;
2358                        }
2359                } else {
2360                        list_del_init(&reloc_root->root_list);
2361                }
2362
2363                /*
2364                 * we keep the old last snapshod transid in rtranid when we
2365                 * created the relocation tree.
2366                 */
2367                last_snap = btrfs_root_rtransid(&reloc_root->root_item);
2368                otransid = btrfs_root_otransid(&reloc_root->root_item);
2369                objectid = reloc_root->root_key.offset;
2370
2371                ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2372                if (ret < 0) {
2373                        if (list_empty(&reloc_root->root_list))
2374                                list_add_tail(&reloc_root->root_list,
2375                                              &reloc_roots);
2376                        goto out;
2377                }
2378        }
2379
2380        if (found) {
2381                found = 0;
2382                goto again;
2383        }
2384out:
2385        if (ret) {
2386                btrfs_std_error(root->fs_info, ret);
2387                if (!list_empty(&reloc_roots))
2388                        free_reloc_roots(&reloc_roots);
2389
2390                /* new reloc root may be added */
2391                mutex_lock(&root->fs_info->reloc_mutex);
2392                list_splice_init(&rc->reloc_roots, &reloc_roots);
2393                mutex_unlock(&root->fs_info->reloc_mutex);
2394                if (!list_empty(&reloc_roots))
2395                        free_reloc_roots(&reloc_roots);
2396        }
2397
2398        BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2399        return ret;
2400}
2401
2402static void free_block_list(struct rb_root *blocks)
2403{
2404        struct tree_block *block;
2405        struct rb_node *rb_node;
2406        while ((rb_node = rb_first(blocks))) {
2407                block = rb_entry(rb_node, struct tree_block, rb_node);
2408                rb_erase(rb_node, blocks);
2409                kfree(block);
2410        }
2411}
2412
2413static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2414                                      struct btrfs_root *reloc_root)
2415{
2416        struct btrfs_root *root;
2417
2418        if (reloc_root->last_trans == trans->transid)
2419                return 0;
2420
2421        root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
2422        BUG_ON(IS_ERR(root));
2423        BUG_ON(root->reloc_root != reloc_root);
2424
2425        return btrfs_record_root_in_trans(trans, root);
2426}
2427
2428static noinline_for_stack
2429struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2430                                     struct reloc_control *rc,
2431                                     struct backref_node *node,
2432                                     struct backref_edge *edges[])
2433{
2434        struct backref_node *next;
2435        struct btrfs_root *root;
2436        int index = 0;
2437
2438        next = node;
2439        while (1) {
2440                cond_resched();
2441                next = walk_up_backref(next, edges, &index);
2442                root = next->root;
2443                BUG_ON(!root);
2444                BUG_ON(!root->ref_cows);
2445
2446                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2447                        record_reloc_root_in_trans(trans, root);
2448                        break;
2449                }
2450
2451                btrfs_record_root_in_trans(trans, root);
2452                root = root->reloc_root;
2453
2454                if (next->new_bytenr != root->node->start) {
2455                        BUG_ON(next->new_bytenr);
2456                        BUG_ON(!list_empty(&next->list));
2457                        next->new_bytenr = root->node->start;
2458                        next->root = root;
2459                        list_add_tail(&next->list,
2460                                      &rc->backref_cache.changed);
2461                        __mark_block_processed(rc, next);
2462                        break;
2463                }
2464
2465                WARN_ON(1);
2466                root = NULL;
2467                next = walk_down_backref(edges, &index);
2468                if (!next || next->level <= node->level)
2469                        break;
2470        }
2471        if (!root)
2472                return NULL;
2473
2474        next = node;
2475        /* setup backref node path for btrfs_reloc_cow_block */
2476        while (1) {
2477                rc->backref_cache.path[next->level] = next;
2478                if (--index < 0)
2479                        break;
2480                next = edges[index]->node[UPPER];
2481        }
2482        return root;
2483}
2484
2485/*
2486 * select a tree root for relocation. return NULL if the block
2487 * is reference counted. we should use do_relocation() in this
2488 * case. return a tree root pointer if the block isn't reference
2489 * counted. return -ENOENT if the block is root of reloc tree.
2490 */
2491static noinline_for_stack
2492struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
2493                                   struct backref_node *node)
2494{
2495        struct backref_node *next;
2496        struct btrfs_root *root;
2497        struct btrfs_root *fs_root = NULL;
2498        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2499        int index = 0;
2500
2501        next = node;
2502        while (1) {
2503                cond_resched();
2504                next = walk_up_backref(next, edges, &index);
2505                root = next->root;
2506                BUG_ON(!root);
2507
2508                /* no other choice for non-references counted tree */
2509                if (!root->ref_cows)
2510                        return root;
2511
2512                if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2513                        fs_root = root;
2514
2515                if (next != node)
2516                        return NULL;
2517
2518                next = walk_down_backref(edges, &index);
2519                if (!next || next->level <= node->level)
2520                        break;
2521        }
2522
2523        if (!fs_root)
2524                return ERR_PTR(-ENOENT);
2525        return fs_root;
2526}
2527
2528static noinline_for_stack
2529u64 calcu_metadata_size(struct reloc_control *rc,
2530                        struct backref_node *node, int reserve)
2531{
2532        struct backref_node *next = node;
2533        struct backref_edge *edge;
2534        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2535        u64 num_bytes = 0;
2536        int index = 0;
2537
2538        BUG_ON(reserve && node->processed);
2539
2540        while (next) {
2541                cond_resched();
2542                while (1) {
2543                        if (next->processed && (reserve || next != node))
2544                                break;
2545
2546                        num_bytes += btrfs_level_size(rc->extent_root,
2547                                                      next->level);
2548
2549                        if (list_empty(&next->upper))
2550                                break;
2551
2552                        edge = list_entry(next->upper.next,
2553                                          struct backref_edge, list[LOWER]);
2554                        edges[index++] = edge;
2555                        next = edge->node[UPPER];
2556                }
2557                next = walk_down_backref(edges, &index);
2558        }
2559        return num_bytes;
2560}
2561
2562static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2563                                  struct reloc_control *rc,
2564                                  struct backref_node *node)
2565{
2566        struct btrfs_root *root = rc->extent_root;
2567        u64 num_bytes;
2568        int ret;
2569        u64 tmp;
2570
2571        num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2572
2573        trans->block_rsv = rc->block_rsv;
2574        rc->reserved_bytes += num_bytes;
2575        ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
2576                                BTRFS_RESERVE_FLUSH_ALL);
2577        if (ret) {
2578                if (ret == -EAGAIN) {
2579                        tmp = rc->extent_root->nodesize *
2580                                RELOCATION_RESERVED_NODES;
2581                        while (tmp <= rc->reserved_bytes)
2582                                tmp <<= 1;
2583                        /*
2584                         * only one thread can access block_rsv at this point,
2585                         * so we don't need hold lock to protect block_rsv.
2586                         * we expand more reservation size here to allow enough
2587                         * space for relocation and we will return eailer in
2588                         * enospc case.
2589                         */
2590                        rc->block_rsv->size = tmp + rc->extent_root->nodesize *
2591                                              RELOCATION_RESERVED_NODES;
2592                }
2593                return ret;
2594        }
2595
2596        return 0;
2597}
2598
2599/*
2600 * relocate a block tree, and then update pointers in upper level
2601 * blocks that reference the block to point to the new location.
2602 *
2603 * if called by link_to_upper, the block has already been relocated.
2604 * in that case this function just updates pointers.
2605 */
2606static int do_relocation(struct btrfs_trans_handle *trans,
2607                         struct reloc_control *rc,
2608                         struct backref_node *node,
2609                         struct btrfs_key *key,
2610                         struct btrfs_path *path, int lowest)
2611{
2612        struct backref_node *upper;
2613        struct backref_edge *edge;
2614        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2615        struct btrfs_root *root;
2616        struct extent_buffer *eb;
2617        u32 blocksize;
2618        u64 bytenr;
2619        u64 generation;
2620        int slot;
2621        int ret;
2622        int err = 0;
2623
2624        BUG_ON(lowest && node->eb);
2625
2626        path->lowest_level = node->level + 1;
2627        rc->backref_cache.path[node->level] = node;
2628        list_for_each_entry(edge, &node->upper, list[LOWER]) {
2629                cond_resched();
2630
2631                upper = edge->node[UPPER];
2632                root = select_reloc_root(trans, rc, upper, edges);
2633                BUG_ON(!root);
2634
2635                if (upper->eb && !upper->locked) {
2636                        if (!lowest) {
2637                                ret = btrfs_bin_search(upper->eb, key,
2638                                                       upper->level, &slot);
2639                                BUG_ON(ret);
2640                                bytenr = btrfs_node_blockptr(upper->eb, slot);
2641                                if (node->eb->start == bytenr)
2642                                        goto next;
2643                        }
2644                        drop_node_buffer(upper);
2645                }
2646
2647                if (!upper->eb) {
2648                        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2649                        if (ret < 0) {
2650                                err = ret;
2651                                break;
2652                        }
2653                        BUG_ON(ret > 0);
2654
2655                        if (!upper->eb) {
2656                                upper->eb = path->nodes[upper->level];
2657                                path->nodes[upper->level] = NULL;
2658                        } else {
2659                                BUG_ON(upper->eb != path->nodes[upper->level]);
2660                        }
2661
2662                        upper->locked = 1;
2663                        path->locks[upper->level] = 0;
2664
2665                        slot = path->slots[upper->level];
2666                        btrfs_release_path(path);
2667                } else {
2668                        ret = btrfs_bin_search(upper->eb, key, upper->level,
2669                                               &slot);
2670                        BUG_ON(ret);
2671                }
2672
2673                bytenr = btrfs_node_blockptr(upper->eb, slot);
2674                if (lowest) {
2675                        BUG_ON(bytenr != node->bytenr);
2676                } else {
2677                        if (node->eb->start == bytenr)
2678                                goto next;
2679                }
2680
2681                blocksize = btrfs_level_size(root, node->level);
2682                generation = btrfs_node_ptr_generation(upper->eb, slot);
2683                eb = read_tree_block(root, bytenr, blocksize, generation);
2684                if (!eb || !extent_buffer_uptodate(eb)) {
2685                        free_extent_buffer(eb);
2686                        err = -EIO;
2687                        goto next;
2688                }
2689                btrfs_tree_lock(eb);
2690                btrfs_set_lock_blocking(eb);
2691
2692                if (!node->eb) {
2693                        ret = btrfs_cow_block(trans, root, eb, upper->eb,
2694                                              slot, &eb);
2695                        btrfs_tree_unlock(eb);
2696                        free_extent_buffer(eb);
2697                        if (ret < 0) {
2698                                err = ret;
2699                                goto next;
2700                        }
2701                        BUG_ON(node->eb != eb);
2702                } else {
2703                        btrfs_set_node_blockptr(upper->eb, slot,
2704                                                node->eb->start);
2705                        btrfs_set_node_ptr_generation(upper->eb, slot,
2706                                                      trans->transid);
2707                        btrfs_mark_buffer_dirty(upper->eb);
2708
2709                        ret = btrfs_inc_extent_ref(trans, root,
2710                                                node->eb->start, blocksize,
2711                                                upper->eb->start,
2712                                                btrfs_header_owner(upper->eb),
2713                                                node->level, 0, 1);
2714                        BUG_ON(ret);
2715
2716                        ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2717                        BUG_ON(ret);
2718                }
2719next:
2720                if (!upper->pending)
2721                        drop_node_buffer(upper);
2722                else
2723                        unlock_node_buffer(upper);
2724                if (err)
2725                        break;
2726        }
2727
2728        if (!err && node->pending) {
2729                drop_node_buffer(node);
2730                list_move_tail(&node->list, &rc->backref_cache.changed);
2731                node->pending = 0;
2732        }
2733
2734        path->lowest_level = 0;
2735        BUG_ON(err == -ENOSPC);
2736        return err;
2737}
2738
2739static int link_to_upper(struct btrfs_trans_handle *trans,
2740                         struct reloc_control *rc,
2741                         struct backref_node *node,
2742                         struct btrfs_path *path)
2743{
2744        struct btrfs_key key;
2745
2746        btrfs_node_key_to_cpu(node->eb, &key, 0);
2747        return do_relocation(trans, rc, node, &key, path, 0);
2748}
2749
2750static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2751                                struct reloc_control *rc,
2752                                struct btrfs_path *path, int err)
2753{
2754        LIST_HEAD(list);
2755        struct backref_cache *cache = &rc->backref_cache;
2756        struct backref_node *node;
2757        int level;
2758        int ret;
2759
2760        for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2761                while (!list_empty(&cache->pending[level])) {
2762                        node = list_entry(cache->pending[level].next,
2763                                          struct backref_node, list);
2764                        list_move_tail(&node->list, &list);
2765                        BUG_ON(!node->pending);
2766
2767                        if (!err) {
2768                                ret = link_to_upper(trans, rc, node, path);
2769                                if (ret < 0)
2770                                        err = ret;
2771                        }
2772                }
2773                list_splice_init(&list, &cache->pending[level]);
2774        }
2775        return err;
2776}
2777
2778static void mark_block_processed(struct reloc_control *rc,
2779                                 u64 bytenr, u32 blocksize)
2780{
2781        set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2782                        EXTENT_DIRTY, GFP_NOFS);
2783}
2784
2785static void __mark_block_processed(struct reloc_control *rc,
2786                                   struct backref_node *node)
2787{
2788        u32 blocksize;
2789        if (node->level == 0 ||
2790            in_block_group(node->bytenr, rc->block_group)) {
2791                blocksize = btrfs_level_size(rc->extent_root, node->level);
2792                mark_block_processed(rc, node->bytenr, blocksize);
2793        }
2794        node->processed = 1;
2795}
2796
2797/*
2798 * mark a block and all blocks directly/indirectly reference the block
2799 * as processed.
2800 */
2801static void update_processed_blocks(struct reloc_control *rc,
2802                                    struct backref_node *node)
2803{
2804        struct backref_node *next = node;
2805        struct backref_edge *edge;
2806        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2807        int index = 0;
2808
2809        while (next) {
2810                cond_resched();
2811                while (1) {
2812                        if (next->processed)
2813                                break;
2814
2815                        __mark_block_processed(rc, next);
2816
2817                        if (list_empty(&next->upper))
2818                                break;
2819
2820                        edge = list_entry(next->upper.next,
2821                                          struct backref_edge, list[LOWER]);
2822                        edges[index++] = edge;
2823                        next = edge->node[UPPER];
2824                }
2825                next = walk_down_backref(edges, &index);
2826        }
2827}
2828
2829static int tree_block_processed(u64 bytenr, u32 blocksize,
2830                                struct reloc_control *rc)
2831{
2832        if (test_range_bit(&rc->processed_blocks, bytenr,
2833                           bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2834                return 1;
2835        return 0;
2836}
2837
2838static int get_tree_block_key(struct reloc_control *rc,
2839                              struct tree_block *block)
2840{
2841        struct extent_buffer *eb;
2842
2843        BUG_ON(block->key_ready);
2844        eb = read_tree_block(rc->extent_root, block->bytenr,
2845                             block->key.objectid, block->key.offset);
2846        if (!eb || !extent_buffer_uptodate(eb)) {
2847                free_extent_buffer(eb);
2848                return -EIO;
2849        }
2850        WARN_ON(btrfs_header_level(eb) != block->level);
2851        if (block->level == 0)
2852                btrfs_item_key_to_cpu(eb, &block->key, 0);
2853        else
2854                btrfs_node_key_to_cpu(eb, &block->key, 0);
2855        free_extent_buffer(eb);
2856        block->key_ready = 1;
2857        return 0;
2858}
2859
2860static int reada_tree_block(struct reloc_control *rc,
2861                            struct tree_block *block)
2862{
2863        BUG_ON(block->key_ready);
2864        if (block->key.type == BTRFS_METADATA_ITEM_KEY)
2865                readahead_tree_block(rc->extent_root, block->bytenr,
2866                                     block->key.objectid,
2867                                     rc->extent_root->leafsize);
2868        else
2869                readahead_tree_block(rc->extent_root, block->bytenr,
2870                                     block->key.objectid, block->key.offset);
2871        return 0;
2872}
2873
2874/*
2875 * helper function to relocate a tree block
2876 */
2877static int relocate_tree_block(struct btrfs_trans_handle *trans,
2878                                struct reloc_control *rc,
2879                                struct backref_node *node,
2880                                struct btrfs_key *key,
2881                                struct btrfs_path *path)
2882{
2883        struct btrfs_root *root;
2884        int ret = 0;
2885
2886        if (!node)
2887                return 0;
2888
2889        BUG_ON(node->processed);
2890        root = select_one_root(trans, node);
2891        if (root == ERR_PTR(-ENOENT)) {
2892                update_processed_blocks(rc, node);
2893                goto out;
2894        }
2895
2896        if (!root || root->ref_cows) {
2897                ret = reserve_metadata_space(trans, rc, node);
2898                if (ret)
2899                        goto out;
2900        }
2901
2902        if (root) {
2903                if (root->ref_cows) {
2904                        BUG_ON(node->new_bytenr);
2905                        BUG_ON(!list_empty(&node->list));
2906                        btrfs_record_root_in_trans(trans, root);
2907                        root = root->reloc_root;
2908                        node->new_bytenr = root->node->start;
2909                        node->root = root;
2910                        list_add_tail(&node->list, &rc->backref_cache.changed);
2911                } else {
2912                        path->lowest_level = node->level;
2913                        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2914                        btrfs_release_path(path);
2915                        if (ret > 0)
2916                                ret = 0;
2917                }
2918                if (!ret)
2919                        update_processed_blocks(rc, node);
2920        } else {
2921                ret = do_relocation(trans, rc, node, key, path, 1);
2922        }
2923out:
2924        if (ret || node->level == 0 || node->cowonly)
2925                remove_backref_node(&rc->backref_cache, node);
2926        return ret;
2927}
2928
2929/*
2930 * relocate a list of blocks
2931 */
2932static noinline_for_stack
2933int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2934                         struct reloc_control *rc, struct rb_root *blocks)
2935{
2936        struct backref_node *node;
2937        struct btrfs_path *path;
2938        struct tree_block *block;
2939        struct rb_node *rb_node;
2940        int ret;
2941        int err = 0;
2942
2943        path = btrfs_alloc_path();
2944        if (!path) {
2945                err = -ENOMEM;
2946                goto out_free_blocks;
2947        }
2948
2949        rb_node = rb_first(blocks);
2950        while (rb_node) {
2951                block = rb_entry(rb_node, struct tree_block, rb_node);
2952                if (!block->key_ready)
2953                        reada_tree_block(rc, block);
2954                rb_node = rb_next(rb_node);
2955        }
2956
2957        rb_node = rb_first(blocks);
2958        while (rb_node) {
2959                block = rb_entry(rb_node, struct tree_block, rb_node);
2960                if (!block->key_ready) {
2961                        err = get_tree_block_key(rc, block);
2962                        if (err)
2963                                goto out_free_path;
2964                }
2965                rb_node = rb_next(rb_node);
2966        }
2967
2968        rb_node = rb_first(blocks);
2969        while (rb_node) {
2970                block = rb_entry(rb_node, struct tree_block, rb_node);
2971
2972                node = build_backref_tree(rc, &block->key,
2973                                          block->level, block->bytenr);
2974                if (IS_ERR(node)) {
2975                        err = PTR_ERR(node);
2976                        goto out;
2977                }
2978
2979                ret = relocate_tree_block(trans, rc, node, &block->key,
2980                                          path);
2981                if (ret < 0) {
2982                        if (ret != -EAGAIN || rb_node == rb_first(blocks))
2983                                err = ret;
2984                        goto out;
2985                }
2986                rb_node = rb_next(rb_node);
2987        }
2988out:
2989        err = finish_pending_nodes(trans, rc, path, err);
2990
2991out_free_path:
2992        btrfs_free_path(path);
2993out_free_blocks:
2994        free_block_list(blocks);
2995        return err;
2996}
2997
2998static noinline_for_stack
2999int prealloc_file_extent_cluster(struct inode *inode,
3000                                 struct file_extent_cluster *cluster)
3001{
3002        u64 alloc_hint = 0;
3003        u64 start;
3004        u64 end;
3005        u64 offset = BTRFS_I(inode)->index_cnt;
3006        u64 num_bytes;
3007        int nr = 0;
3008        int ret = 0;
3009
3010        BUG_ON(cluster->start != cluster->boundary[0]);
3011        mutex_lock(&inode->i_mutex);
3012
3013        ret = btrfs_check_data_free_space(inode, cluster->end +
3014                                          1 - cluster->start);
3015        if (ret)
3016                goto out;
3017
3018        while (nr < cluster->nr) {
3019                start = cluster->boundary[nr] - offset;
3020                if (nr + 1 < cluster->nr)
3021                        end = cluster->boundary[nr + 1] - 1 - offset;
3022                else
3023                        end = cluster->end - offset;
3024
3025                lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3026                num_bytes = end + 1 - start;
3027                ret = btrfs_prealloc_file_range(inode, 0, start,
3028                                                num_bytes, num_bytes,
3029                                                end + 1, &alloc_hint);
3030                unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3031                if (ret)
3032                        break;
3033                nr++;
3034        }
3035        btrfs_free_reserved_data_space(inode, cluster->end +
3036                                       1 - cluster->start);
3037out:
3038        mutex_unlock(&inode->i_mutex);
3039        return ret;
3040}
3041
3042static noinline_for_stack
3043int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
3044                         u64 block_start)
3045{
3046        struct btrfs_root *root = BTRFS_I(inode)->root;
3047        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3048        struct extent_map *em;
3049        int ret = 0;
3050
3051        em = alloc_extent_map();
3052        if (!em)
3053                return -ENOMEM;
3054
3055        em->start = start;
3056        em->len = end + 1 - start;
3057        em->block_len = em->len;
3058        em->block_start = block_start;
3059        em->bdev = root->fs_info->fs_devices->latest_bdev;
3060        set_bit(EXTENT_FLAG_PINNED, &em->flags);
3061
3062        lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3063        while (1) {
3064                write_lock(&em_tree->lock);
3065                ret = add_extent_mapping(em_tree, em, 0);
3066                write_unlock(&em_tree->lock);
3067                if (ret != -EEXIST) {
3068                        free_extent_map(em);
3069                        break;
3070                }
3071                btrfs_drop_extent_cache(inode, start, end, 0);
3072        }
3073        unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3074        return ret;
3075}
3076
3077static int relocate_file_extent_cluster(struct inode *inode,
3078                                        struct file_extent_cluster *cluster)
3079{
3080        u64 page_start;
3081        u64 page_end;
3082        u64 offset = BTRFS_I(inode)->index_cnt;
3083        unsigned long index;
3084        unsigned long last_index;
3085        struct page *page;
3086        struct file_ra_state *ra;
3087        gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
3088        int nr = 0;
3089        int ret = 0;
3090
3091        if (!cluster->nr)
3092                return 0;
3093
3094        ra = kzalloc(sizeof(*ra), GFP_NOFS);
3095        if (!ra)
3096                return -ENOMEM;
3097
3098        ret = prealloc_file_extent_cluster(inode, cluster);
3099        if (ret)
3100                goto out;
3101
3102        file_ra_state_init(ra, inode->i_mapping);
3103
3104        ret = setup_extent_mapping(inode, cluster->start - offset,
3105                                   cluster->end - offset, cluster->start);
3106        if (ret)
3107                goto out;
3108
3109        index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
3110        last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
3111        while (index <= last_index) {
3112                ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
3113                if (ret)
3114                        goto out;
3115
3116                page = find_lock_page(inode->i_mapping, index);
3117                if (!page) {
3118                        page_cache_sync_readahead(inode->i_mapping,
3119                                                  ra, NULL, index,
3120                                                  last_index + 1 - index);
3121                        page = find_or_create_page(inode->i_mapping, index,
3122                                                   mask);
3123                        if (!page) {
3124                                btrfs_delalloc_release_metadata(inode,
3125                                                        PAGE_CACHE_SIZE);
3126                                ret = -ENOMEM;
3127                                goto out;
3128                        }
3129                }
3130
3131                if (PageReadahead(page)) {
3132                        page_cache_async_readahead(inode->i_mapping,
3133                                                   ra, NULL, page, index,
3134                                                   last_index + 1 - index);
3135                }
3136
3137                if (!PageUptodate(page)) {
3138                        btrfs_readpage(NULL, page);
3139                        lock_page(page);
3140                        if (!PageUptodate(page)) {
3141                                unlock_page(page);
3142                                page_cache_release(page);
3143                                btrfs_delalloc_release_metadata(inode,
3144                                                        PAGE_CACHE_SIZE);
3145                                ret = -EIO;
3146                                goto out;
3147                        }
3148                }
3149
3150                page_start = page_offset(page);
3151                page_end = page_start + PAGE_CACHE_SIZE - 1;
3152
3153                lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
3154
3155                set_page_extent_mapped(page);
3156
3157                if (nr < cluster->nr &&
3158                    page_start + offset == cluster->boundary[nr]) {
3159                        set_extent_bits(&BTRFS_I(inode)->io_tree,
3160                                        page_start, page_end,
3161                                        EXTENT_BOUNDARY, GFP_NOFS);
3162                        nr++;
3163                }
3164
3165                btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
3166                set_page_dirty(page);
3167
3168                unlock_extent(&BTRFS_I(inode)->io_tree,
3169                              page_start, page_end);
3170                unlock_page(page);
3171                page_cache_release(page);
3172
3173                index++;
3174                balance_dirty_pages_ratelimited(inode->i_mapping);
3175                btrfs_throttle(BTRFS_I(inode)->root);
3176        }
3177        WARN_ON(nr != cluster->nr);
3178out:
3179        kfree(ra);
3180        return ret;
3181}
3182
3183static noinline_for_stack
3184int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
3185                         struct file_extent_cluster *cluster)
3186{
3187        int ret;
3188
3189        if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3190                ret = relocate_file_extent_cluster(inode, cluster);
3191                if (ret)
3192                        return ret;
3193                cluster->nr = 0;
3194        }
3195
3196        if (!cluster->nr)
3197                cluster->start = extent_key->objectid;
3198        else
3199                BUG_ON(cluster->nr >= MAX_EXTENTS);
3200        cluster->end = extent_key->objectid + extent_key->offset - 1;
3201        cluster->boundary[cluster->nr] = extent_key->objectid;
3202        cluster->nr++;
3203
3204        if (cluster->nr >= MAX_EXTENTS) {
3205                ret = relocate_file_extent_cluster(inode, cluster);
3206                if (ret)
3207                        return ret;
3208                cluster->nr = 0;
3209        }
3210        return 0;
3211}
3212
3213#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3214static int get_ref_objectid_v0(struct reloc_control *rc,
3215                               struct btrfs_path *path,
3216                               struct btrfs_key *extent_key,
3217                               u64 *ref_objectid, int *path_change)
3218{
3219        struct btrfs_key key;
3220        struct extent_buffer *leaf;
3221        struct btrfs_extent_ref_v0 *ref0;
3222        int ret;
3223        int slot;
3224
3225        leaf = path->nodes[0];
3226        slot = path->slots[0];
3227        while (1) {
3228                if (slot >= btrfs_header_nritems(leaf)) {
3229                        ret = btrfs_next_leaf(rc->extent_root, path);
3230                        if (ret < 0)
3231                                return ret;
3232                        BUG_ON(ret > 0);
3233                        leaf = path->nodes[0];
3234                        slot = path->slots[0];
3235                        if (path_change)
3236                                *path_change = 1;
3237                }
3238                btrfs_item_key_to_cpu(leaf, &key, slot);
3239                if (key.objectid != extent_key->objectid)
3240                        return -ENOENT;
3241
3242                if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
3243                        slot++;
3244                        continue;
3245                }
3246                ref0 = btrfs_item_ptr(leaf, slot,
3247                                struct btrfs_extent_ref_v0);
3248                *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
3249                break;
3250        }
3251        return 0;
3252}
3253#endif
3254
3255/*
3256 * helper to add a tree block to the list.
3257 * the major work is getting the generation and level of the block
3258 */
3259static int add_tree_block(struct reloc_control *rc,
3260                          struct btrfs_key *extent_key,
3261                          struct btrfs_path *path,
3262                          struct rb_root *blocks)
3263{
3264        struct extent_buffer *eb;
3265        struct btrfs_extent_item *ei;
3266        struct btrfs_tree_block_info *bi;
3267        struct tree_block *block;
3268        struct rb_node *rb_node;
3269        u32 item_size;
3270        int level = -1;
3271        u64 generation;
3272
3273        eb =  path->nodes[0];
3274        item_size = btrfs_item_size_nr(eb, path->slots[0]);
3275
3276        if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3277            item_size >= sizeof(*ei) + sizeof(*bi)) {
3278                ei = btrfs_item_ptr(eb, path->slots[0],
3279                                struct btrfs_extent_item);
3280                if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3281                        bi = (struct btrfs_tree_block_info *)(ei + 1);
3282                        level = btrfs_tree_block_level(eb, bi);
3283                } else {
3284                        level = (int)extent_key->offset;
3285                }
3286                generation = btrfs_extent_generation(eb, ei);
3287        } else {
3288#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3289                u64 ref_owner;
3290                int ret;
3291
3292                BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3293                ret = get_ref_objectid_v0(rc, path, extent_key,
3294                                          &ref_owner, NULL);
3295                if (ret < 0)
3296                        return ret;
3297                BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
3298                level = (int)ref_owner;
3299                /* FIXME: get real generation */
3300                generation = 0;
3301#else
3302                BUG();
3303#endif
3304        }
3305
3306        btrfs_release_path(path);
3307
3308        BUG_ON(level == -1);
3309
3310        block = kmalloc(sizeof(*block), GFP_NOFS);
3311        if (!block)
3312                return -ENOMEM;
3313
3314        block->bytenr = extent_key->objectid;
3315        block->key.objectid = rc->extent_root->leafsize;
3316        block->key.offset = generation;
3317        block->level = level;
3318        block->key_ready = 0;
3319
3320        rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
3321        if (rb_node)
3322                backref_tree_panic(rb_node, -EEXIST, block->bytenr);
3323
3324        return 0;
3325}
3326
3327/*
3328 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3329 */
3330static int __add_tree_block(struct reloc_control *rc,
3331                            u64 bytenr, u32 blocksize,
3332                            struct rb_root *blocks)
3333{
3334        struct btrfs_path *path;
3335        struct btrfs_key key;
3336        int ret;
3337        bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
3338                                        SKINNY_METADATA);
3339
3340        if (tree_block_processed(bytenr, blocksize, rc))
3341                return 0;
3342
3343        if (tree_search(blocks, bytenr))
3344                return 0;
3345
3346        path = btrfs_alloc_path();
3347        if (!path)
3348                return -ENOMEM;
3349again:
3350        key.objectid = bytenr;
3351        if (skinny) {
3352                key.type = BTRFS_METADATA_ITEM_KEY;
3353                key.offset = (u64)-1;
3354        } else {
3355                key.type = BTRFS_EXTENT_ITEM_KEY;
3356                key.offset = blocksize;
3357        }
3358
3359        path->search_commit_root = 1;
3360        path->skip_locking = 1;
3361        ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3362        if (ret < 0)
3363                goto out;
3364
3365        if (ret > 0 && skinny) {
3366                if (path->slots[0]) {
3367                        path->slots[0]--;
3368                        btrfs_item_key_to_cpu(path->nodes[0], &key,
3369                                              path->slots[0]);
3370                        if (key.objectid == bytenr &&
3371                            (key.type == BTRFS_METADATA_ITEM_KEY ||
3372                             (key.type == BTRFS_EXTENT_ITEM_KEY &&
3373                              key.offset == blocksize)))
3374                                ret = 0;
3375                }
3376
3377                if (ret) {
3378                        skinny = false;
3379                        btrfs_release_path(path);
3380                        goto again;
3381                }
3382        }
3383        BUG_ON(ret);
3384
3385        ret = add_tree_block(rc, &key, path, blocks);
3386out:
3387        btrfs_free_path(path);
3388        return ret;
3389}
3390
3391/*
3392 * helper to check if the block use full backrefs for pointers in it
3393 */
3394static int block_use_full_backref(struct reloc_control *rc,
3395                                  struct extent_buffer *eb)
3396{
3397        u64 flags;
3398        int ret;
3399
3400        if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
3401            btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
3402                return 1;
3403
3404        ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
3405                                       eb->start, btrfs_header_level(eb), 1,
3406                                       NULL, &flags);
3407        BUG_ON(ret);
3408
3409        if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
3410                ret = 1;
3411        else
3412                ret = 0;
3413        return ret;
3414}
3415
3416static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3417                                    struct inode *inode, u64 ino)
3418{
3419        struct btrfs_key key;
3420        struct btrfs_root *root = fs_info->tree_root;
3421        struct btrfs_trans_handle *trans;
3422        int ret = 0;
3423
3424        if (inode)
3425                goto truncate;
3426
3427        key.objectid = ino;
3428        key.type = BTRFS_INODE_ITEM_KEY;
3429        key.offset = 0;
3430
3431        inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3432        if (IS_ERR(inode) || is_bad_inode(inode)) {
3433                if (!IS_ERR(inode))
3434                        iput(inode);
3435                return -ENOENT;
3436        }
3437
3438truncate:
3439        ret = btrfs_check_trunc_cache_free_space(root,
3440                                                 &fs_info->global_block_rsv);
3441        if (ret)
3442                goto out;
3443
3444        trans = btrfs_join_transaction(root);
3445        if (IS_ERR(trans)) {
3446                ret = PTR_ERR(trans);
3447                goto out;
3448        }
3449
3450        ret = btrfs_truncate_free_space_cache(root, trans, inode);
3451
3452        btrfs_end_transaction(trans, root);
3453        btrfs_btree_balance_dirty(root);
3454out:
3455        iput(inode);
3456        return ret;
3457}
3458
3459/*
3460 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
3461 * this function scans fs tree to find blocks reference the data extent
3462 */
3463static int find_data_references(struct reloc_control *rc,
3464                                struct btrfs_key *extent_key,
3465                                struct extent_buffer *leaf,
3466                                struct btrfs_extent_data_ref *ref,
3467                                struct rb_root *blocks)
3468{
3469        struct btrfs_path *path;
3470        struct tree_block *block;
3471        struct btrfs_root *root;
3472        struct btrfs_file_extent_item *fi;
3473        struct rb_node *rb_node;
3474        struct btrfs_key key;
3475        u64 ref_root;
3476        u64 ref_objectid;
3477        u64 ref_offset;
3478        u32 ref_count;
3479        u32 nritems;
3480        int err = 0;
3481        int added = 0;
3482        int counted;
3483        int ret;
3484
3485        ref_root = btrfs_extent_data_ref_root(leaf, ref);
3486        ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
3487        ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
3488        ref_count = btrfs_extent_data_ref_count(leaf, ref);
3489
3490        /*
3491         * This is an extent belonging to the free space cache, lets just delete
3492         * it and redo the search.
3493         */
3494        if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3495                ret = delete_block_group_cache(rc->extent_root->fs_info,
3496                                               NULL, ref_objectid);
3497                if (ret != -ENOENT)
3498                        return ret;
3499                ret = 0;
3500        }
3501
3502        path = btrfs_alloc_path();
3503        if (!path)
3504                return -ENOMEM;
3505        path->reada = 1;
3506
3507        root = read_fs_root(rc->extent_root->fs_info, ref_root);
3508        if (IS_ERR(root)) {
3509                err = PTR_ERR(root);
3510                goto out;
3511        }
3512
3513        key.objectid = ref_objectid;
3514        key.type = BTRFS_EXTENT_DATA_KEY;
3515        if (ref_offset > ((u64)-1 << 32))
3516                key.offset = 0;
3517        else
3518                key.offset = ref_offset;
3519
3520        path->search_commit_root = 1;
3521        path->skip_locking = 1;
3522        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3523        if (ret < 0) {
3524                err = ret;
3525                goto out;
3526        }
3527
3528        leaf = path->nodes[0];
3529        nritems = btrfs_header_nritems(leaf);
3530        /*
3531         * the references in tree blocks that use full backrefs
3532         * are not counted in
3533         */
3534        if (block_use_full_backref(rc, leaf))
3535                counted = 0;
3536        else
3537                counted = 1;
3538        rb_node = tree_search(blocks, leaf->start);
3539        if (rb_node) {
3540                if (counted)
3541                        added = 1;
3542                else
3543                        path->slots[0] = nritems;
3544        }
3545
3546        while (ref_count > 0) {
3547                while (path->slots[0] >= nritems) {
3548                        ret = btrfs_next_leaf(root, path);
3549                        if (ret < 0) {
3550                                err = ret;
3551                                goto out;
3552                        }
3553                        if (WARN_ON(ret > 0))
3554                                goto out;
3555
3556                        leaf = path->nodes[0];
3557                        nritems = btrfs_header_nritems(leaf);
3558                        added = 0;
3559
3560                        if (block_use_full_backref(rc, leaf))
3561                                counted = 0;
3562                        else
3563                                counted = 1;
3564                        rb_node = tree_search(blocks, leaf->start);
3565                        if (rb_node) {
3566                                if (counted)
3567                                        added = 1;
3568                                else
3569                                        path->slots[0] = nritems;
3570                        }
3571                }
3572
3573                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3574                if (WARN_ON(key.objectid != ref_objectid ||
3575                    key.type != BTRFS_EXTENT_DATA_KEY))
3576                        break;
3577
3578                fi = btrfs_item_ptr(leaf, path->slots[0],
3579                                    struct btrfs_file_extent_item);
3580
3581                if (btrfs_file_extent_type(leaf, fi) ==
3582                    BTRFS_FILE_EXTENT_INLINE)
3583                        goto next;
3584
3585                if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3586                    extent_key->objectid)
3587                        goto next;
3588
3589                key.offset -= btrfs_file_extent_offset(leaf, fi);
3590                if (key.offset != ref_offset)
3591                        goto next;
3592
3593                if (counted)
3594                        ref_count--;
3595                if (added)
3596                        goto next;
3597
3598                if (!tree_block_processed(leaf->start, leaf->len, rc)) {
3599                        block = kmalloc(sizeof(*block), GFP_NOFS);
3600                        if (!block) {
3601                                err = -ENOMEM;
3602                                break;
3603                        }
3604                        block->bytenr = leaf->start;
3605                        btrfs_item_key_to_cpu(leaf, &block->key, 0);
3606                        block->level = 0;
3607                        block->key_ready = 1;
3608                        rb_node = tree_insert(blocks, block->bytenr,
3609                                              &block->rb_node);
3610                        if (rb_node)
3611                                backref_tree_panic(rb_node, -EEXIST,
3612                                                   block->bytenr);
3613                }
3614                if (counted)
3615                        added = 1;
3616                else
3617                        path->slots[0] = nritems;
3618next:
3619                path->slots[0]++;
3620
3621        }
3622out:
3623        btrfs_free_path(path);
3624        return err;
3625}
3626
3627/*
3628 * helper to find all tree blocks that reference a given data extent
3629 */
3630static noinline_for_stack
3631int add_data_references(struct reloc_control *rc,
3632                        struct btrfs_key *extent_key,
3633                        struct btrfs_path *path,
3634                        struct rb_root *blocks)
3635{
3636        struct btrfs_key key;
3637        struct extent_buffer *eb;
3638        struct btrfs_extent_data_ref *dref;
3639        struct btrfs_extent_inline_ref *iref;
3640        unsigned long ptr;
3641        unsigned long end;
3642        u32 blocksize = btrfs_level_size(rc->extent_root, 0);
3643        int ret = 0;
3644        int err = 0;
3645
3646        eb = path->nodes[0];
3647        ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3648        end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3649#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3650        if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3651                ptr = end;
3652        else
3653#endif
3654                ptr += sizeof(struct btrfs_extent_item);
3655
3656        while (ptr < end) {
3657                iref = (struct btrfs_extent_inline_ref *)ptr;
3658                key.type = btrfs_extent_inline_ref_type(eb, iref);
3659                if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3660                        key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3661                        ret = __add_tree_block(rc, key.offset, blocksize,
3662                                               blocks);
3663                } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3664                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3665                        ret = find_data_references(rc, extent_key,
3666                                                   eb, dref, blocks);
3667                } else {
3668                        BUG();
3669                }
3670                if (ret) {
3671                        err = ret;
3672                        goto out;
3673                }
3674                ptr += btrfs_extent_inline_ref_size(key.type);
3675        }
3676        WARN_ON(ptr > end);
3677
3678        while (1) {
3679                cond_resched();
3680                eb = path->nodes[0];
3681                if (path->slots[0] >= btrfs_header_nritems(eb)) {
3682                        ret = btrfs_next_leaf(rc->extent_root, path);
3683                        if (ret < 0) {
3684                                err = ret;
3685                                break;
3686                        }
3687                        if (ret > 0)
3688                                break;
3689                        eb = path->nodes[0];
3690                }
3691
3692                btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3693                if (key.objectid != extent_key->objectid)
3694                        break;
3695
3696#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3697                if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3698                    key.type == BTRFS_EXTENT_REF_V0_KEY) {
3699#else
3700                BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3701                if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3702#endif
3703                        ret = __add_tree_block(rc, key.offset, blocksize,
3704                                               blocks);
3705                } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3706                        dref = btrfs_item_ptr(eb, path->slots[0],
3707                                              struct btrfs_extent_data_ref);
3708                        ret = find_data_references(rc, extent_key,
3709                                                   eb, dref, blocks);
3710                } else {
3711                        ret = 0;
3712                }
3713                if (ret) {
3714                        err = ret;
3715                        break;
3716                }
3717                path->slots[0]++;
3718        }
3719out:
3720        btrfs_release_path(path);
3721        if (err)
3722                free_block_list(blocks);
3723        return err;
3724}
3725
3726/*
3727 * helper to find next unprocessed extent
3728 */
3729static noinline_for_stack
3730int find_next_extent(struct btrfs_trans_handle *trans,
3731                     struct reloc_control *rc, struct btrfs_path *path,
3732                     struct btrfs_key *extent_key)
3733{
3734        struct btrfs_key key;
3735        struct extent_buffer *leaf;
3736        u64 start, end, last;
3737        int ret;
3738
3739        last = rc->block_group->key.objectid + rc->block_group->key.offset;
3740        while (1) {
3741                cond_resched();
3742                if (rc->search_start >= last) {
3743                        ret = 1;
3744                        break;
3745                }
3746
3747                key.objectid = rc->search_start;
3748                key.type = BTRFS_EXTENT_ITEM_KEY;
3749                key.offset = 0;
3750
3751                path->search_commit_root = 1;
3752                path->skip_locking = 1;
3753                ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3754                                        0, 0);
3755                if (ret < 0)
3756                        break;
3757next:
3758                leaf = path->nodes[0];
3759                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3760                        ret = btrfs_next_leaf(rc->extent_root, path);
3761                        if (ret != 0)
3762                                break;
3763                        leaf = path->nodes[0];
3764                }
3765
3766                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3767                if (key.objectid >= last) {
3768                        ret = 1;
3769                        break;
3770                }
3771
3772                if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3773                    key.type != BTRFS_METADATA_ITEM_KEY) {
3774                        path->slots[0]++;
3775                        goto next;
3776                }
3777
3778                if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3779                    key.objectid + key.offset <= rc->search_start) {
3780                        path->slots[0]++;
3781                        goto next;
3782                }
3783
3784                if (key.type == BTRFS_METADATA_ITEM_KEY &&
3785                    key.objectid + rc->extent_root->leafsize <=
3786                    rc->search_start) {
3787                        path->slots[0]++;
3788                        goto next;
3789                }
3790
3791                ret = find_first_extent_bit(&rc->processed_blocks,
3792                                            key.objectid, &start, &end,
3793                                            EXTENT_DIRTY, NULL);
3794
3795                if (ret == 0 && start <= key.objectid) {
3796                        btrfs_release_path(path);
3797                        rc->search_start = end + 1;
3798                } else {
3799                        if (key.type == BTRFS_EXTENT_ITEM_KEY)
3800                                rc->search_start = key.objectid + key.offset;
3801                        else
3802                                rc->search_start = key.objectid +
3803                                        rc->extent_root->leafsize;
3804                        memcpy(extent_key, &key, sizeof(key));
3805                        return 0;
3806                }
3807        }
3808        btrfs_release_path(path);
3809        return ret;
3810}
3811
3812static void set_reloc_control(struct reloc_control *rc)
3813{
3814        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3815
3816        mutex_lock(&fs_info->reloc_mutex);
3817        fs_info->reloc_ctl = rc;
3818        mutex_unlock(&fs_info->reloc_mutex);
3819}
3820
3821static void unset_reloc_control(struct reloc_control *rc)
3822{
3823        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3824
3825        mutex_lock(&fs_info->reloc_mutex);
3826        fs_info->reloc_ctl = NULL;
3827        mutex_unlock(&fs_info->reloc_mutex);
3828}
3829
3830static int check_extent_flags(u64 flags)
3831{
3832        if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3833            (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3834                return 1;
3835        if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3836            !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3837                return 1;
3838        if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3839            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3840                return 1;
3841        return 0;
3842}
3843
3844static noinline_for_stack
3845int prepare_to_relocate(struct reloc_control *rc)
3846{
3847        struct btrfs_trans_handle *trans;
3848
3849        rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
3850                                              BTRFS_BLOCK_RSV_TEMP);
3851        if (!rc->block_rsv)
3852                return -ENOMEM;
3853
3854        memset(&rc->cluster, 0, sizeof(rc->cluster));
3855        rc->search_start = rc->block_group->key.objectid;
3856        rc->extents_found = 0;
3857        rc->nodes_relocated = 0;
3858        rc->merging_rsv_size = 0;
3859        rc->reserved_bytes = 0;
3860        rc->block_rsv->size = rc->extent_root->nodesize *
3861                              RELOCATION_RESERVED_NODES;
3862
3863        rc->create_reloc_tree = 1;
3864        set_reloc_control(rc);
3865
3866        trans = btrfs_join_transaction(rc->extent_root);
3867        if (IS_ERR(trans)) {
3868                unset_reloc_control(rc);
3869                /*
3870                 * extent tree is not a ref_cow tree and has no reloc_root to
3871                 * cleanup.  And callers are responsible to free the above
3872                 * block rsv.
3873                 */
3874                return PTR_ERR(trans);
3875        }
3876        btrfs_commit_transaction(trans, rc->extent_root);
3877        return 0;
3878}
3879
3880static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3881{
3882        struct rb_root blocks = RB_ROOT;
3883        struct btrfs_key key;
3884        struct btrfs_trans_handle *trans = NULL;
3885        struct btrfs_path *path;
3886        struct btrfs_extent_item *ei;
3887        u64 flags;
3888        u32 item_size;
3889        int ret;
3890        int err = 0;
3891        int progress = 0;
3892
3893        path = btrfs_alloc_path();
3894        if (!path)
3895                return -ENOMEM;
3896        path->reada = 1;
3897
3898        ret = prepare_to_relocate(rc);
3899        if (ret) {
3900                err = ret;
3901                goto out_free;
3902        }
3903
3904        while (1) {
3905                rc->reserved_bytes = 0;
3906                ret = btrfs_block_rsv_refill(rc->extent_root,
3907                                        rc->block_rsv, rc->block_rsv->size,
3908                                        BTRFS_RESERVE_FLUSH_ALL);
3909                if (ret) {
3910                        err = ret;
3911                        break;
3912                }
3913                progress++;
3914                trans = btrfs_start_transaction(rc->extent_root, 0);
3915                if (IS_ERR(trans)) {
3916                        err = PTR_ERR(trans);
3917                        trans = NULL;
3918                        break;
3919                }
3920restart:
3921                if (update_backref_cache(trans, &rc->backref_cache)) {
3922                        btrfs_end_transaction(trans, rc->extent_root);
3923                        continue;
3924                }
3925
3926                ret = find_next_extent(trans, rc, path, &key);
3927                if (ret < 0)
3928                        err = ret;
3929                if (ret != 0)
3930                        break;
3931
3932                rc->extents_found++;
3933
3934                ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3935                                    struct btrfs_extent_item);
3936                item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
3937                if (item_size >= sizeof(*ei)) {
3938                        flags = btrfs_extent_flags(path->nodes[0], ei);
3939                        ret = check_extent_flags(flags);
3940                        BUG_ON(ret);
3941
3942                } else {
3943#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3944                        u64 ref_owner;
3945                        int path_change = 0;
3946
3947                        BUG_ON(item_size !=
3948                               sizeof(struct btrfs_extent_item_v0));
3949                        ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
3950                                                  &path_change);
3951                        if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
3952                                flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
3953                        else
3954                                flags = BTRFS_EXTENT_FLAG_DATA;
3955
3956                        if (path_change) {
3957                                btrfs_release_path(path);
3958
3959                                path->search_commit_root = 1;
3960                                path->skip_locking = 1;
3961                                ret = btrfs_search_slot(NULL, rc->extent_root,
3962                                                        &key, path, 0, 0);
3963                                if (ret < 0) {
3964                                        err = ret;
3965                                        break;
3966                                }
3967                                BUG_ON(ret > 0);
3968                        }
3969#else
3970                        BUG();
3971#endif
3972                }
3973
3974                if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3975                        ret = add_tree_block(rc, &key, path, &blocks);
3976                } else if (rc->stage == UPDATE_DATA_PTRS &&
3977                           (flags & BTRFS_EXTENT_FLAG_DATA)) {
3978                        ret = add_data_references(rc, &key, path, &blocks);
3979                } else {
3980                        btrfs_release_path(path);
3981                        ret = 0;
3982                }
3983                if (ret < 0) {
3984                        err = ret;
3985                        break;
3986                }
3987
3988                if (!RB_EMPTY_ROOT(&blocks)) {
3989                        ret = relocate_tree_blocks(trans, rc, &blocks);
3990                        if (ret < 0) {
3991                                /*
3992                                 * if we fail to relocate tree blocks, force to update
3993                                 * backref cache when committing transaction.
3994                                 */
3995                                rc->backref_cache.last_trans = trans->transid - 1;
3996
3997                                if (ret != -EAGAIN) {
3998                                        err = ret;
3999                                        break;
4000                                }
4001                                rc->extents_found--;
4002                                rc->search_start = key.objectid;
4003                        }
4004                }
4005
4006                btrfs_end_transaction_throttle(trans, rc->extent_root);
4007                btrfs_btree_balance_dirty(rc->extent_root);
4008                trans = NULL;
4009
4010                if (rc->stage == MOVE_DATA_EXTENTS &&
4011                    (flags & BTRFS_EXTENT_FLAG_DATA)) {
4012                        rc->found_file_extent = 1;
4013                        ret = relocate_data_extent(rc->data_inode,
4014                                                   &key, &rc->cluster);
4015                        if (ret < 0) {
4016                                err = ret;
4017                                break;
4018                        }
4019                }
4020        }
4021        if (trans && progress && err == -ENOSPC) {
4022                ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
4023                                              rc->block_group->flags);
4024                if (ret == 0) {
4025                        err = 0;
4026                        progress = 0;
4027                        goto restart;
4028                }
4029        }
4030
4031        btrfs_release_path(path);
4032        clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
4033                          GFP_NOFS);
4034
4035        if (trans) {
4036                btrfs_end_transaction_throttle(trans, rc->extent_root);
4037                btrfs_btree_balance_dirty(rc->extent_root);
4038        }
4039
4040        if (!err) {
4041                ret = relocate_file_extent_cluster(rc->data_inode,
4042                                                   &rc->cluster);
4043                if (ret < 0)
4044                        err = ret;
4045        }
4046
4047        rc->create_reloc_tree = 0;
4048        set_reloc_control(rc);
4049
4050        backref_cache_cleanup(&rc->backref_cache);
4051        btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
4052
4053        err = prepare_to_merge(rc, err);
4054
4055        merge_reloc_roots(rc);
4056
4057        rc->merge_reloc_tree = 0;
4058        unset_reloc_control(rc);
4059        btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
4060
4061        /* get rid of pinned extents */
4062        trans = btrfs_join_transaction(rc->extent_root);
4063        if (IS_ERR(trans))
4064                err = PTR_ERR(trans);
4065        else
4066                btrfs_commit_transaction(trans, rc->extent_root);
4067out_free:
4068        btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
4069        btrfs_free_path(path);
4070        return err;
4071}
4072
4073static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
4074                                 struct btrfs_root *root, u64 objectid)
4075{
4076        struct btrfs_path *path;
4077        struct btrfs_inode_item *item;
4078        struct extent_buffer *leaf;
4079        int ret;
4080
4081        path = btrfs_alloc_path();
4082        if (!path)
4083                return -ENOMEM;
4084
4085        ret = btrfs_insert_empty_inode(trans, root, path, objectid);
4086        if (ret)
4087                goto out;
4088
4089        leaf = path->nodes[0];
4090        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
4091        memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
4092        btrfs_set_inode_generation(leaf, item, 1);
4093        btrfs_set_inode_size(leaf, item, 0);
4094        btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
4095        btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
4096                                          BTRFS_INODE_PREALLOC);
4097        btrfs_mark_buffer_dirty(leaf);
4098        btrfs_release_path(path);
4099out:
4100        btrfs_free_path(path);
4101        return ret;
4102}
4103
4104/*
4105 * helper to create inode for data relocation.
4106 * the inode is in data relocation tree and its link count is 0
4107 */
4108static noinline_for_stack
4109struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
4110                                 struct btrfs_block_group_cache *group)
4111{
4112        struct inode *inode = NULL;
4113        struct btrfs_trans_handle *trans;
4114        struct btrfs_root *root;
4115        struct btrfs_key key;
4116        u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
4117        int err = 0;
4118
4119        root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4120        if (IS_ERR(root))
4121                return ERR_CAST(root);
4122
4123        trans = btrfs_start_transaction(root, 6);
4124        if (IS_ERR(trans))
4125                return ERR_CAST(trans);
4126
4127        err = btrfs_find_free_objectid(root, &objectid);
4128        if (err)
4129                goto out;
4130
4131        err = __insert_orphan_inode(trans, root, objectid);
4132        BUG_ON(err);
4133
4134        key.objectid = objectid;
4135        key.type = BTRFS_INODE_ITEM_KEY;
4136        key.offset = 0;
4137        inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
4138        BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
4139        BTRFS_I(inode)->index_cnt = group->key.objectid;
4140
4141        err = btrfs_orphan_add(trans, inode);
4142out:
4143        btrfs_end_transaction(trans, root);
4144        btrfs_btree_balance_dirty(root);
4145        if (err) {
4146                if (inode)
4147                        iput(inode);
4148                inode = ERR_PTR(err);
4149        }
4150        return inode;
4151}
4152
4153static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
4154{
4155        struct reloc_control *rc;
4156
4157        rc = kzalloc(sizeof(*rc), GFP_NOFS);
4158        if (!rc)
4159                return NULL;
4160
4161        INIT_LIST_HEAD(&rc->reloc_roots);
4162        backref_cache_init(&rc->backref_cache);
4163        mapping_tree_init(&rc->reloc_root_tree);
4164        extent_io_tree_init(&rc->processed_blocks,
4165                            fs_info->btree_inode->i_mapping);
4166        return rc;
4167}
4168
4169/*
4170 * function to relocate all extents in a block group.
4171 */
4172int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4173{
4174        struct btrfs_fs_info *fs_info = extent_root->fs_info;
4175        struct reloc_control *rc;
4176        struct inode *inode;
4177        struct btrfs_path *path;
4178        int ret;
4179        int rw = 0;
4180        int err = 0;
4181
4182        rc = alloc_reloc_control(fs_info);
4183        if (!rc)
4184                return -ENOMEM;
4185
4186        rc->extent_root = extent_root;
4187
4188        rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
4189        BUG_ON(!rc->block_group);
4190
4191        if (!rc->block_group->ro) {
4192                ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
4193                if (ret) {
4194                        err = ret;
4195                        goto out;
4196                }
4197                rw = 1;
4198        }
4199
4200        path = btrfs_alloc_path();
4201        if (!path) {
4202                err = -ENOMEM;
4203                goto out;
4204        }
4205
4206        inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
4207                                        path);
4208        btrfs_free_path(path);
4209
4210        if (!IS_ERR(inode))
4211                ret = delete_block_group_cache(fs_info, inode, 0);
4212        else
4213                ret = PTR_ERR(inode);
4214
4215        if (ret && ret != -ENOENT) {
4216                err = ret;
4217                goto out;
4218        }
4219
4220        rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4221        if (IS_ERR(rc->data_inode)) {
4222                err = PTR_ERR(rc->data_inode);
4223                rc->data_inode = NULL;
4224                goto out;
4225        }
4226
4227        btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
4228               rc->block_group->key.objectid, rc->block_group->flags);
4229
4230        ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
4231        if (ret < 0) {
4232                err = ret;
4233                goto out;
4234        }
4235        btrfs_wait_ordered_roots(fs_info, -1);
4236
4237        while (1) {
4238                mutex_lock(&fs_info->cleaner_mutex);
4239                ret = relocate_block_group(rc);
4240                mutex_unlock(&fs_info->cleaner_mutex);
4241                if (ret < 0) {
4242                        err = ret;
4243                        goto out;
4244                }
4245
4246                if (rc->extents_found == 0)
4247                        break;
4248
4249                btrfs_info(extent_root->fs_info, "found %llu extents",
4250                        rc->extents_found);
4251
4252                if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
4253                        ret = btrfs_wait_ordered_range(rc->data_inode, 0,
4254                                                       (u64)-1);
4255                        if (ret) {
4256                                err = ret;
4257                                goto out;
4258                        }
4259                        invalidate_mapping_pages(rc->data_inode->i_mapping,
4260                                                 0, -1);
4261                        rc->stage = UPDATE_DATA_PTRS;
4262                }
4263        }
4264
4265        WARN_ON(rc->block_group->pinned > 0);
4266        WARN_ON(rc->block_group->reserved > 0);
4267        WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
4268out:
4269        if (err && rw)
4270                btrfs_set_block_group_rw(extent_root, rc->block_group);
4271        iput(rc->data_inode);
4272        btrfs_put_block_group(rc->block_group);
4273        kfree(rc);
4274        return err;
4275}
4276
4277static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4278{
4279        struct btrfs_trans_handle *trans;
4280        int ret, err;
4281
4282        trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
4283        if (IS_ERR(trans))
4284                return PTR_ERR(trans);
4285
4286        memset(&root->root_item.drop_progress, 0,
4287                sizeof(root->root_item.drop_progress));
4288        root->root_item.drop_level = 0;
4289        btrfs_set_root_refs(&root->root_item, 0);
4290        ret = btrfs_update_root(trans, root->fs_info->tree_root,
4291                                &root->root_key, &root->root_item);
4292
4293        err = btrfs_end_transaction(trans, root->fs_info->tree_root);
4294        if (err)
4295                return err;
4296        return ret;
4297}
4298
4299/*
4300 * recover relocation interrupted by system crash.
4301 *
4302 * this function resumes merging reloc trees with corresponding fs trees.
4303 * this is important for keeping the sharing of tree blocks
4304 */
4305int btrfs_recover_relocation(struct btrfs_root *root)
4306{
4307        LIST_HEAD(reloc_roots);
4308        struct btrfs_key key;
4309        struct btrfs_root *fs_root;
4310        struct btrfs_root *reloc_root;
4311        struct btrfs_path *path;
4312        struct extent_buffer *leaf;
4313        struct reloc_control *rc = NULL;
4314        struct btrfs_trans_handle *trans;
4315        int ret;
4316        int err = 0;
4317
4318        path = btrfs_alloc_path();
4319        if (!path)
4320                return -ENOMEM;
4321        path->reada = -1;
4322
4323        key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4324        key.type = BTRFS_ROOT_ITEM_KEY;
4325        key.offset = (u64)-1;
4326
4327        while (1) {
4328                ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
4329                                        path, 0, 0);
4330                if (ret < 0) {
4331                        err = ret;
4332                        goto out;
4333                }
4334                if (ret > 0) {
4335                        if (path->slots[0] == 0)
4336                                break;
4337                        path->slots[0]--;
4338                }
4339                leaf = path->nodes[0];
4340                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4341                btrfs_release_path(path);
4342
4343                if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4344                    key.type != BTRFS_ROOT_ITEM_KEY)
4345                        break;
4346
4347                reloc_root = btrfs_read_fs_root(root, &key);
4348                if (IS_ERR(reloc_root)) {
4349                        err = PTR_ERR(reloc_root);
4350                        goto out;
4351                }
4352
4353                list_add(&reloc_root->root_list, &reloc_roots);
4354
4355                if (btrfs_root_refs(&reloc_root->root_item) > 0) {
4356                        fs_root = read_fs_root(root->fs_info,
4357                                               reloc_root->root_key.offset);
4358                        if (IS_ERR(fs_root)) {
4359                                ret = PTR_ERR(fs_root);
4360                                if (ret != -ENOENT) {
4361                                        err = ret;
4362                                        goto out;
4363                                }
4364                                ret = mark_garbage_root(reloc_root);
4365                                if (ret < 0) {
4366                                        err = ret;
4367                                        goto out;
4368                                }
4369                        }
4370                }
4371
4372                if (key.offset == 0)
4373                        break;
4374
4375                key.offset--;
4376        }
4377        btrfs_release_path(path);
4378
4379        if (list_empty(&reloc_roots))
4380                goto out;
4381
4382        rc = alloc_reloc_control(root->fs_info);
4383        if (!rc) {
4384                err = -ENOMEM;
4385                goto out;
4386        }
4387
4388        rc->extent_root = root->fs_info->extent_root;
4389
4390        set_reloc_control(rc);
4391
4392        trans = btrfs_join_transaction(rc->extent_root);
4393        if (IS_ERR(trans)) {
4394                unset_reloc_control(rc);
4395                err = PTR_ERR(trans);
4396                goto out_free;
4397        }
4398
4399        rc->merge_reloc_tree = 1;
4400
4401        while (!list_empty(&reloc_roots)) {
4402                reloc_root = list_entry(reloc_roots.next,
4403                                        struct btrfs_root, root_list);
4404                list_del(&reloc_root->root_list);
4405
4406                if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4407                        list_add_tail(&reloc_root->root_list,
4408                                      &rc->reloc_roots);
4409                        continue;
4410                }
4411
4412                fs_root = read_fs_root(root->fs_info,
4413                                       reloc_root->root_key.offset);
4414                if (IS_ERR(fs_root)) {
4415                        err = PTR_ERR(fs_root);
4416                        goto out_free;
4417                }
4418
4419                err = __add_reloc_root(reloc_root);
4420                BUG_ON(err < 0); /* -ENOMEM or logic error */
4421                fs_root->reloc_root = reloc_root;
4422        }
4423
4424        err = btrfs_commit_transaction(trans, rc->extent_root);
4425        if (err)
4426                goto out_free;
4427
4428        merge_reloc_roots(rc);
4429
4430        unset_reloc_control(rc);
4431
4432        trans = btrfs_join_transaction(rc->extent_root);
4433        if (IS_ERR(trans))
4434                err = PTR_ERR(trans);
4435        else
4436                err = btrfs_commit_transaction(trans, rc->extent_root);
4437out_free:
4438        kfree(rc);
4439out:
4440        if (!list_empty(&reloc_roots))
4441                free_reloc_roots(&reloc_roots);
4442
4443        btrfs_free_path(path);
4444
4445        if (err == 0) {
4446                /* cleanup orphan inode in data relocation tree */
4447                fs_root = read_fs_root(root->fs_info,
4448                                       BTRFS_DATA_RELOC_TREE_OBJECTID);
4449                if (IS_ERR(fs_root))
4450                        err = PTR_ERR(fs_root);
4451                else
4452                        err = btrfs_orphan_cleanup(fs_root);
4453        }
4454        return err;
4455}
4456
4457/*
4458 * helper to add ordered checksum for data relocation.
4459 *
4460 * cloning checksum properly handles the nodatasum extents.
4461 * it also saves CPU time to re-calculate the checksum.
4462 */
4463int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4464{
4465        struct btrfs_ordered_sum *sums;
4466        struct btrfs_ordered_extent *ordered;
4467        struct btrfs_root *root = BTRFS_I(inode)->root;
4468        int ret;
4469        u64 disk_bytenr;
4470        u64 new_bytenr;
4471        LIST_HEAD(list);
4472
4473        ordered = btrfs_lookup_ordered_extent(inode, file_pos);
4474        BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
4475
4476        disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
4477        ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
4478                                       disk_bytenr + len - 1, &list, 0);
4479        if (ret)
4480                goto out;
4481
4482        while (!list_empty(&list)) {
4483                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4484                list_del_init(&sums->list);
4485
4486                /*
4487                 * We need to offset the new_bytenr based on where the csum is.
4488                 * We need to do this because we will read in entire prealloc
4489                 * extents but we may have written to say the middle of the
4490                 * prealloc extent, so we need to make sure the csum goes with
4491                 * the right disk offset.
4492                 *
4493                 * We can do this because the data reloc inode refers strictly
4494                 * to the on disk bytes, so we don't have to worry about
4495                 * disk_len vs real len like with real inodes since it's all
4496                 * disk length.
4497                 */
4498                new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
4499                sums->bytenr = new_bytenr;
4500
4501                btrfs_add_ordered_sum(inode, ordered, sums);
4502        }
4503out:
4504        btrfs_put_ordered_extent(ordered);
4505        return ret;
4506}
4507
4508int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4509                          struct btrfs_root *root, struct extent_buffer *buf,
4510                          struct extent_buffer *cow)
4511{
4512        struct reloc_control *rc;
4513        struct backref_node *node;
4514        int first_cow = 0;
4515        int level;
4516        int ret = 0;
4517
4518        rc = root->fs_info->reloc_ctl;
4519        if (!rc)
4520                return 0;
4521
4522        BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4523               root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4524
4525        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
4526                if (buf == root->node)
4527                        __update_reloc_root(root, cow->start);
4528        }
4529
4530        level = btrfs_header_level(buf);
4531        if (btrfs_header_generation(buf) <=
4532            btrfs_root_last_snapshot(&root->root_item))
4533                first_cow = 1;
4534
4535        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4536            rc->create_reloc_tree) {
4537                WARN_ON(!first_cow && level == 0);
4538
4539                node = rc->backref_cache.path[level];
4540                BUG_ON(node->bytenr != buf->start &&
4541                       node->new_bytenr != buf->start);
4542
4543                drop_node_buffer(node);
4544                extent_buffer_get(cow);
4545                node->eb = cow;
4546                node->new_bytenr = cow->start;
4547
4548                if (!node->pending) {
4549                        list_move_tail(&node->list,
4550                                       &rc->backref_cache.pending[level]);
4551                        node->pending = 1;
4552                }
4553
4554                if (first_cow)
4555                        __mark_block_processed(rc, node);
4556
4557                if (first_cow && level > 0)
4558                        rc->nodes_relocated += buf->len;
4559        }
4560
4561        if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
4562                ret = replace_file_extents(trans, rc, root, cow);
4563        return ret;
4564}
4565
4566/*
4567 * called before creating snapshot. it calculates metadata reservation
4568 * requried for relocating tree blocks in the snapshot
4569 */
4570void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
4571                              struct btrfs_pending_snapshot *pending,
4572                              u64 *bytes_to_reserve)
4573{
4574        struct btrfs_root *root;
4575        struct reloc_control *rc;
4576
4577        root = pending->root;
4578        if (!root->reloc_root)
4579                return;
4580
4581        rc = root->fs_info->reloc_ctl;
4582        if (!rc->merge_reloc_tree)
4583                return;
4584
4585        root = root->reloc_root;
4586        BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4587        /*
4588         * relocation is in the stage of merging trees. the space
4589         * used by merging a reloc tree is twice the size of
4590         * relocated tree nodes in the worst case. half for cowing
4591         * the reloc tree, half for cowing the fs tree. the space
4592         * used by cowing the reloc tree will be freed after the
4593         * tree is dropped. if we create snapshot, cowing the fs
4594         * tree may use more space than it frees. so we need
4595         * reserve extra space.
4596         */
4597        *bytes_to_reserve += rc->nodes_relocated;
4598}
4599
4600/*
4601 * called after snapshot is created. migrate block reservation
4602 * and create reloc root for the newly created snapshot
4603 */
4604int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4605                               struct btrfs_pending_snapshot *pending)
4606{
4607        struct btrfs_root *root = pending->root;
4608        struct btrfs_root *reloc_root;
4609        struct btrfs_root *new_root;
4610        struct reloc_control *rc;
4611        int ret;
4612
4613        if (!root->reloc_root)
4614                return 0;
4615
4616        rc = root->fs_info->reloc_ctl;
4617        rc->merging_rsv_size += rc->nodes_relocated;
4618
4619        if (rc->merge_reloc_tree) {
4620                ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4621                                              rc->block_rsv,
4622                                              rc->nodes_relocated);
4623                if (ret)
4624                        return ret;
4625        }
4626
4627        new_root = pending->snap;
4628        reloc_root = create_reloc_root(trans, root->reloc_root,
4629                                       new_root->root_key.objectid);
4630        if (IS_ERR(reloc_root))
4631                return PTR_ERR(reloc_root);
4632
4633        ret = __add_reloc_root(reloc_root);
4634        BUG_ON(ret < 0);
4635        new_root->reloc_root = reloc_root;
4636
4637        if (rc->create_reloc_tree)
4638                ret = clone_backref_node(trans, rc, root, reloc_root);
4639        return ret;
4640}
4641