linux/fs/btrfs/relocation.c
   1/*
   2 * Copyright (C) 2009 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 02111-1307, USA.
  17 */
  18
  19#include <linux/sched.h>
  20#include <linux/pagemap.h>
  21#include <linux/writeback.h>
  22#include <linux/blkdev.h>
  23#include <linux/rbtree.h>
  24#include <linux/slab.h>
  25#include "ctree.h"
  26#include "disk-io.h"
  27#include "transaction.h"
  28#include "volumes.h"
  29#include "locking.h"
  30#include "btrfs_inode.h"
  31#include "async-thread.h"
  32#include "free-space-cache.h"
  33#include "inode-map.h"
  34
  35/*
  36 * backref_node, mapping_node and tree_block start with this
  37 */
  38struct tree_entry {
  39        struct rb_node rb_node;
  40        u64 bytenr;
  41};
  42
  43/*
  44 * represents a tree block in the backref cache
  45 */
  46struct backref_node {
  47        struct rb_node rb_node;
  48        u64 bytenr;
  49
  50        u64 new_bytenr;
  51        /* objectid of the tree block owner; may not be up to date */
  52        u64 owner;
  53        /* link to pending, changed or detached list */
  54        struct list_head list;
  55        /* list of upper level blocks that reference this block */
  56        struct list_head upper;
  57        /* list of child blocks in the cache */
  58        struct list_head lower;
  59        /* NULL if this node is not a tree root */
  60        struct btrfs_root *root;
  61        /* extent buffer obtained by COWing the block */
  62        struct extent_buffer *eb;
  63        /* level of tree block */
  64        unsigned int level:8;
  65        /* is the block in a non-reference counted tree */
  66        unsigned int cowonly:1;
  67        /* 1 if no child node in the cache */
  68        unsigned int lowest:1;
  69        /* is the extent buffer locked */
  70        unsigned int locked:1;
  71        /* has the block been processed */
  72        unsigned int processed:1;
  73        /* have backrefs of this block been checked */
  74        unsigned int checked:1;
  75        /*
  76         * 1 if corresponding block has been cowed but some upper
  77         * level block pointers may not point to the new location
  78         */
  79        unsigned int pending:1;
  80        /*
  81         * 1 if the backref node isn't connected to any other
  82         * backref node.
  83         */
  84        unsigned int detached:1;
  85};
  86
  87/*
  88 * represents a block pointer in the backref cache
  89 */
  90struct backref_edge {
  91        struct list_head list[2];
  92        struct backref_node *node[2];
  93};
  94
  95#define LOWER   0
  96#define UPPER   1
  97
  98struct backref_cache {
  99        /* red black tree of all backref nodes in the cache */
 100        struct rb_root rb_root;
 101        /* for passing backref nodes to btrfs_reloc_cow_block */
 102        struct backref_node *path[BTRFS_MAX_LEVEL];
 103        /*
 104         * list of blocks that have been cowed but some block
 105         * pointers in upper level blocks may not reflect the
 106         * new location
 107         */
 108        struct list_head pending[BTRFS_MAX_LEVEL];
 109        /* list of backref nodes with no child node */
 110        struct list_head leaves;
 111        /* list of blocks that have been cowed in current transaction */
 112        struct list_head changed;
 113        /* list of detached backref nodes */
 114        struct list_head detached;
 115
 116        u64 last_trans;
 117
 118        int nr_nodes;
 119        int nr_edges;
 120};
 121
 122/*
 123 * map address of tree root to tree
 124 */
 125struct mapping_node {
 126        struct rb_node rb_node;
 127        u64 bytenr;
 128        void *data;
 129};
 130
 131struct mapping_tree {
 132        struct rb_root rb_root;
 133        spinlock_t lock;
 134};
 135
 136/*
 137 * represents a tree block to process
 138 */
 139struct tree_block {
 140        struct rb_node rb_node;
 141        u64 bytenr;
 142        struct btrfs_key key;
 143        unsigned int level:8;
 144        unsigned int key_ready:1;
 145};
 146
 147#define MAX_EXTENTS 128
 148
 149struct file_extent_cluster {
 150        u64 start;
 151        u64 end;
 152        u64 boundary[MAX_EXTENTS];
 153        unsigned int nr;
 154};
 155
 156struct reloc_control {
 157        /* block group to relocate */
 158        struct btrfs_block_group_cache *block_group;
 159        /* extent tree */
 160        struct btrfs_root *extent_root;
 161        /* inode for moving data */
 162        struct inode *data_inode;
 163
 164        struct btrfs_block_rsv *block_rsv;
 165
 166        struct backref_cache backref_cache;
 167
 168        struct file_extent_cluster cluster;
 169        /* tree blocks that have been processed */
 170        struct extent_io_tree processed_blocks;
 171        /* map start of tree root to corresponding reloc tree */
 172        struct mapping_tree reloc_root_tree;
 173        /* list of reloc trees */
 174        struct list_head reloc_roots;
 175        /* size of metadata reservation for merging reloc trees */
 176        u64 merging_rsv_size;
 177        /* size of relocated tree nodes */
 178        u64 nodes_relocated;
 179
 180        u64 search_start;
 181        u64 extents_found;
 182
 183        unsigned int stage:8;
 184        unsigned int create_reloc_tree:1;
 185        unsigned int merge_reloc_tree:1;
 186        unsigned int found_file_extent:1;
 187        unsigned int commit_transaction:1;
 188};
 189
 190/* stages of data relocation */
 191#define MOVE_DATA_EXTENTS       0
 192#define UPDATE_DATA_PTRS        1
 193
 194static void remove_backref_node(struct backref_cache *cache,
 195                                struct backref_node *node);
 196static void __mark_block_processed(struct reloc_control *rc,
 197                                   struct backref_node *node);
 198
 199static void mapping_tree_init(struct mapping_tree *tree)
 200{
 201        tree->rb_root = RB_ROOT;
 202        spin_lock_init(&tree->lock);
 203}
 204
 205static void backref_cache_init(struct backref_cache *cache)
 206{
 207        int i;
 208        cache->rb_root = RB_ROOT;
 209        for (i = 0; i < BTRFS_MAX_LEVEL; i++)
 210                INIT_LIST_HEAD(&cache->pending[i]);
 211        INIT_LIST_HEAD(&cache->changed);
 212        INIT_LIST_HEAD(&cache->detached);
 213        INIT_LIST_HEAD(&cache->leaves);
 214}
 215
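    /*
     * drop every node and edge still tracked by the cache and verify
     * that the cache really is empty afterwards
     */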
 216static void backref_cache_cleanup(struct backref_cache *cache)
 217{
 218        struct backref_node *node;
 219        int i;
 220
 221        while (!list_empty(&cache->detached)) {
 222                node = list_entry(cache->detached.next,
 223                                  struct backref_node, list);
 224                remove_backref_node(cache, node);
 225        }
 226
 227        while (!list_empty(&cache->leaves)) {
 228                node = list_entry(cache->leaves.next,
 229                                  struct backref_node, lower);
 230                remove_backref_node(cache, node);
 231        }
 232
 233        cache->last_trans = 0;
 234
 235        for (i = 0; i < BTRFS_MAX_LEVEL; i++)
 236                BUG_ON(!list_empty(&cache->pending[i]));
 237        BUG_ON(!list_empty(&cache->changed));
 238        BUG_ON(!list_empty(&cache->detached));
 239        BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
 240        BUG_ON(cache->nr_nodes);
 241        BUG_ON(cache->nr_edges);
 242}
 243
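    /*
     * allocate a zeroed backref node and account it in the cache;
     * free_backref_node() below drops the accounting again
     */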
 244static struct backref_node *alloc_backref_node(struct backref_cache *cache)
 245{
 246        struct backref_node *node;
 247
 248        node = kzalloc(sizeof(*node), GFP_NOFS);
 249        if (node) {
 250                INIT_LIST_HEAD(&node->list);
 251                INIT_LIST_HEAD(&node->upper);
 252                INIT_LIST_HEAD(&node->lower);
 253                RB_CLEAR_NODE(&node->rb_node);
 254                cache->nr_nodes++;
 255        }
 256        return node;
 257}
 258
 259static void free_backref_node(struct backref_cache *cache,
 260                              struct backref_node *node)
 261{
 262        if (node) {
 263                cache->nr_nodes--;
 264                kfree(node);
 265        }
 266}
 267
 268static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
 269{
 270        struct backref_edge *edge;
 271
 272        edge = kzalloc(sizeof(*edge), GFP_NOFS);
 273        if (edge)
 274                cache->nr_edges++;
 275        return edge;
 276}
 277
 278static void free_backref_edge(struct backref_cache *cache,
 279                              struct backref_edge *edge)
 280{
 281        if (edge) {
 282                cache->nr_edges--;
 283                kfree(edge);
 284        }
 285}
 286
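    /*
     * insert @node, keyed by @bytenr, into an rb-tree of tree_entry
     * compatible structures. returns the conflicting rb_node if the
     * key already exists, NULL on success.
     */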
 287static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
 288                                   struct rb_node *node)
 289{
 290        struct rb_node **p = &root->rb_node;
 291        struct rb_node *parent = NULL;
 292        struct tree_entry *entry;
 293
 294        while (*p) {
 295                parent = *p;
 296                entry = rb_entry(parent, struct tree_entry, rb_node);
 297
 298                if (bytenr < entry->bytenr)
 299                        p = &(*p)->rb_left;
 300                else if (bytenr > entry->bytenr)
 301                        p = &(*p)->rb_right;
 302                else
 303                        return parent;
 304        }
 305
 306        rb_link_node(node, parent, p);
 307        rb_insert_color(node, root);
 308        return NULL;
 309}
 310
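    /*
     * find the tree_entry compatible node with the given bytenr,
     * or NULL if it is not cached
     */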
 311static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
 312{
 313        struct rb_node *n = root->rb_node;
 314        struct tree_entry *entry;
 315
 316        while (n) {
 317                entry = rb_entry(n, struct tree_entry, rb_node);
 318
 319                if (bytenr < entry->bytenr)
 320                        n = n->rb_left;
 321                else if (bytenr > entry->bytenr)
 322                        n = n->rb_right;
 323                else
 324                        return n;
 325        }
 326        return NULL;
 327}
 328
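    /*
     * panic helper used when tree_insert() unexpectedly finds an
     * existing entry for the given offset
     */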
 329static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
 330{
 331
 332        struct btrfs_fs_info *fs_info = NULL;
 333        struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
 334                                              rb_node);
 335        if (bnode->root)
 336                fs_info = bnode->root->fs_info;
 337        btrfs_panic(fs_info, errno, "Inconsistency in backref cache "
 338                    "found at offset %llu\n", (unsigned long long)bytenr);
 339}
 340
 341/*
 342 * walk up backref nodes until reaching the node that represents the tree root
 343 */
 344static struct backref_node *walk_up_backref(struct backref_node *node,
 345                                            struct backref_edge *edges[],
 346                                            int *index)
 347{
 348        struct backref_edge *edge;
 349        int idx = *index;
 350
 351        while (!list_empty(&node->upper)) {
 352                edge = list_entry(node->upper.next,
 353                                  struct backref_edge, list[LOWER]);
 354                edges[idx++] = edge;
 355                node = edge->node[UPPER];
 356        }
 357        BUG_ON(node->detached);
 358        *index = idx;
 359        return node;
 360}
 361
 362/*
 363 * walk down backref nodes to find the start of the next reference path
 364 */
 365static struct backref_node *walk_down_backref(struct backref_edge *edges[],
 366                                              int *index)
 367{
 368        struct backref_edge *edge;
 369        struct backref_node *lower;
 370        int idx = *index;
 371
 372        while (idx > 0) {
 373                edge = edges[idx - 1];
 374                lower = edge->node[LOWER];
 375                if (list_is_last(&edge->list[LOWER], &lower->upper)) {
 376                        idx--;
 377                        continue;
 378                }
 379                edge = list_entry(edge->list[LOWER].next,
 380                                  struct backref_edge, list[LOWER]);
 381                edges[idx - 1] = edge;
 382                *index = idx;
 383                return edge->node[UPPER];
 384        }
 385        *index = 0;
 386        return NULL;
 387}
 388
 389static void unlock_node_buffer(struct backref_node *node)
 390{
 391        if (node->locked) {
 392                btrfs_tree_unlock(node->eb);
 393                node->locked = 0;
 394        }
 395}
 396
 397static void drop_node_buffer(struct backref_node *node)
 398{
 399        if (node->eb) {
 400                unlock_node_buffer(node);
 401                free_extent_buffer(node->eb);
 402                node->eb = NULL;
 403        }
 404}
 405
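    /*
     * unhook a node from its lists and the rb-tree and free it.
     * all edges to upper level nodes must have been removed already.
     */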
 406static void drop_backref_node(struct backref_cache *tree,
 407                              struct backref_node *node)
 408{
 409        BUG_ON(!list_empty(&node->upper));
 410
 411        drop_node_buffer(node);
 412        list_del(&node->list);
 413        list_del(&node->lower);
 414        if (!RB_EMPTY_NODE(&node->rb_node))
 415                rb_erase(&node->rb_node, &tree->rb_root);
 416        free_backref_node(tree, node);
 417}
 418
 419/*
 420 * remove a backref node from the backref cache
 421 */
 422static void remove_backref_node(struct backref_cache *cache,
 423                                struct backref_node *node)
 424{
 425        struct backref_node *upper;
 426        struct backref_edge *edge;
 427
 428        if (!node)
 429                return;
 430
 431        BUG_ON(!node->lowest && !node->detached);
 432        while (!list_empty(&node->upper)) {
 433                edge = list_entry(node->upper.next, struct backref_edge,
 434                                  list[LOWER]);
 435                upper = edge->node[UPPER];
 436                list_del(&edge->list[LOWER]);
 437                list_del(&edge->list[UPPER]);
 438                free_backref_edge(cache, edge);
 439
 440                if (RB_EMPTY_NODE(&upper->rb_node)) {
 441                        BUG_ON(!list_empty(&node->upper));
 442                        drop_backref_node(cache, node);
 443                        node = upper;
 444                        node->lowest = 1;
 445                        continue;
 446                }
 447                /*
 448                 * add the node to the leaf node list if no other
 449                 * child block is cached.
 450                 */
 451                if (list_empty(&upper->lower)) {
 452                        list_add_tail(&upper->lower, &cache->leaves);
 453                        upper->lowest = 1;
 454                }
 455        }
 456
 457        drop_backref_node(cache, node);
 458}
 459
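    /*
     * re-insert a cached backref node under its new bytenr after the
     * corresponding block has been COWed to a new location
     */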
 460static void update_backref_node(struct backref_cache *cache,
 461                                struct backref_node *node, u64 bytenr)
 462{
 463        struct rb_node *rb_node;
 464        rb_erase(&node->rb_node, &cache->rb_root);
 465        node->bytenr = bytenr;
 466        rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
 467        if (rb_node)
 468                backref_tree_panic(rb_node, -EEXIST, bytenr);
 469}
 470
 471/*
 472 * update backref cache after a transaction commit
 473 */
 474static int update_backref_cache(struct btrfs_trans_handle *trans,
 475                                struct backref_cache *cache)
 476{
 477        struct backref_node *node;
 478        int level = 0;
 479
 480        if (cache->last_trans == 0) {
 481                cache->last_trans = trans->transid;
 482                return 0;
 483        }
 484
 485        if (cache->last_trans == trans->transid)
 486                return 0;
 487
 488        /*
 489         * detached nodes are used to avoid unnecessary backref
 490         * lookups. a transaction commit changes the extent tree,
 491         * so the detached nodes are no longer useful.
 492         */
 493        while (!list_empty(&cache->detached)) {
 494                node = list_entry(cache->detached.next,
 495                                  struct backref_node, list);
 496                remove_backref_node(cache, node);
 497        }
 498
 499        while (!list_empty(&cache->changed)) {
 500                node = list_entry(cache->changed.next,
 501                                  struct backref_node, list);
 502                list_del_init(&node->list);
 503                BUG_ON(node->pending);
 504                update_backref_node(cache, node, node->new_bytenr);
 505        }
 506
 507        /*
 508         * some nodes can be left in the pending list if there were
 509         * errors while processing the pending nodes.
 510         */
 511        for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
 512                list_for_each_entry(node, &cache->pending[level], list) {
 513                        BUG_ON(!node->pending);
 514                        if (node->bytenr == node->new_bytenr)
 515                                continue;
 516                        update_backref_node(cache, node, node->new_bytenr);
 517                }
 518        }
 519
 520        cache->last_trans = 0;
 521        return 1;
 522}
 523
 524
 525static int should_ignore_root(struct btrfs_root *root)
 526{
 527        struct btrfs_root *reloc_root;
 528
 529        if (!root->ref_cows)
 530                return 0;
 531
 532        reloc_root = root->reloc_root;
 533        if (!reloc_root)
 534                return 0;
 535
 536        if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
 537            root->fs_info->running_transaction->transid - 1)
 538                return 0;
 539        /*
 540         * if there is a reloc tree and it was created in a previous
 541         * transaction, backref lookup can find the reloc tree, so the
 542         * backref node for the fs tree root is useless for
 543         * relocation.
 544         */
 545        return 1;
 546}
 547/*
 548 * find reloc tree by address of tree root
 549 */
 550static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
 551                                          u64 bytenr)
 552{
 553        struct rb_node *rb_node;
 554        struct mapping_node *node;
 555        struct btrfs_root *root = NULL;
 556
 557        spin_lock(&rc->reloc_root_tree.lock);
 558        rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
 559        if (rb_node) {
 560                node = rb_entry(rb_node, struct mapping_node, rb_node);
 561                root = (struct btrfs_root *)node->data;
 562        }
 563        spin_unlock(&rc->reloc_root_tree.lock);
 564        return root;
 565}
 566
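    /*
     * return 1 if the given root objectid belongs to one of the
     * non-reference counted (COW-only) trees
     */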
 567static int is_cowonly_root(u64 root_objectid)
 568{
 569        if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
 570            root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
 571            root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
 572            root_objectid == BTRFS_DEV_TREE_OBJECTID ||
 573            root_objectid == BTRFS_TREE_LOG_OBJECTID ||
 574            root_objectid == BTRFS_CSUM_TREE_OBJECTID)
 575                return 1;
 576        return 0;
 577}
 578
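    /*
     * read a root from the tree of tree roots. COW-only roots are
     * looked up with offset 0, all other roots with offset (u64)-1.
     */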
 579static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
 580                                        u64 root_objectid)
 581{
 582        struct btrfs_key key;
 583
 584        key.objectid = root_objectid;
 585        key.type = BTRFS_ROOT_ITEM_KEY;
 586        if (is_cowonly_root(root_objectid))
 587                key.offset = 0;
 588        else
 589                key.offset = (u64)-1;
 590
 591        return btrfs_read_fs_root_no_name(fs_info, &key);
 592}
 593
 594#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 595static noinline_for_stack
 596struct btrfs_root *find_tree_root(struct reloc_control *rc,
 597                                  struct extent_buffer *leaf,
 598                                  struct btrfs_extent_ref_v0 *ref0)
 599{
 600        struct btrfs_root *root;
 601        u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
 602        u64 generation = btrfs_ref_generation_v0(leaf, ref0);
 603
 604        BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
 605
 606        root = read_fs_root(rc->extent_root->fs_info, root_objectid);
 607        BUG_ON(IS_ERR(root));
 608
 609        if (root->ref_cows &&
 610            generation != btrfs_root_generation(&root->root_item))
 611                return NULL;
 612
 613        return root;
 614}
 615#endif
 616
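    /*
     * find the start and end of the inline backrefs stored in the
     * extent item at @slot. returns 1 if the item has no inline
     * refs that need walking, 0 otherwise.
     */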
 617static noinline_for_stack
 618int find_inline_backref(struct extent_buffer *leaf, int slot,
 619                        unsigned long *ptr, unsigned long *end)
 620{
 621        struct btrfs_key key;
 622        struct btrfs_extent_item *ei;
 623        struct btrfs_tree_block_info *bi;
 624        u32 item_size;
 625
 626        btrfs_item_key_to_cpu(leaf, &key, slot);
 627
 628        item_size = btrfs_item_size_nr(leaf, slot);
 629#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 630        if (item_size < sizeof(*ei)) {
 631                WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
 632                return 1;
 633        }
 634#endif
 635        ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
 636        WARN_ON(!(btrfs_extent_flags(leaf, ei) &
 637                  BTRFS_EXTENT_FLAG_TREE_BLOCK));
 638
 639        if (key.type == BTRFS_EXTENT_ITEM_KEY &&
 640            item_size <= sizeof(*ei) + sizeof(*bi)) {
 641                WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
 642                return 1;
 643        }
 644
 645        if (key.type == BTRFS_EXTENT_ITEM_KEY) {
 646                bi = (struct btrfs_tree_block_info *)(ei + 1);
 647                *ptr = (unsigned long)(bi + 1);
 648        } else {
 649                *ptr = (unsigned long)(ei + 1);
 650        }
 651        *end = (unsigned long)ei + item_size;
 652        return 0;
 653}
 654
 655/*
 656 * build backref tree for a given tree block. root of the backref tree
 657 * corresponds to the tree block, leaves of the backref tree correspond
 658 * to roots of b-trees that reference the tree block.
 659 *
 660 * the basic idea of this function is to check backrefs of a given block
 661 * to find upper level blocks that reference the block, and then check
 662 * backrefs of these upper level blocks recursively. the recursion stops
 663 * when the tree root is reached or the backrefs for the block are cached.
 664 *
 665 * NOTE: if we find backrefs for a block are cached, we know backrefs
 666 * for all upper level blocks that directly/indirectly reference the
 667 * block are also cached.
 668 */
 669static noinline_for_stack
 670struct backref_node *build_backref_tree(struct reloc_control *rc,
 671                                        struct btrfs_key *node_key,
 672                                        int level, u64 bytenr)
 673{
 674        struct backref_cache *cache = &rc->backref_cache;
 675        struct btrfs_path *path1;
 676        struct btrfs_path *path2;
 677        struct extent_buffer *eb;
 678        struct btrfs_root *root;
 679        struct backref_node *cur;
 680        struct backref_node *upper;
 681        struct backref_node *lower;
 682        struct backref_node *node = NULL;
 683        struct backref_node *exist = NULL;
 684        struct backref_edge *edge;
 685        struct rb_node *rb_node;
 686        struct btrfs_key key;
 687        unsigned long end;
 688        unsigned long ptr;
 689        LIST_HEAD(list);
 690        LIST_HEAD(useless);
 691        int cowonly;
 692        int ret;
 693        int err = 0;
 694
 695        path1 = btrfs_alloc_path();
 696        path2 = btrfs_alloc_path();
 697        if (!path1 || !path2) {
 698                err = -ENOMEM;
 699                goto out;
 700        }
 701        path1->reada = 1;
 702        path2->reada = 2;
 703
 704        node = alloc_backref_node(cache);
 705        if (!node) {
 706                err = -ENOMEM;
 707                goto out;
 708        }
 709
 710        node->bytenr = bytenr;
 711        node->level = level;
 712        node->lowest = 1;
 713        cur = node;
 714again:
 715        end = 0;
 716        ptr = 0;
 717        key.objectid = cur->bytenr;
 718        key.type = BTRFS_METADATA_ITEM_KEY;
 719        key.offset = (u64)-1;
 720
 721        path1->search_commit_root = 1;
 722        path1->skip_locking = 1;
 723        ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
 724                                0, 0);
 725        if (ret < 0) {
 726                err = ret;
 727                goto out;
 728        }
 729        BUG_ON(!ret || !path1->slots[0]);
 730
 731        path1->slots[0]--;
 732
 733        WARN_ON(cur->checked);
 734        if (!list_empty(&cur->upper)) {
 735                /*
 736                 * the backref was added previously when processing
 737                 * backref of type BTRFS_TREE_BLOCK_REF_KEY
 738                 */
 739                BUG_ON(!list_is_singular(&cur->upper));
 740                edge = list_entry(cur->upper.next, struct backref_edge,
 741                                  list[LOWER]);
 742                BUG_ON(!list_empty(&edge->list[UPPER]));
 743                exist = edge->node[UPPER];
 744                /*
 745                 * add the upper level block to the pending list if we
 746                 * need to check its backrefs
 747                 */
 748                if (!exist->checked)
 749                        list_add_tail(&edge->list[UPPER], &list);
 750        } else {
 751                exist = NULL;
 752        }
 753
 754        while (1) {
 755                cond_resched();
 756                eb = path1->nodes[0];
 757
 758                if (ptr >= end) {
 759                        if (path1->slots[0] >= btrfs_header_nritems(eb)) {
 760                                ret = btrfs_next_leaf(rc->extent_root, path1);
 761                                if (ret < 0) {
 762                                        err = ret;
 763                                        goto out;
 764                                }
 765                                if (ret > 0)
 766                                        break;
 767                                eb = path1->nodes[0];
 768                        }
 769
 770                        btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
 771                        if (key.objectid != cur->bytenr) {
 772                                WARN_ON(exist);
 773                                break;
 774                        }
 775
 776                        if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 777                            key.type == BTRFS_METADATA_ITEM_KEY) {
 778                                ret = find_inline_backref(eb, path1->slots[0],
 779                                                          &ptr, &end);
 780                                if (ret)
 781                                        goto next;
 782                        }
 783                }
 784
 785                if (ptr < end) {
 786                        /* update key for inline back ref */
 787                        struct btrfs_extent_inline_ref *iref;
 788                        iref = (struct btrfs_extent_inline_ref *)ptr;
 789                        key.type = btrfs_extent_inline_ref_type(eb, iref);
 790                        key.offset = btrfs_extent_inline_ref_offset(eb, iref);
 791                        WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
 792                                key.type != BTRFS_SHARED_BLOCK_REF_KEY);
 793                }
 794
 795                if (exist &&
 796                    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
 797                      exist->owner == key.offset) ||
 798                     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
 799                      exist->bytenr == key.offset))) {
 800                        exist = NULL;
 801                        goto next;
 802                }
 803
 804#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 805                if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
 806                    key.type == BTRFS_EXTENT_REF_V0_KEY) {
 807                        if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
 808                                struct btrfs_extent_ref_v0 *ref0;
 809                                ref0 = btrfs_item_ptr(eb, path1->slots[0],
 810                                                struct btrfs_extent_ref_v0);
 811                                if (key.objectid == key.offset) {
 812                                        root = find_tree_root(rc, eb, ref0);
 813                                        if (root && !should_ignore_root(root))
 814                                                cur->root = root;
 815                                        else
 816                                                list_add(&cur->list, &useless);
 817                                        break;
 818                                }
 819                                if (is_cowonly_root(btrfs_ref_root_v0(eb,
 820                                                                      ref0)))
 821                                        cur->cowonly = 1;
 822                        }
 823#else
 824                BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
 825                if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
 826#endif
 827                        if (key.objectid == key.offset) {
 828                                /*
 829                                 * only root blocks of reloc trees use
 830                                 * backrefs of this type.
 831                                 */
 832                                root = find_reloc_root(rc, cur->bytenr);
 833                                BUG_ON(!root);
 834                                cur->root = root;
 835                                break;
 836                        }
 837
 838                        edge = alloc_backref_edge(cache);
 839                        if (!edge) {
 840                                err = -ENOMEM;
 841                                goto out;
 842                        }
 843                        rb_node = tree_search(&cache->rb_root, key.offset);
 844                        if (!rb_node) {
 845                                upper = alloc_backref_node(cache);
 846                                if (!upper) {
 847                                        free_backref_edge(cache, edge);
 848                                        err = -ENOMEM;
 849                                        goto out;
 850                                }
 851                                upper->bytenr = key.offset;
 852                                upper->level = cur->level + 1;
 853                                /*
 854                                 * backrefs for the upper level block aren't
 855                                 * cached, add the block to the pending list
 856                                 */
 857                                list_add_tail(&edge->list[UPPER], &list);
 858                        } else {
 859                                upper = rb_entry(rb_node, struct backref_node,
 860                                                 rb_node);
 861                                BUG_ON(!upper->checked);
 862                                INIT_LIST_HEAD(&edge->list[UPPER]);
 863                        }
 864                        list_add_tail(&edge->list[LOWER], &cur->upper);
 865                        edge->node[LOWER] = cur;
 866                        edge->node[UPPER] = upper;
 867
 868                        goto next;
 869                } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
 870                        goto next;
 871                }
 872
 873                /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
 874                root = read_fs_root(rc->extent_root->fs_info, key.offset);
 875                if (IS_ERR(root)) {
 876                        err = PTR_ERR(root);
 877                        goto out;
 878                }
 879
 880                if (!root->ref_cows)
 881                        cur->cowonly = 1;
 882
 883                if (btrfs_root_level(&root->root_item) == cur->level) {
 884                        /* tree root */
 885                        BUG_ON(btrfs_root_bytenr(&root->root_item) !=
 886                               cur->bytenr);
 887                        if (should_ignore_root(root))
 888                                list_add(&cur->list, &useless);
 889                        else
 890                                cur->root = root;
 891                        break;
 892                }
 893
 894                level = cur->level + 1;
 895
 896                /*
 897                 * search the tree to find upper level blocks that
 898                 * reference the block.
 899                 */
 900                path2->search_commit_root = 1;
 901                path2->skip_locking = 1;
 902                path2->lowest_level = level;
 903                ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
 904                path2->lowest_level = 0;
 905                if (ret < 0) {
 906                        err = ret;
 907                        goto out;
 908                }
 909                if (ret > 0 && path2->slots[level] > 0)
 910                        path2->slots[level]--;
 911
 912                eb = path2->nodes[level];
 913                WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
 914                        cur->bytenr);
 915
 916                lower = cur;
 917                for (; level < BTRFS_MAX_LEVEL; level++) {
 918                        if (!path2->nodes[level]) {
 919                                BUG_ON(btrfs_root_bytenr(&root->root_item) !=
 920                                       lower->bytenr);
 921                                if (should_ignore_root(root))
 922                                        list_add(&lower->list, &useless);
 923                                else
 924                                        lower->root = root;
 925                                break;
 926                        }
 927
 928                        edge = alloc_backref_edge(cache);
 929                        if (!edge) {
 930                                err = -ENOMEM;
 931                                goto out;
 932                        }
 933
 934                        eb = path2->nodes[level];
 935                        rb_node = tree_search(&cache->rb_root, eb->start);
 936                        if (!rb_node) {
 937                                upper = alloc_backref_node(cache);
 938                                if (!upper) {
 939                                        free_backref_edge(cache, edge);
 940                                        err = -ENOMEM;
 941                                        goto out;
 942                                }
 943                                upper->bytenr = eb->start;
 944                                upper->owner = btrfs_header_owner(eb);
 945                                upper->level = lower->level + 1;
 946                                if (!root->ref_cows)
 947                                        upper->cowonly = 1;
 948
 949                                /*
 950                                 * if we know the block isn't shared
 951                                 * we can avoid checking its backrefs.
 952                                 */
 953                                if (btrfs_block_can_be_shared(root, eb))
 954                                        upper->checked = 0;
 955                                else
 956                                        upper->checked = 1;
 957
 958                                /*
 959                                 * add the block to the pending list if we
 960                                 * need to check its backrefs. only blocks
 961                                 * at 'cur->level + 1' are added to the
 962                                 * tail of the pending list. this guarantees
 963                                 * that we check backrefs from lower level
 964                                 * blocks to upper level blocks.
 965                                 */
 966                                if (!upper->checked &&
 967                                    level == cur->level + 1) {
 968                                        list_add_tail(&edge->list[UPPER],
 969                                                      &list);
 970                                } else
 971                                        INIT_LIST_HEAD(&edge->list[UPPER]);
 972                        } else {
 973                                upper = rb_entry(rb_node, struct backref_node,
 974                                                 rb_node);
 975                                BUG_ON(!upper->checked);
 976                                INIT_LIST_HEAD(&edge->list[UPPER]);
 977                                if (!upper->owner)
 978                                        upper->owner = btrfs_header_owner(eb);
 979                        }
 980                        list_add_tail(&edge->list[LOWER], &lower->upper);
 981                        edge->node[LOWER] = lower;
 982                        edge->node[UPPER] = upper;
 983
 984                        if (rb_node)
 985                                break;
 986                        lower = upper;
 987                        upper = NULL;
 988                }
 989                btrfs_release_path(path2);
 990next:
 991                if (ptr < end) {
 992                        ptr += btrfs_extent_inline_ref_size(key.type);
 993                        if (ptr >= end) {
 994                                WARN_ON(ptr > end);
 995                                ptr = 0;
 996                                end = 0;
 997                        }
 998                }
 999                if (ptr >= end)
1000                        path1->slots[0]++;
1001        }
1002        btrfs_release_path(path1);
1003
1004        cur->checked = 1;
1005        WARN_ON(exist);
1006
1007        /* the pending list isn't empty, take the first block to process */
1008        if (!list_empty(&list)) {
1009                edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1010                list_del_init(&edge->list[UPPER]);
1011                cur = edge->node[UPPER];
1012                goto again;
1013        }
1014
1015        /*
1016         * everything went well, connect the backref nodes and insert
1017         * them into the cache.
1018         */
1019        BUG_ON(!node->checked);
1020        cowonly = node->cowonly;
1021        if (!cowonly) {
1022                rb_node = tree_insert(&cache->rb_root, node->bytenr,
1023                                      &node->rb_node);
1024                if (rb_node)
1025                        backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1026                list_add_tail(&node->lower, &cache->leaves);
1027        }
1028
1029        list_for_each_entry(edge, &node->upper, list[LOWER])
1030                list_add_tail(&edge->list[UPPER], &list);
1031
1032        while (!list_empty(&list)) {
1033                edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1034                list_del_init(&edge->list[UPPER]);
1035                upper = edge->node[UPPER];
1036                if (upper->detached) {
1037                        list_del(&edge->list[LOWER]);
1038                        lower = edge->node[LOWER];
1039                        free_backref_edge(cache, edge);
1040                        if (list_empty(&lower->upper))
1041                                list_add(&lower->list, &useless);
1042                        continue;
1043                }
1044
1045                if (!RB_EMPTY_NODE(&upper->rb_node)) {
1046                        if (upper->lowest) {
1047                                list_del_init(&upper->lower);
1048                                upper->lowest = 0;
1049                        }
1050
1051                        list_add_tail(&edge->list[UPPER], &upper->lower);
1052                        continue;
1053                }
1054
1055                BUG_ON(!upper->checked);
1056                BUG_ON(cowonly != upper->cowonly);
1057                if (!cowonly) {
1058                        rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1059                                              &upper->rb_node);
1060                        if (rb_node)
1061                                backref_tree_panic(rb_node, -EEXIST,
1062                                                   upper->bytenr);
1063                }
1064
1065                list_add_tail(&edge->list[UPPER], &upper->lower);
1066
1067                list_for_each_entry(edge, &upper->upper, list[LOWER])
1068                        list_add_tail(&edge->list[UPPER], &list);
1069        }
1070        /*
1071         * process useless backref nodes. backref nodes for tree leaves
1072         * are deleted from the cache. backref nodes for upper level
1073         * tree blocks are left in the cache to avoid unnecessary backref
1074         * lookup.
1075         */
1076        while (!list_empty(&useless)) {
1077                upper = list_entry(useless.next, struct backref_node, list);
1078                list_del_init(&upper->list);
1079                BUG_ON(!list_empty(&upper->upper));
1080                if (upper == node)
1081                        node = NULL;
1082                if (upper->lowest) {
1083                        list_del_init(&upper->lower);
1084                        upper->lowest = 0;
1085                }
1086                while (!list_empty(&upper->lower)) {
1087                        edge = list_entry(upper->lower.next,
1088                                          struct backref_edge, list[UPPER]);
1089                        list_del(&edge->list[UPPER]);
1090                        list_del(&edge->list[LOWER]);
1091                        lower = edge->node[LOWER];
1092                        free_backref_edge(cache, edge);
1093
1094                        if (list_empty(&lower->upper))
1095                                list_add(&lower->list, &useless);
1096                }
1097                __mark_block_processed(rc, upper);
1098                if (upper->level > 0) {
1099                        list_add(&upper->list, &cache->detached);
1100                        upper->detached = 1;
1101                } else {
1102                        rb_erase(&upper->rb_node, &cache->rb_root);
1103                        free_backref_node(cache, upper);
1104                }
1105        }
1106out:
1107        btrfs_free_path(path1);
1108        btrfs_free_path(path2);
1109        if (err) {
1110                while (!list_empty(&useless)) {
1111                        lower = list_entry(useless.next,
1112                                           struct backref_node, upper);
1113                        list_del_init(&lower->upper);
1114                }
1115                upper = node;
1116                INIT_LIST_HEAD(&list);
1117                while (upper) {
1118                        if (RB_EMPTY_NODE(&upper->rb_node)) {
1119                                list_splice_tail(&upper->upper, &list);
1120                                free_backref_node(cache, upper);
1121                        }
1122
1123                        if (list_empty(&list))
1124                                break;
1125
1126                        edge = list_entry(list.next, struct backref_edge,
1127                                          list[LOWER]);
1128                        list_del(&edge->list[LOWER]);
1129                        upper = edge->node[UPPER];
1130                        free_backref_edge(cache, edge);
1131                }
1132                return ERR_PTR(err);
1133        }
1134        BUG_ON(node && node->detached);
1135        return node;
1136}
1137
1138/*
1139 * helper to add a backref node for the newly created snapshot.
1140 * the backref node is created by cloning the backref node that
1141 * corresponds to the root of the source tree
1142 */
1143static int clone_backref_node(struct btrfs_trans_handle *trans,
1144                              struct reloc_control *rc,
1145                              struct btrfs_root *src,
1146                              struct btrfs_root *dest)
1147{
1148        struct btrfs_root *reloc_root = src->reloc_root;
1149        struct backref_cache *cache = &rc->backref_cache;
1150        struct backref_node *node = NULL;
1151        struct backref_node *new_node;
1152        struct backref_edge *edge;
1153        struct backref_edge *new_edge;
1154        struct rb_node *rb_node;
1155
1156        if (cache->last_trans > 0)
1157                update_backref_cache(trans, cache);
1158
1159        rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1160        if (rb_node) {
1161                node = rb_entry(rb_node, struct backref_node, rb_node);
1162                if (node->detached)
1163                        node = NULL;
1164                else
1165                        BUG_ON(node->new_bytenr != reloc_root->node->start);
1166        }
1167
1168        if (!node) {
1169                rb_node = tree_search(&cache->rb_root,
1170                                      reloc_root->commit_root->start);
1171                if (rb_node) {
1172                        node = rb_entry(rb_node, struct backref_node,
1173                                        rb_node);
1174                        BUG_ON(node->detached);
1175                }
1176        }
1177
1178        if (!node)
1179                return 0;
1180
1181        new_node = alloc_backref_node(cache);
1182        if (!new_node)
1183                return -ENOMEM;
1184
1185        new_node->bytenr = dest->node->start;
1186        new_node->level = node->level;
1187        new_node->lowest = node->lowest;
1188        new_node->checked = 1;
1189        new_node->root = dest;
1190
1191        if (!node->lowest) {
1192                list_for_each_entry(edge, &node->lower, list[UPPER]) {
1193                        new_edge = alloc_backref_edge(cache);
1194                        if (!new_edge)
1195                                goto fail;
1196
1197                        new_edge->node[UPPER] = new_node;
1198                        new_edge->node[LOWER] = edge->node[LOWER];
1199                        list_add_tail(&new_edge->list[UPPER],
1200                                      &new_node->lower);
1201                }
1202        } else {
1203                list_add_tail(&new_node->lower, &cache->leaves);
1204        }
1205
1206        rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1207                              &new_node->rb_node);
1208        if (rb_node)
1209                backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
1210
1211        if (!new_node->lowest) {
1212                list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1213                        list_add_tail(&new_edge->list[LOWER],
1214                                      &new_edge->node[LOWER]->upper);
1215                }
1216        }
1217        return 0;
1218fail:
1219        while (!list_empty(&new_node->lower)) {
1220                new_edge = list_entry(new_node->lower.next,
1221                                      struct backref_edge, list[UPPER]);
1222                list_del(&new_edge->list[UPPER]);
1223                free_backref_edge(cache, new_edge);
1224        }
1225        free_backref_node(cache, new_node);
1226        return -ENOMEM;
1227}
1228
1229/*
1230 * helper to add 'address of tree root -> reloc tree' mapping
1231 */
1232static int __must_check __add_reloc_root(struct btrfs_root *root)
1233{
1234        struct rb_node *rb_node;
1235        struct mapping_node *node;
1236        struct reloc_control *rc = root->fs_info->reloc_ctl;
1237
1238        node = kmalloc(sizeof(*node), GFP_NOFS);
1239        if (!node)
1240                return -ENOMEM;
1241
1242        node->bytenr = root->node->start;
1243        node->data = root;
1244
1245        spin_lock(&rc->reloc_root_tree.lock);
1246        rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1247                              node->bytenr, &node->rb_node);
1248        spin_unlock(&rc->reloc_root_tree.lock);
1249        if (rb_node) {
1250                btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
1251                            "for start=%llu while inserting into relocation "
1252                            "tree\n", node->bytenr);
1253                kfree(node);
1254                return -EEXIST;
1255        }
1256
1257        list_add_tail(&root->root_list, &rc->reloc_roots);
1258        return 0;
1259}
1260
1261/*
1262 * helper to update/delete the 'address of tree root -> reloc tree'
1263 * mapping
1264 */
1265static int __update_reloc_root(struct btrfs_root *root, int del)
1266{
1267        struct rb_node *rb_node;
1268        struct mapping_node *node = NULL;
1269        struct reloc_control *rc = root->fs_info->reloc_ctl;
1270
1271        spin_lock(&rc->reloc_root_tree.lock);
1272        rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1273                              root->commit_root->start);
1274        if (rb_node) {
1275                node = rb_entry(rb_node, struct mapping_node, rb_node);
1276                rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1277        }
1278        spin_unlock(&rc->reloc_root_tree.lock);
1279
1280        if (!node)
1281                return 0;
1282        BUG_ON((struct btrfs_root *)node->data != root);
1283
1284        if (!del) {
1285                spin_lock(&rc->reloc_root_tree.lock);
1286                node->bytenr = root->node->start;
1287                rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1288                                      node->bytenr, &node->rb_node);
1289                spin_unlock(&rc->reloc_root_tree.lock);
1290                if (rb_node)
1291                        backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1292        } else {
1293                spin_lock(&root->fs_info->trans_lock);
1294                list_del_init(&root->root_list);
1295                spin_unlock(&root->fs_info->trans_lock);
1296                kfree(node);
1297        }
1298        return 0;
1299}
1300
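    /*
     * create the root item for a reloc tree by copying either the commit
     * root of @root (when creating the reloc tree for the fs tree itself)
     * or its current root node (post-snapshot case), insert the item into
     * the tree of tree roots and read the new reloc root back in.
     */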
1301static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1302                                        struct btrfs_root *root, u64 objectid)
1303{
1304        struct btrfs_root *reloc_root;
1305        struct extent_buffer *eb;
1306        struct btrfs_root_item *root_item;
1307        struct btrfs_key root_key;
1308        u64 last_snap = 0;
1309        int ret;
1310
1311        root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
1312        BUG_ON(!root_item);
1313
1314        root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
1315        root_key.type = BTRFS_ROOT_ITEM_KEY;
1316        root_key.offset = objectid;
1317
1318        if (root->root_key.objectid == objectid) {
1319                /* called by btrfs_init_reloc_root */
1320                ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1321                                      BTRFS_TREE_RELOC_OBJECTID);
1322                BUG_ON(ret);
1323
1324                last_snap = btrfs_root_last_snapshot(&root->root_item);
1325                btrfs_set_root_last_snapshot(&root->root_item,
1326                                             trans->transid - 1);
1327        } else {
1328                /*
1329                 * called by btrfs_reloc_post_snapshot_hook.
1330                 * the source tree is a reloc tree; all tree blocks
1331                 * modified after it was created have the RELOC flag
1332                 * set in their headers, so it's OK not to update
1333                 * 'last_snapshot'.
1334                 */
1335                ret = btrfs_copy_root(trans, root, root->node, &eb,
1336                                      BTRFS_TREE_RELOC_OBJECTID);
1337                BUG_ON(ret);
1338        }
1339
1340        memcpy(root_item, &root->root_item, sizeof(*root_item));
1341        btrfs_set_root_bytenr(root_item, eb->start);
1342        btrfs_set_root_level(root_item, btrfs_header_level(eb));
1343        btrfs_set_root_generation(root_item, trans->transid);
1344
1345        if (root->root_key.objectid == objectid) {
1346                btrfs_set_root_refs(root_item, 0);
1347                memset(&root_item->drop_progress, 0,
1348                       sizeof(struct btrfs_disk_key));
1349                root_item->drop_level = 0;
1350                /*
1351                 * abuse rtransid, it is safe because it is impossible to
1352                 * receive data into a relocation tree.
1353                 */
1354                btrfs_set_root_rtransid(root_item, last_snap);
1355                btrfs_set_root_otransid(root_item, trans->transid);
1356        }
1357
1358        btrfs_tree_unlock(eb);
1359        free_extent_buffer(eb);
1360
1361        ret = btrfs_insert_root(trans, root->fs_info->tree_root,
1362                                &root_key, root_item);
1363        BUG_ON(ret);
1364        kfree(root_item);
1365
1366        reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
1367        BUG_ON(IS_ERR(reloc_root));
1368        reloc_root->last_trans = trans->transid;
1369        return reloc_root;
1370}
1371
1372/*
1373 * create a reloc tree for a given fs tree. the reloc tree is just a
1374 * snapshot of the fs tree with a special root objectid.
1375 */
1376int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1377                          struct btrfs_root *root)
1378{
1379        struct btrfs_root *reloc_root;
1380        struct reloc_control *rc = root->fs_info->reloc_ctl;
1381        int clear_rsv = 0;
1382        int ret;
1383
1384        if (root->reloc_root) {
1385                reloc_root = root->reloc_root;
1386                reloc_root->last_trans = trans->transid;
1387                return 0;
1388        }
1389
1390        if (!rc || !rc->create_reloc_tree ||
1391            root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1392                return 0;
1393
1394        if (!trans->block_rsv) {
1395                trans->block_rsv = rc->block_rsv;
1396                clear_rsv = 1;
1397        }
1398        reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1399        if (clear_rsv)
1400                trans->block_rsv = NULL;
1401
1402        ret = __add_reloc_root(reloc_root);
1403        BUG_ON(ret < 0);
1404        root->reloc_root = reloc_root;
1405        return 0;
1406}
1407
1408/*
1409 * update root item of reloc tree
1410 */
1411int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1412                            struct btrfs_root *root)
1413{
1414        struct btrfs_root *reloc_root;
1415        struct btrfs_root_item *root_item;
1416        int del = 0;
1417        int ret;
1418
1419        if (!root->reloc_root)
1420                goto out;
1421
1422        reloc_root = root->reloc_root;
1423        root_item = &reloc_root->root_item;
1424
1425        if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1426            btrfs_root_refs(root_item) == 0) {
1427                root->reloc_root = NULL;
1428                del = 1;
1429        }
1430
1431        __update_reloc_root(reloc_root, del);
1432
1433        if (reloc_root->commit_root != reloc_root->node) {
1434                btrfs_set_root_node(root_item, reloc_root->node);
1435                free_extent_buffer(reloc_root->commit_root);
1436                reloc_root->commit_root = btrfs_root_node(reloc_root);
1437        }
1438
1439        ret = btrfs_update_root(trans, root->fs_info->tree_root,
1440                                &reloc_root->root_key, root_item);
1441        BUG_ON(ret);
1442
1443out:
1444        return 0;
1445}
1446
1447/*
1448 * helper to find first cached inode with inode number >= objectid
1449 * in a subvolume
1450 */
1451static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1452{
1453        struct rb_node *node;
1454        struct rb_node *prev;
1455        struct btrfs_inode *entry;
1456        struct inode *inode;
1457
1458        spin_lock(&root->inode_lock);
1459again:
1460        node = root->inode_tree.rb_node;
1461        prev = NULL;
1462        while (node) {
1463                prev = node;
1464                entry = rb_entry(node, struct btrfs_inode, rb_node);
1465
1466                if (objectid < btrfs_ino(&entry->vfs_inode))
1467                        node = node->rb_left;
1468                else if (objectid > btrfs_ino(&entry->vfs_inode))
1469                        node = node->rb_right;
1470                else
1471                        break;
1472        }
1473        if (!node) {
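                    /*
                     * no exact match; scan forward from where the search
                     * stopped for the first inode with a number >= objectid.
                     */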
1474                while (prev) {
1475                        entry = rb_entry(prev, struct btrfs_inode, rb_node);
1476                        if (objectid <= btrfs_ino(&entry->vfs_inode)) {
1477                                node = prev;
1478                                break;
1479                        }
1480                        prev = rb_next(prev);
1481                }
1482        }
1483        while (node) {
1484                entry = rb_entry(node, struct btrfs_inode, rb_node);
1485                inode = igrab(&entry->vfs_inode);
1486                if (inode) {
1487                        spin_unlock(&root->inode_lock);
1488                        return inode;
1489                }
1490
1491                objectid = btrfs_ino(&entry->vfs_inode) + 1;
1492                if (cond_resched_lock(&root->inode_lock))
1493                        goto again;
1494
1495                node = rb_next(node);
1496        }
1497        spin_unlock(&root->inode_lock);
1498        return NULL;
1499}
1500
1501static int in_block_group(u64 bytenr,
1502                          struct btrfs_block_group_cache *block_group)
1503{
1504        if (bytenr >= block_group->key.objectid &&
1505            bytenr < block_group->key.objectid + block_group->key.offset)
1506                return 1;
1507        return 0;
1508}
1509
1510/*
1511 * get new location of data
1512 */
1513static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1514                            u64 bytenr, u64 num_bytes)
1515{
1516        struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1517        struct btrfs_path *path;
1518        struct btrfs_file_extent_item *fi;
1519        struct extent_buffer *leaf;
1520        int ret;
1521
1522        path = btrfs_alloc_path();
1523        if (!path)
1524                return -ENOMEM;
1525
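            /*
             * the relocation data inode stores the copied extents at file
             * offset (disk bytenr - block group start), and index_cnt holds
             * the start of the block group being relocated, so this maps the
             * old disk bytenr to the file offset of its new copy.
             */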
1526        bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1527        ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode),
1528                                       bytenr, 0);
1529        if (ret < 0)
1530                goto out;
1531        if (ret > 0) {
1532                ret = -ENOENT;
1533                goto out;
1534        }
1535
1536        leaf = path->nodes[0];
1537        fi = btrfs_item_ptr(leaf, path->slots[0],
1538                            struct btrfs_file_extent_item);
1539
1540        BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1541               btrfs_file_extent_compression(leaf, fi) ||
1542               btrfs_file_extent_encryption(leaf, fi) ||
1543               btrfs_file_extent_other_encoding(leaf, fi));
1544
1545        if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1546                ret = 1;
1547                goto out;
1548        }
1549
1550        *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1551        ret = 0;
1552out:
1553        btrfs_free_path(path);
1554        return ret;
1555}
1556
1557/*
1558 * update file extent items in the tree leaf to point to
1559 * the new locations.
1560 */
1561static noinline_for_stack
1562int replace_file_extents(struct btrfs_trans_handle *trans,
1563                         struct reloc_control *rc,
1564                         struct btrfs_root *root,
1565                         struct extent_buffer *leaf)
1566{
1567        struct btrfs_key key;
1568        struct btrfs_file_extent_item *fi;
1569        struct inode *inode = NULL;
1570        u64 parent;
1571        u64 bytenr;
1572        u64 new_bytenr = 0;
1573        u64 num_bytes;
1574        u64 end;
1575        u32 nritems;
1576        u32 i;
1577        int ret;
1578        int first = 1;
1579        int dirty = 0;
1580
1581        if (rc->stage != UPDATE_DATA_PTRS)
1582                return 0;
1583
1584        /* reloc trees always use full backref */
1585        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1586                parent = leaf->start;
1587        else
1588                parent = 0;
1589
1590        nritems = btrfs_header_nritems(leaf);
1591        for (i = 0; i < nritems; i++) {
1592                cond_resched();
1593                btrfs_item_key_to_cpu(leaf, &key, i);
1594                if (key.type != BTRFS_EXTENT_DATA_KEY)
1595                        continue;
1596                fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1597                if (btrfs_file_extent_type(leaf, fi) ==
1598                    BTRFS_FILE_EXTENT_INLINE)
1599                        continue;
1600                bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1601                num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1602                if (bytenr == 0)
1603                        continue;
1604                if (!in_block_group(bytenr, rc->block_group))
1605                        continue;
1606
1607                /*
1608                 * if we are modifying a block in the fs tree, wait for readpage
1609                 * to complete and drop the extent cache
1610                 */
1611                if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1612                        if (first) {
1613                                inode = find_next_inode(root, key.objectid);
1614                                first = 0;
1615                        } else if (inode && btrfs_ino(inode) < key.objectid) {
1616                                btrfs_add_delayed_iput(inode);
1617                                inode = find_next_inode(root, key.objectid);
1618                        }
1619                        if (inode && btrfs_ino(inode) == key.objectid) {
1620                                end = key.offset +
1621                                      btrfs_file_extent_num_bytes(leaf, fi);
1622                                WARN_ON(!IS_ALIGNED(key.offset,
1623                                                    root->sectorsize));
1624                                WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1625                                end--;
1626                                ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1627                                                      key.offset, end);
1628                                if (!ret)
1629                                        continue;
1630
1631                                btrfs_drop_extent_cache(inode, key.offset, end,
1632                                                        1);
1633                                unlock_extent(&BTRFS_I(inode)->io_tree,
1634                                              key.offset, end);
1635                        }
1636                }
1637
1638                ret = get_new_location(rc->data_inode, &new_bytenr,
1639                                       bytenr, num_bytes);
1640                if (ret > 0) {
1641                        WARN_ON(1);
1642                        continue;
1643                }
1644                BUG_ON(ret < 0);
1645
1646                btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1647                dirty = 1;
1648
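                    /*
                     * rewind key.offset to the file offset that corresponds
                     * to the start of the extent (the offset recorded in the
                     * data backref), then add a backref for the new location
                     * and drop the one for the old location.
                     */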
1649                key.offset -= btrfs_file_extent_offset(leaf, fi);
1650                ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1651                                           num_bytes, parent,
1652                                           btrfs_header_owner(leaf),
1653                                           key.objectid, key.offset, 1);
1654                BUG_ON(ret);
1655
1656                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1657                                        parent, btrfs_header_owner(leaf),
1658                                        key.objectid, key.offset, 1);
1659                BUG_ON(ret);
1660        }
1661        if (dirty)
1662                btrfs_mark_buffer_dirty(leaf);
1663        if (inode)
1664                btrfs_add_delayed_iput(inode);
1665        return 0;
1666}
1667
1668static noinline_for_stack
1669int memcmp_node_keys(struct extent_buffer *eb, int slot,
1670                     struct btrfs_path *path, int level)
1671{
1672        struct btrfs_disk_key key1;
1673        struct btrfs_disk_key key2;
1674        btrfs_node_key(eb, &key1, slot);
1675        btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1676        return memcmp(&key1, &key2, sizeof(key1));
1677}
1678
1679/*
1680 * try to replace tree blocks in fs tree with the new blocks
1681 * in reloc tree. tree blocks that haven't been modified since
1682 * the reloc tree was created can be replaced.
1683 *
1684 * if a block was replaced, level of the block + 1 is returned.
1685 * if no block got replaced, 0 is returned. if there are other
1686 * errors, a negative error number is returned.
1687 */
1688static noinline_for_stack
1689int replace_path(struct btrfs_trans_handle *trans,
1690                 struct btrfs_root *dest, struct btrfs_root *src,
1691                 struct btrfs_path *path, struct btrfs_key *next_key,
1692                 int lowest_level, int max_level)
1693{
1694        struct extent_buffer *eb;
1695        struct extent_buffer *parent;
1696        struct btrfs_key key;
1697        u64 old_bytenr;
1698        u64 new_bytenr;
1699        u64 old_ptr_gen;
1700        u64 new_ptr_gen;
1701        u64 last_snapshot;
1702        u32 blocksize;
1703        int cow = 0;
1704        int level;
1705        int ret;
1706        int slot;
1707
1708        BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1709        BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1710
1711        last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1712again:
1713        slot = path->slots[lowest_level];
1714        btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1715
1716        eb = btrfs_lock_root_node(dest);
1717        btrfs_set_lock_blocking(eb);
1718        level = btrfs_header_level(eb);
1719
1720        if (level < lowest_level) {
1721                btrfs_tree_unlock(eb);
1722                free_extent_buffer(eb);
1723                return 0;
1724        }
1725
1726        if (cow) {
1727                ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1728                BUG_ON(ret);
1729        }
1730        btrfs_set_lock_blocking(eb);
1731
1732        if (next_key) {
1733                next_key->objectid = (u64)-1;
1734                next_key->type = (u8)-1;
1735                next_key->offset = (u64)-1;
1736        }
1737
1738        parent = eb;
1739        while (1) {
1740                level = btrfs_header_level(parent);
1741                BUG_ON(level < lowest_level);
1742
1743                ret = btrfs_bin_search(parent, &key, level, &slot);
1744                if (ret && slot > 0)
1745                        slot--;
1746
1747                if (next_key && slot + 1 < btrfs_header_nritems(parent))
1748                        btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1749
1750                old_bytenr = btrfs_node_blockptr(parent, slot);
1751                blocksize = btrfs_level_size(dest, level - 1);
1752                old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1753
1754                if (level <= max_level) {
1755                        eb = path->nodes[level];
1756                        new_bytenr = btrfs_node_blockptr(eb,
1757                                                        path->slots[level]);
1758                        new_ptr_gen = btrfs_node_ptr_generation(eb,
1759                                                        path->slots[level]);
1760                } else {
1761                        new_bytenr = 0;
1762                        new_ptr_gen = 0;
1763                }
1764
1765                if (new_bytenr > 0 && new_bytenr == old_bytenr) {
1766                        WARN_ON(1);
1767                        ret = level;
1768                        break;
1769                }
1770
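                    /*
                     * the fs tree block cannot be swapped if the reloc tree
                     * has no counterpart at this level, if it was modified
                     * after the reloc tree was created (old_ptr_gen >
                     * last_snapshot), or if the keys no longer match.
                     * descend one level and try again.
                     */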
1771                if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1772                    memcmp_node_keys(parent, slot, path, level)) {
1773                        if (level <= lowest_level) {
1774                                ret = 0;
1775                                break;
1776                        }
1777
1778                        eb = read_tree_block(dest, old_bytenr, blocksize,
1779                                             old_ptr_gen);
1780                        if (!eb || !extent_buffer_uptodate(eb)) {
1781                                ret = (!eb) ? -ENOMEM : -EIO;
1782                                free_extent_buffer(eb);
1783                                break;
1784                        }
1785                        btrfs_tree_lock(eb);
1786                        if (cow) {
1787                                ret = btrfs_cow_block(trans, dest, eb, parent,
1788                                                      slot, &eb);
1789                                BUG_ON(ret);
1790                        }
1791                        btrfs_set_lock_blocking(eb);
1792
1793                        btrfs_tree_unlock(parent);
1794                        free_extent_buffer(parent);
1795
1796                        parent = eb;
1797                        continue;
1798                }
1799
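                    /*
                     * a swappable block was found.  the first pass runs with
                     * cow == 0 just to locate it; restart with cow enabled so
                     * the path down to it is COWed before the pointers are
                     * rewritten.
                     */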
1800                if (!cow) {
1801                        btrfs_tree_unlock(parent);
1802                        free_extent_buffer(parent);
1803                        cow = 1;
1804                        goto again;
1805                }
1806
1807                btrfs_node_key_to_cpu(path->nodes[level], &key,
1808                                      path->slots[level]);
1809                btrfs_release_path(path);
1810
1811                path->lowest_level = level;
1812                ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1813                path->lowest_level = 0;
1814                BUG_ON(ret);
1815
1816                /*
1817                 * swap blocks in fs tree and reloc tree.
1818                 */
1819                btrfs_set_node_blockptr(parent, slot, new_bytenr);
1820                btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1821                btrfs_mark_buffer_dirty(parent);
1822
1823                btrfs_set_node_blockptr(path->nodes[level],
1824                                        path->slots[level], old_bytenr);
1825                btrfs_set_node_ptr_generation(path->nodes[level],
1826                                              path->slots[level], old_ptr_gen);
1827                btrfs_mark_buffer_dirty(path->nodes[level]);
1828
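                    /*
                     * after the swap the fs tree references the new block and
                     * the reloc tree references the old block.  add those
                     * references and drop the two old ones.
                     */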
1829                ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1830                                        path->nodes[level]->start,
1831                                        src->root_key.objectid, level - 1, 0,
1832                                        1);
1833                BUG_ON(ret);
1834                ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1835                                        0, dest->root_key.objectid, level - 1,
1836                                        0, 1);
1837                BUG_ON(ret);
1838
1839                ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1840                                        path->nodes[level]->start,
1841                                        src->root_key.objectid, level - 1, 0,
1842                                        1);
1843                BUG_ON(ret);
1844
1845                ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1846                                        0, dest->root_key.objectid, level - 1,
1847                                        0, 1);
1848                BUG_ON(ret);
1849
1850                btrfs_unlock_up_safe(path, 0);
1851
1852                ret = level;
1853                break;
1854        }
1855        btrfs_tree_unlock(parent);
1856        free_extent_buffer(parent);
1857        return ret;
1858}
1859
1860/*
1861 * helper to find next relocated block in reloc tree
1862 */
1863static noinline_for_stack
1864int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1865                       int *level)
1866{
1867        struct extent_buffer *eb;
1868        int i;
1869        u64 last_snapshot;
1870        u32 nritems;
1871
1872        last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1873
1874        for (i = 0; i < *level; i++) {
1875                free_extent_buffer(path->nodes[i]);
1876                path->nodes[i] = NULL;
1877        }
1878
1879        for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1880                eb = path->nodes[i];
1881                nritems = btrfs_header_nritems(eb);
1882                while (path->slots[i] + 1 < nritems) {
1883                        path->slots[i]++;
1884                        if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1885                            last_snapshot)
1886                                continue;
1887
1888                        *level = i;
1889                        return 0;
1890                }
1891                free_extent_buffer(path->nodes[i]);
1892                path->nodes[i] = NULL;
1893        }
1894        return 1;
1895}
1896
1897/*
1898 * walk down reloc tree to find relocated block of lowest level
1899 */
1900static noinline_for_stack
1901int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1902                         int *level)
1903{
1904        struct extent_buffer *eb = NULL;
1905        int i;
1906        u64 bytenr;
1907        u64 ptr_gen = 0;
1908        u64 last_snapshot;
1909        u32 blocksize;
1910        u32 nritems;
1911
1912        last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1913
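            /*
             * only blocks COWed in the reloc tree after the last snapshot
             * (ptr generation > last_snapshot) were relocated; everything
             * else is still shared with the fs tree and can be skipped.
             */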
1914        for (i = *level; i > 0; i--) {
1915                eb = path->nodes[i];
1916                nritems = btrfs_header_nritems(eb);
1917                while (path->slots[i] < nritems) {
1918                        ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1919                        if (ptr_gen > last_snapshot)
1920                                break;
1921                        path->slots[i]++;
1922                }
1923                if (path->slots[i] >= nritems) {
1924                        if (i == *level)
1925                                break;
1926                        *level = i + 1;
1927                        return 0;
1928                }
1929                if (i == 1) {
1930                        *level = i;
1931                        return 0;
1932                }
1933
1934                bytenr = btrfs_node_blockptr(eb, path->slots[i]);
1935                blocksize = btrfs_level_size(root, i - 1);
1936                eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
1937                if (!eb || !extent_buffer_uptodate(eb)) {
1938                        free_extent_buffer(eb);
1939                        return -EIO;
1940                }
1941                BUG_ON(btrfs_header_level(eb) != i - 1);
1942                path->nodes[i - 1] = eb;
1943                path->slots[i - 1] = 0;
1944        }
1945        return 1;
1946}
1947
1948/*
1949 * invalidate extent cache for file extents whose key is in the range
1950 * [min_key, max_key)
1951 */
1952static int invalidate_extent_cache(struct btrfs_root *root,
1953                                   struct btrfs_key *min_key,
1954                                   struct btrfs_key *max_key)
1955{
1956        struct inode *inode = NULL;
1957        u64 objectid;
1958        u64 start, end;
1959        u64 ino;
1960
1961        objectid = min_key->objectid;
1962        while (1) {
1963                cond_resched();
1964                iput(inode);
1965
1966                if (objectid > max_key->objectid)
1967                        break;
1968
1969                inode = find_next_inode(root, objectid);
1970                if (!inode)
1971                        break;
1972                ino = btrfs_ino(inode);
1973
1974                if (ino > max_key->objectid) {
1975                        iput(inode);
1976                        break;
1977                }
1978
1979                objectid = ino + 1;
1980                if (!S_ISREG(inode->i_mode))
1981                        continue;
1982
1983                if (unlikely(min_key->objectid == ino)) {
1984                        if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1985                                continue;
1986                        if (min_key->type < BTRFS_EXTENT_DATA_KEY)
1987                                start = 0;
1988                        else {
1989                                start = min_key->offset;
1990                                WARN_ON(!IS_ALIGNED(start, root->sectorsize));
1991                        }
1992                } else {
1993                        start = 0;
1994                }
1995
1996                if (unlikely(max_key->objectid == ino)) {
1997                        if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1998                                continue;
1999                        if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
2000                                end = (u64)-1;
2001                        } else {
2002                                if (max_key->offset == 0)
2003                                        continue;
2004                                end = max_key->offset;
2005                                WARN_ON(!IS_ALIGNED(end, root->sectorsize));
2006                                end--;
2007                        }
2008                } else {
2009                        end = (u64)-1;
2010                }
2011
2012                /* the lock_extent waits for readpage to complete */
2013                lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2014                btrfs_drop_extent_cache(inode, start, end, 1);
2015                unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2016        }
2017        return 0;
2018}
2019
2020static int find_next_key(struct btrfs_path *path, int level,
2021                         struct btrfs_key *key)
2022
2023{
2024        while (level < BTRFS_MAX_LEVEL) {
2025                if (!path->nodes[level])
2026                        break;
2027                if (path->slots[level] + 1 <
2028                    btrfs_header_nritems(path->nodes[level])) {
2029                        btrfs_node_key_to_cpu(path->nodes[level], key,
2030                                              path->slots[level] + 1);
2031                        return 0;
2032                }
2033                level++;
2034        }
2035        return 1;
2036}
2037
2038/*
2039 * merge the relocated tree blocks in reloc tree with corresponding
2040 * fs tree.
2041 */
2042static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2043                                               struct btrfs_root *root)
2044{
2045        LIST_HEAD(inode_list);
2046        struct btrfs_key key;
2047        struct btrfs_key next_key;
2048        struct btrfs_trans_handle *trans;
2049        struct btrfs_root *reloc_root;
2050        struct btrfs_root_item *root_item;
2051        struct btrfs_path *path;
2052        struct extent_buffer *leaf;
2053        int level;
2054        int max_level;
2055        int replaced = 0;
2056        int ret;
2057        int err = 0;
2058        u32 min_reserved;
2059
2060        path = btrfs_alloc_path();
2061        if (!path)
2062                return -ENOMEM;
2063        path->reada = 1;
2064
2065        reloc_root = root->reloc_root;
2066        root_item = &reloc_root->root_item;
2067
2068        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2069                level = btrfs_root_level(root_item);
2070                extent_buffer_get(reloc_root->node);
2071                path->nodes[level] = reloc_root->node;
2072                path->slots[level] = 0;
2073        } else {
2074                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2075
2076                level = root_item->drop_level;
2077                BUG_ON(level == 0);
2078                path->lowest_level = level;
2079                ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
2080                path->lowest_level = 0;
2081                if (ret < 0) {
2082                        btrfs_free_path(path);
2083                        return ret;
2084                }
2085
2086                btrfs_node_key_to_cpu(path->nodes[level], &next_key,
2087                                      path->slots[level]);
2088                WARN_ON(memcmp(&key, &next_key, sizeof(key)));
2089
2090                btrfs_unlock_up_safe(path, 0);
2091        }
2092
2093        min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2094        memset(&next_key, 0, sizeof(next_key));
2095
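            /*
             * each iteration starts a transaction, walks down the reloc tree
             * to the lowest relocated block, tries to swap the corresponding
             * fs tree block via replace_path(), walks back up and records the
             * position in drop_progress so merging can be resumed later.
             */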
2096        while (1) {
2097                trans = btrfs_start_transaction(root, 0);
2098                BUG_ON(IS_ERR(trans));
2099                trans->block_rsv = rc->block_rsv;
2100
2101                ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2102                                             BTRFS_RESERVE_FLUSH_ALL);
2103                if (ret) {
2104                        BUG_ON(ret != -EAGAIN);
2105                        ret = btrfs_commit_transaction(trans, root);
2106                        BUG_ON(ret);
2107                        continue;
2108                }
2109
2110                replaced = 0;
2111                max_level = level;
2112
2113                ret = walk_down_reloc_tree(reloc_root, path, &level);
2114                if (ret < 0) {
2115                        err = ret;
2116                        goto out;
2117                }
2118                if (ret > 0)
2119                        break;
2120
2121                if (!find_next_key(path, level, &key) &&
2122                    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
2123                        ret = 0;
2124                } else {
2125                        ret = replace_path(trans, root, reloc_root, path,
2126                                           &next_key, level, max_level);
2127                }
2128                if (ret < 0) {
2129                        err = ret;
2130                        goto out;
2131                }
2132
2133                if (ret > 0) {
2134                        level = ret;
2135                        btrfs_node_key_to_cpu(path->nodes[level], &key,
2136                                              path->slots[level]);
2137                        replaced = 1;
2138                }
2139
2140                ret = walk_up_reloc_tree(reloc_root, path, &level);
2141                if (ret > 0)
2142                        break;
2143
2144                BUG_ON(level == 0);
2145                /*
2146                 * save the merging progress in the drop_progress.
2147                 * this is OK since root refs == 1 in this case.
2148                 */
2149                btrfs_node_key(path->nodes[level], &root_item->drop_progress,
2150                               path->slots[level]);
2151                root_item->drop_level = level;
2152
2153                btrfs_end_transaction_throttle(trans, root);
2154
2155                btrfs_btree_balance_dirty(root);
2156
2157                if (replaced && rc->stage == UPDATE_DATA_PTRS)
2158                        invalidate_extent_cache(root, &key, &next_key);
2159        }
2160
2161        /*
2162         * handle the case where only one block in the fs tree needs to
2163         * be relocated and the block is the tree root.
2164         */
2165        leaf = btrfs_lock_root_node(root);
2166        ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
2167        btrfs_tree_unlock(leaf);
2168        free_extent_buffer(leaf);
2169        if (ret < 0)
2170                err = ret;
2171out:
2172        btrfs_free_path(path);
2173
2174        if (err == 0) {
2175                memset(&root_item->drop_progress, 0,
2176                       sizeof(root_item->drop_progress));
2177                root_item->drop_level = 0;
2178                btrfs_set_root_refs(root_item, 0);
2179                btrfs_update_reloc_root(trans, root);
2180        }
2181
2182        btrfs_end_transaction_throttle(trans, root);
2183
2184        btrfs_btree_balance_dirty(root);
2185
2186        if (replaced && rc->stage == UPDATE_DATA_PTRS)
2187                invalidate_extent_cache(root, &key, &next_key);
2188
2189        return err;
2190}
2191
2192static noinline_for_stack
2193int prepare_to_merge(struct reloc_control *rc, int err)
2194{
2195        struct btrfs_root *root = rc->extent_root;
2196        struct btrfs_root *reloc_root;
2197        struct btrfs_trans_handle *trans;
2198        LIST_HEAD(reloc_roots);
2199        u64 num_bytes = 0;
2200        int ret;
2201
2202        mutex_lock(&root->fs_info->reloc_mutex);
2203        rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2204        rc->merging_rsv_size += rc->nodes_relocated * 2;
2205        mutex_unlock(&root->fs_info->reloc_mutex);
2206
2207again:
2208        if (!err) {
2209                num_bytes = rc->merging_rsv_size;
2210                ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2211                                          BTRFS_RESERVE_FLUSH_ALL);
2212                if (ret)
2213                        err = ret;
2214        }
2215
2216        trans = btrfs_join_transaction(rc->extent_root);
2217        if (IS_ERR(trans)) {
2218                if (!err)
2219                        btrfs_block_rsv_release(rc->extent_root,
2220                                                rc->block_rsv, num_bytes);
2221                return PTR_ERR(trans);
2222        }
2223
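            /*
             * merging_rsv_size may have grown while the reservation was being
             * made; if so, release what was reserved and retry with the
             * larger amount.
             */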
2224        if (!err) {
2225                if (num_bytes != rc->merging_rsv_size) {
2226                        btrfs_end_transaction(trans, rc->extent_root);
2227                        btrfs_block_rsv_release(rc->extent_root,
2228                                                rc->block_rsv, num_bytes);
2229                        goto again;
2230                }
2231        }
2232
2233        rc->merge_reloc_tree = 1;
2234
2235        while (!list_empty(&rc->reloc_roots)) {
2236                reloc_root = list_entry(rc->reloc_roots.next,
2237                                        struct btrfs_root, root_list);
2238                list_del_init(&reloc_root->root_list);
2239
2240                root = read_fs_root(reloc_root->fs_info,
2241                                    reloc_root->root_key.offset);
2242                BUG_ON(IS_ERR(root));
2243                BUG_ON(root->reloc_root != reloc_root);
2244
2245                /*
2246                 * set reference count to 1, so btrfs_recover_relocation
2247                 * knows it should resume merging
2248                 */
2249                if (!err)
2250                        btrfs_set_root_refs(&reloc_root->root_item, 1);
2251                btrfs_update_reloc_root(trans, root);
2252
2253                list_add(&reloc_root->root_list, &reloc_roots);
2254        }
2255
2256        list_splice(&reloc_roots, &rc->reloc_roots);
2257
2258        if (!err)
2259                btrfs_commit_transaction(trans, rc->extent_root);
2260        else
2261                btrfs_end_transaction(trans, rc->extent_root);
2262        return err;
2263}
2264
2265static noinline_for_stack
2266void free_reloc_roots(struct list_head *list)
2267{
2268        struct btrfs_root *reloc_root;
2269
2270        while (!list_empty(list)) {
2271                reloc_root = list_entry(list->next, struct btrfs_root,
2272                                        root_list);
2273                __update_reloc_root(reloc_root, 1);
2274                free_extent_buffer(reloc_root->node);
2275                free_extent_buffer(reloc_root->commit_root);
2276                kfree(reloc_root);
2277        }
2278}
2279
2280static noinline_for_stack
2281int merge_reloc_roots(struct reloc_control *rc)
2282{
2283        struct btrfs_trans_handle *trans;
2284        struct btrfs_root *root;
2285        struct btrfs_root *reloc_root;
2286        u64 last_snap;
2287        u64 otransid;
2288        u64 objectid;
2289        LIST_HEAD(reloc_roots);
2290        int found = 0;
2291        int ret = 0;
2292again:
2293        root = rc->extent_root;
2294
2295        /*
2296         * this serializes us with btrfs_record_root_in_transaction;
2297         * we have to make sure nobody is in the middle of
2298         * adding their roots to the list while we are
2299         * doing this splice
2300         */
2301        mutex_lock(&root->fs_info->reloc_mutex);
2302        list_splice_init(&rc->reloc_roots, &reloc_roots);
2303        mutex_unlock(&root->fs_info->reloc_mutex);
2304
2305        while (!list_empty(&reloc_roots)) {
2306                found = 1;
2307                reloc_root = list_entry(reloc_roots.next,
2308                                        struct btrfs_root, root_list);
2309
2310                if (btrfs_root_refs(&reloc_root->root_item) > 0) {
2311                        root = read_fs_root(reloc_root->fs_info,
2312                                            reloc_root->root_key.offset);
2313                        BUG_ON(IS_ERR(root));
2314                        BUG_ON(root->reloc_root != reloc_root);
2315
2316                        ret = merge_reloc_root(rc, root);
2317                        if (ret)
2318                                goto out;
2319                } else {
2320                        list_del_init(&reloc_root->root_list);
2321                }
2322
2323                /*
2324                 * we keep the old last snapshot transid in rtransid when we
2325                 * created the relocation tree.
2326                 */
2327                last_snap = btrfs_root_rtransid(&reloc_root->root_item);
2328                otransid = btrfs_root_otransid(&reloc_root->root_item);
2329                objectid = reloc_root->root_key.offset;
2330
2331                ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2332                if (ret < 0) {
2333                        if (list_empty(&reloc_root->root_list))
2334                                list_add_tail(&reloc_root->root_list,
2335                                              &reloc_roots);
2336                        goto out;
2337                } else if (!ret) {
2338                        /*
2339                         * recover the last snapshot transid to avoid
2340                         * the space balance breaking NOCOW.
2341                         */
2342                        root = read_fs_root(rc->extent_root->fs_info,
2343                                            objectid);
2344                        if (IS_ERR(root))
2345                                continue;
2346
2347                        if (btrfs_root_refs(&root->root_item) == 0)
2348                                continue;
2349
2350                        trans = btrfs_join_transaction(root);
2351                        BUG_ON(IS_ERR(trans));
2352
2353                        /* Check if the fs/file tree was snapshotted or not. */
2354                        if (btrfs_root_last_snapshot(&root->root_item) ==
2355                            otransid - 1)
2356                                btrfs_set_root_last_snapshot(&root->root_item,
2357                                                             last_snap);
2358
2359                        btrfs_end_transaction(trans, root);
2360                }
2361        }
2362
2363        if (found) {
2364                found = 0;
2365                goto again;
2366        }
2367out:
2368        if (ret) {
2369                btrfs_std_error(root->fs_info, ret);
2370                if (!list_empty(&reloc_roots))
2371                        free_reloc_roots(&reloc_roots);
2372        }
2373
2374        BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2375        return ret;
2376}
2377
2378static void free_block_list(struct rb_root *blocks)
2379{
2380        struct tree_block *block;
2381        struct rb_node *rb_node;
2382        while ((rb_node = rb_first(blocks))) {
2383                block = rb_entry(rb_node, struct tree_block, rb_node);
2384                rb_erase(rb_node, blocks);
2385                kfree(block);
2386        }
2387}
2388
2389static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2390                                      struct btrfs_root *reloc_root)
2391{
2392        struct btrfs_root *root;
2393
2394        if (reloc_root->last_trans == trans->transid)
2395                return 0;
2396
2397        root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
2398        BUG_ON(IS_ERR(root));
2399        BUG_ON(root->reloc_root != reloc_root);
2400
2401        return btrfs_record_root_in_trans(trans, root);
2402}
2403
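    /*
     * walk up the backref edges from 'node' until a tree root is found,
     * make sure that root and its reloc tree are recorded in the current
     * transaction, and return the reloc root.  the chain of nodes visited
     * is saved in rc->backref_cache.path for btrfs_reloc_cow_block.
     */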
2404static noinline_for_stack
2405struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2406                                     struct reloc_control *rc,
2407                                     struct backref_node *node,
2408                                     struct backref_edge *edges[], int *nr)
2409{
2410        struct backref_node *next;
2411        struct btrfs_root *root;
2412        int index = 0;
2413
2414        next = node;
2415        while (1) {
2416                cond_resched();
2417                next = walk_up_backref(next, edges, &index);
2418                root = next->root;
2419                BUG_ON(!root);
2420                BUG_ON(!root->ref_cows);
2421
2422                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2423                        record_reloc_root_in_trans(trans, root);
2424                        break;
2425                }
2426
2427                btrfs_record_root_in_trans(trans, root);
2428                root = root->reloc_root;
2429
2430                if (next->new_bytenr != root->node->start) {
2431                        BUG_ON(next->new_bytenr);
2432                        BUG_ON(!list_empty(&next->list));
2433                        next->new_bytenr = root->node->start;
2434                        next->root = root;
2435                        list_add_tail(&next->list,
2436                                      &rc->backref_cache.changed);
2437                        __mark_block_processed(rc, next);
2438                        break;
2439                }
2440
2441                WARN_ON(1);
2442                root = NULL;
2443                next = walk_down_backref(edges, &index);
2444                if (!next || next->level <= node->level)
2445                        break;
2446        }
2447        if (!root)
2448                return NULL;
2449
2450        *nr = index;
2451        next = node;
2452        /* setup backref node path for btrfs_reloc_cow_block */
2453        while (1) {
2454                rc->backref_cache.path[next->level] = next;
2455                if (--index < 0)
2456                        break;
2457                next = edges[index]->node[UPPER];
2458        }
2459        return root;
2460}
2461
2462/*
2463 * select a tree root for relocation. return NULL if the block
2464 * is reference counted. we should use do_relocation() in this
2465 * case. return a tree root pointer if the block isn't reference
2466 * counted. return -ENOENT if the block is the root of a reloc tree.
2467 */
2468static noinline_for_stack
2469struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
2470                                   struct backref_node *node)
2471{
2472        struct backref_node *next;
2473        struct btrfs_root *root;
2474        struct btrfs_root *fs_root = NULL;
2475        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2476        int index = 0;
2477
2478        next = node;
2479        while (1) {
2480                cond_resched();
2481                next = walk_up_backref(next, edges, &index);
2482                root = next->root;
2483                BUG_ON(!root);
2484
2485                /* no other choice for non-reference counted tree */
2486                if (!root->ref_cows)
2487                        return root;
2488
2489                if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2490                        fs_root = root;
2491
2492                if (next != node)
2493                        return NULL;
2494
2495                next = walk_down_backref(edges, &index);
2496                if (!next || next->level <= node->level)
2497                        break;
2498        }
2499
2500        if (!fs_root)
2501                return ERR_PTR(-ENOENT);
2502        return fs_root;
2503}
2504
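    /*
     * estimate how much metadata may need to be COWed when relocating
     * 'node': walk from the node up through all backref edges and add up
     * the sizes of the blocks that have not been processed yet.
     */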
2505static noinline_for_stack
2506u64 calcu_metadata_size(struct reloc_control *rc,
2507                        struct backref_node *node, int reserve)
2508{
2509        struct backref_node *next = node;
2510        struct backref_edge *edge;
2511        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2512        u64 num_bytes = 0;
2513        int index = 0;
2514
2515        BUG_ON(reserve && node->processed);
2516
2517        while (next) {
2518                cond_resched();
2519                while (1) {
2520                        if (next->processed && (reserve || next != node))
2521                                break;
2522
2523                        num_bytes += btrfs_level_size(rc->extent_root,
2524                                                      next->level);
2525
2526                        if (list_empty(&next->upper))
2527                                break;
2528
2529                        edge = list_entry(next->upper.next,
2530                                          struct backref_edge, list[LOWER]);
2531                        edges[index++] = edge;
2532                        next = edge->node[UPPER];
2533                }
2534                next = walk_down_backref(edges, &index);
2535        }
2536        return num_bytes;
2537}
2538
2539static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2540                                  struct reloc_control *rc,
2541                                  struct backref_node *node)
2542{
2543        struct btrfs_root *root = rc->extent_root;
2544        u64 num_bytes;
2545        int ret;
2546
2547        num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2548
2549        trans->block_rsv = rc->block_rsv;
2550        ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2551                                  BTRFS_RESERVE_FLUSH_ALL);
2552        if (ret) {
2553                if (ret == -EAGAIN)
2554                        rc->commit_transaction = 1;
2555                return ret;
2556        }
2557
2558        return 0;
2559}
2560
2561static void release_metadata_space(struct reloc_control *rc,
2562                                   struct backref_node *node)
2563{
2564        u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
2565        btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
2566}
2567
2568/*
2569 * relocate a tree block, and then update pointers in upper level
2570 * blocks that reference the block to point to the new location.
2571 *
2572 * if called by link_to_upper, the block has already been relocated.
2573 * in that case this function just updates pointers.
2574 */
2575static int do_relocation(struct btrfs_trans_handle *trans,
2576                         struct reloc_control *rc,
2577                         struct backref_node *node,
2578                         struct btrfs_key *key,
2579                         struct btrfs_path *path, int lowest)
2580{
2581        struct backref_node *upper;
2582        struct backref_edge *edge;
2583        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2584        struct btrfs_root *root;
2585        struct extent_buffer *eb;
2586        u32 blocksize;
2587        u64 bytenr;
2588        u64 generation;
2589        int nr;
2590        int slot;
2591        int ret;
2592        int err = 0;
2593
2594        BUG_ON(lowest && node->eb);
2595
2596        path->lowest_level = node->level + 1;
2597        rc->backref_cache.path[node->level] = node;
2598        list_for_each_entry(edge, &node->upper, list[LOWER]) {
2599                cond_resched();
2600
2601                upper = edge->node[UPPER];
2602                root = select_reloc_root(trans, rc, upper, edges, &nr);
2603                BUG_ON(!root);
2604
2605                if (upper->eb && !upper->locked) {
2606                        if (!lowest) {
2607                                ret = btrfs_bin_search(upper->eb, key,
2608                                                       upper->level, &slot);
2609                                BUG_ON(ret);
2610                                bytenr = btrfs_node_blockptr(upper->eb, slot);
2611                                if (node->eb->start == bytenr)
2612                                        goto next;
2613                        }
2614                        drop_node_buffer(upper);
2615                }
2616
2617                if (!upper->eb) {
2618                        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2619                        if (ret < 0) {
2620                                err = ret;
2621                                break;
2622                        }
2623                        BUG_ON(ret > 0);
2624
2625                        if (!upper->eb) {
2626                                upper->eb = path->nodes[upper->level];
2627                                path->nodes[upper->level] = NULL;
2628                        } else {
2629                                BUG_ON(upper->eb != path->nodes[upper->level]);
2630                        }
2631
2632                        upper->locked = 1;
2633                        path->locks[upper->level] = 0;
2634
2635                        slot = path->slots[upper->level];
2636                        btrfs_release_path(path);
2637                } else {
2638                        ret = btrfs_bin_search(upper->eb, key, upper->level,
2639                                               &slot);
2640                        BUG_ON(ret);
2641                }
2642
2643                bytenr = btrfs_node_blockptr(upper->eb, slot);
2644                if (lowest) {
2645                        BUG_ON(bytenr != node->bytenr);
2646                } else {
2647                        if (node->eb->start == bytenr)
2648                                goto next;
2649                }
2650
2651                blocksize = btrfs_level_size(root, node->level);
2652                generation = btrfs_node_ptr_generation(upper->eb, slot);
2653                eb = read_tree_block(root, bytenr, blocksize, generation);
2654                if (!eb || !extent_buffer_uptodate(eb)) {
2655                        free_extent_buffer(eb);
2656                        err = -EIO;
2657                        goto next;
2658                }
2659                btrfs_tree_lock(eb);
2660                btrfs_set_lock_blocking(eb);
2661
2662                if (!node->eb) {
2663                        ret = btrfs_cow_block(trans, root, eb, upper->eb,
2664                                              slot, &eb);
2665                        btrfs_tree_unlock(eb);
2666                        free_extent_buffer(eb);
2667                        if (ret < 0) {
2668                                err = ret;
2669                                goto next;
2670                        }
2671                        BUG_ON(node->eb != eb);
2672                } else {
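                            /*
                             * node->eb is set, so the block itself has already
                             * been relocated; just point the parent at the new
                             * copy, take a reference from the parent and drop
                             * the old subtree.
                             */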
2673                        btrfs_set_node_blockptr(upper->eb, slot,
2674                                                node->eb->start);
2675                        btrfs_set_node_ptr_generation(upper->eb, slot,
2676                                                      trans->transid);
2677                        btrfs_mark_buffer_dirty(upper->eb);
2678
2679                        ret = btrfs_inc_extent_ref(trans, root,
2680                                                node->eb->start, blocksize,
2681                                                upper->eb->start,
2682                                                btrfs_header_owner(upper->eb),
2683                                                node->level, 0, 1);
2684                        BUG_ON(ret);
2685
2686                        ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2687                        BUG_ON(ret);
2688                }
2689next:
2690                if (!upper->pending)
2691                        drop_node_buffer(upper);
2692                else
2693                        unlock_node_buffer(upper);
2694                if (err)
2695                        break;
2696        }
2697
2698        if (!err && node->pending) {
2699                drop_node_buffer(node);
2700                list_move_tail(&node->list, &rc->backref_cache.changed);
2701                node->pending = 0;
2702        }
2703
2704        path->lowest_level = 0;
2705        BUG_ON(err == -ENOSPC);
2706        return err;
2707}
2708
2709static int link_to_upper(struct btrfs_trans_handle *trans,
2710                         struct reloc_control *rc,
2711                         struct backref_node *node,
2712                         struct btrfs_path *path)
2713{
2714        struct btrfs_key key;
2715
2716        btrfs_node_key_to_cpu(node->eb, &key, 0);
2717        return do_relocation(trans, rc, node, &key, path, 0);
2718}
2719
2720static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2721                                struct reloc_control *rc,
2722                                struct btrfs_path *path, int err)
2723{
2724        LIST_HEAD(list);
2725        struct backref_cache *cache = &rc->backref_cache;
2726        struct backref_node *node;
2727        int level;
2728        int ret;
2729
2730        for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2731                while (!list_empty(&cache->pending[level])) {
2732                        node = list_entry(cache->pending[level].next,
2733                                          struct backref_node, list);
2734                        list_move_tail(&node->list, &list);
2735                        BUG_ON(!node->pending);
2736
2737                        if (!err) {
2738                                ret = link_to_upper(trans, rc, node, path);
2739                                if (ret < 0)
2740                                        err = ret;
2741                        }
2742                }
2743                list_splice_init(&list, &cache->pending[level]);
2744        }
2745        return err;
2746}
2747
2748static void mark_block_processed(struct reloc_control *rc,
2749                                 u64 bytenr, u32 blocksize)
2750{
2751        set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2752                        EXTENT_DIRTY, GFP_NOFS);
2753}
2754
2755static void __mark_block_processed(struct reloc_control *rc,
2756                                   struct backref_node *node)
2757{
2758        u32 blocksize;
2759        if (node->level == 0 ||
2760            in_block_group(node->bytenr, rc->block_group)) {
2761                blocksize = btrfs_level_size(rc->extent_root, node->level);
2762                mark_block_processed(rc, node->bytenr, blocksize);
2763        }
2764        node->processed = 1;
2765}
2766
2767/*
2768 * mark a block and all blocks that directly/indirectly reference the
2769 * block as processed.
2770 */
2771static void update_processed_blocks(struct reloc_control *rc,
2772                                    struct backref_node *node)
2773{
2774        struct backref_node *next = node;
2775        struct backref_edge *edge;
2776        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2777        int index = 0;
2778
2779        while (next) {
2780                cond_resched();
2781                while (1) {
2782                        if (next->processed)
2783                                break;
2784
2785                        __mark_block_processed(rc, next);
2786
2787                        if (list_empty(&next->upper))
2788                                break;
2789
2790                        edge = list_entry(next->upper.next,
2791                                          struct backref_edge, list[LOWER]);
2792                        edges[index++] = edge;
2793                        next = edge->node[UPPER];
2794                }
2795                next = walk_down_backref(edges, &index);
2796        }
2797}
2798
2799static int tree_block_processed(u64 bytenr, u32 blocksize,
2800                                struct reloc_control *rc)
2801{
2802        if (test_range_bit(&rc->processed_blocks, bytenr,
2803                           bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2804                return 1;
2805        return 0;
2806}
2807
2808static int get_tree_block_key(struct reloc_control *rc,
2809                              struct tree_block *block)
2810{
2811        struct extent_buffer *eb;
2812
2813        BUG_ON(block->key_ready);
2814        eb = read_tree_block(rc->extent_root, block->bytenr,
2815                             block->key.objectid, block->key.offset);
2816        if (!eb || !extent_buffer_uptodate(eb)) {
2817                free_extent_buffer(eb);
2818                return -EIO;
2819        }
2820        WARN_ON(btrfs_header_level(eb) != block->level);
2821        if (block->level == 0)
2822                btrfs_item_key_to_cpu(eb, &block->key, 0);
2823        else
2824                btrfs_node_key_to_cpu(eb, &block->key, 0);
2825        free_extent_buffer(eb);
2826        block->key_ready = 1;
2827        return 0;
2828}
2829
2830static int reada_tree_block(struct reloc_control *rc,
2831                            struct tree_block *block)
2832{
2833        BUG_ON(block->key_ready);
2834        if (block->key.type == BTRFS_METADATA_ITEM_KEY)
2835                readahead_tree_block(rc->extent_root, block->bytenr,
2836                                     block->key.objectid,
2837                                     rc->extent_root->leafsize);
2838        else
2839                readahead_tree_block(rc->extent_root, block->bytenr,
2840                                     block->key.objectid, block->key.offset);
2841        return 0;
2842}
2843
2844/*
2845 * helper function to relocate a tree block
2846 */
2847static int relocate_tree_block(struct btrfs_trans_handle *trans,
2848                                struct reloc_control *rc,
2849                                struct backref_node *node,
2850                                struct btrfs_key *key,
2851                                struct btrfs_path *path)
2852{
2853        struct btrfs_root *root;
2854        int release = 0;
2855        int ret = 0;
2856
2857        if (!node)
2858                return 0;
2859
2860        BUG_ON(node->processed);
2861        root = select_one_root(trans, node);
2862        if (root == ERR_PTR(-ENOENT)) {
2863                update_processed_blocks(rc, node);
2864                goto out;
2865        }
2866
2867        if (!root || root->ref_cows) {
2868                ret = reserve_metadata_space(trans, rc, node);
2869                if (ret)
2870                        goto out;
2871                release = 1;
2872        }
2873
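            /*
             * three cases from here: the block is the root of a reference
             * counted fs tree (recording the root in the transaction creates
             * or updates its reloc tree, which already holds the new copy),
             * the block belongs to a non reference counted tree (just COW it
             * in place via btrfs_search_slot), or no single root owns it and
             * do_relocation() walks the backref cache instead.
             */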
2874        if (root) {
2875                if (root->ref_cows) {
2876                        BUG_ON(node->new_bytenr);
2877                        BUG_ON(!list_empty(&node->list));
2878                        btrfs_record_root_in_trans(trans, root);
2879                        root = root->reloc_root;
2880                        node->new_bytenr = root->node->start;
2881                        node->root = root;
2882                        list_add_tail(&node->list, &rc->backref_cache.changed);
2883                } else {
2884                        path->lowest_level = node->level;
2885                        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2886                        btrfs_release_path(path);
2887                        if (ret > 0)
2888                                ret = 0;
2889                }
2890                if (!ret)
2891                        update_processed_blocks(rc, node);
2892        } else {
2893                ret = do_relocation(trans, rc, node, key, path, 1);
2894        }
2895out:
2896        if (ret || node->level == 0 || node->cowonly) {
2897                if (release)
2898                        release_metadata_space(rc, node);
2899                remove_backref_node(&rc->backref_cache, node);
2900        }
2901        return ret;
2902}
2903
2904/*
2905 * relocate a list of blocks
2906 */
2907static noinline_for_stack
2908int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2909                         struct reloc_control *rc, struct rb_root *blocks)
2910{
2911        struct backref_node *node;
2912        struct btrfs_path *path;
2913        struct tree_block *block;
2914        struct rb_node *rb_node;
2915        int ret;
2916        int err = 0;
2917
2918        path = btrfs_alloc_path();
2919        if (!path) {
2920                err = -ENOMEM;
2921                goto out_free_blocks;
2922        }
2923
2924        rb_node = rb_first(blocks);
2925        while (rb_node) {
2926                block = rb_entry(rb_node, struct tree_block, rb_node);
2927                if (!block->key_ready)
2928                        reada_tree_block(rc, block);
2929                rb_node = rb_next(rb_node);
2930        }
2931
2932        rb_node = rb_first(blocks);
2933        while (rb_node) {
2934                block = rb_entry(rb_node, struct tree_block, rb_node);
2935                if (!block->key_ready) {
2936                        err = get_tree_block_key(rc, block);
2937                        if (err)
2938                                goto out_free_path;
2939                }
2940                rb_node = rb_next(rb_node);
2941        }
2942
2943        rb_node = rb_first(blocks);
2944        while (rb_node) {
2945                block = rb_entry(rb_node, struct tree_block, rb_node);
2946
2947                node = build_backref_tree(rc, &block->key,
2948                                          block->level, block->bytenr);
2949                if (IS_ERR(node)) {
2950                        err = PTR_ERR(node);
2951                        goto out;
2952                }
2953
2954                ret = relocate_tree_block(trans, rc, node, &block->key,
2955                                          path);
2956                if (ret < 0) {
2957                        if (ret != -EAGAIN || rb_node == rb_first(blocks))
2958                                err = ret;
2959                        goto out;
2960                }
2961                rb_node = rb_next(rb_node);
2962        }
2963out:
2964        err = finish_pending_nodes(trans, rc, path, err);
2965
2966out_free_path:
2967        btrfs_free_path(path);
2968out_free_blocks:
2969        free_block_list(blocks);
2970        return err;
2971}
2972
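/*
 * preallocate space in the data relocation inode covering the whole
 * cluster.  data space is reserved up front and one preallocated
 * range is created per cluster boundary.
 */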
2973static noinline_for_stack
2974int prealloc_file_extent_cluster(struct inode *inode,
2975                                 struct file_extent_cluster *cluster)
2976{
2977        u64 alloc_hint = 0;
2978        u64 start;
2979        u64 end;
2980        u64 offset = BTRFS_I(inode)->index_cnt;
2981        u64 num_bytes;
2982        int nr = 0;
2983        int ret = 0;
2984
2985        BUG_ON(cluster->start != cluster->boundary[0]);
2986        mutex_lock(&inode->i_mutex);
2987
2988        ret = btrfs_check_data_free_space(inode, cluster->end +
2989                                          1 - cluster->start);
2990        if (ret)
2991                goto out;
2992
2993        while (nr < cluster->nr) {
2994                start = cluster->boundary[nr] - offset;
2995                if (nr + 1 < cluster->nr)
2996                        end = cluster->boundary[nr + 1] - 1 - offset;
2997                else
2998                        end = cluster->end - offset;
2999
3000                lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3001                num_bytes = end + 1 - start;
3002                ret = btrfs_prealloc_file_range(inode, 0, start,
3003                                                num_bytes, num_bytes,
3004                                                end + 1, &alloc_hint);
3005                unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3006                if (ret)
3007                        break;
3008                nr++;
3009        }
3010        btrfs_free_reserved_data_space(inode, cluster->end +
3011                                       1 - cluster->start);
3012out:
3013        mutex_unlock(&inode->i_mutex);
3014        return ret;
3015}
3016
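/*
 * insert a pinned extent map for the file range [start, end] pointing
 * at block_start, dropping any overlapping cached mappings first.
 */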
3017static noinline_for_stack
3018int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
3019                         u64 block_start)
3020{
3021        struct btrfs_root *root = BTRFS_I(inode)->root;
3022        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3023        struct extent_map *em;
3024        int ret = 0;
3025
3026        em = alloc_extent_map();
3027        if (!em)
3028                return -ENOMEM;
3029
3030        em->start = start;
3031        em->len = end + 1 - start;
3032        em->block_len = em->len;
3033        em->block_start = block_start;
3034        em->bdev = root->fs_info->fs_devices->latest_bdev;
3035        set_bit(EXTENT_FLAG_PINNED, &em->flags);
3036
3037        lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3038        while (1) {
3039                write_lock(&em_tree->lock);
3040                ret = add_extent_mapping(em_tree, em, 0);
3041                write_unlock(&em_tree->lock);
3042                if (ret != -EEXIST) {
3043                        free_extent_map(em);
3044                        break;
3045                }
3046                btrfs_drop_extent_cache(inode, start, end, 0);
3047        }
3048        unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3049        return ret;
3050}
3051
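/*
 * copy the data of a file extent cluster to its new location:
 * preallocate the new space, set up the extent mapping, then read in
 * the pages of the relocation inode, mark the cluster boundaries and
 * dirty the pages so that writeback copies the data.
 */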
3052static int relocate_file_extent_cluster(struct inode *inode,
3053                                        struct file_extent_cluster *cluster)
3054{
3055        u64 page_start;
3056        u64 page_end;
3057        u64 offset = BTRFS_I(inode)->index_cnt;
3058        unsigned long index;
3059        unsigned long last_index;
3060        struct page *page;
3061        struct file_ra_state *ra;
3062        gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
3063        int nr = 0;
3064        int ret = 0;
3065
3066        if (!cluster->nr)
3067                return 0;
3068
3069        ra = kzalloc(sizeof(*ra), GFP_NOFS);
3070        if (!ra)
3071                return -ENOMEM;
3072
3073        ret = prealloc_file_extent_cluster(inode, cluster);
3074        if (ret)
3075                goto out;
3076
3077        file_ra_state_init(ra, inode->i_mapping);
3078
3079        ret = setup_extent_mapping(inode, cluster->start - offset,
3080                                   cluster->end - offset, cluster->start);
3081        if (ret)
3082                goto out;
3083
3084        index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
3085        last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
3086        while (index <= last_index) {
3087                ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
3088                if (ret)
3089                        goto out;
3090
3091                page = find_lock_page(inode->i_mapping, index);
3092                if (!page) {
3093                        page_cache_sync_readahead(inode->i_mapping,
3094                                                  ra, NULL, index,
3095                                                  last_index + 1 - index);
3096                        page = find_or_create_page(inode->i_mapping, index,
3097                                                   mask);
3098                        if (!page) {
3099                                btrfs_delalloc_release_metadata(inode,
3100                                                        PAGE_CACHE_SIZE);
3101                                ret = -ENOMEM;
3102                                goto out;
3103                        }
3104                }
3105
3106                if (PageReadahead(page)) {
3107                        page_cache_async_readahead(inode->i_mapping,
3108                                                   ra, NULL, page, index,
3109                                                   last_index + 1 - index);
3110                }
3111
3112                if (!PageUptodate(page)) {
3113                        btrfs_readpage(NULL, page);
3114                        lock_page(page);
3115                        if (!PageUptodate(page)) {
3116                                unlock_page(page);
3117                                page_cache_release(page);
3118                                btrfs_delalloc_release_metadata(inode,
3119                                                        PAGE_CACHE_SIZE);
3120                                ret = -EIO;
3121                                goto out;
3122                        }
3123                }
3124
3125                page_start = page_offset(page);
3126                page_end = page_start + PAGE_CACHE_SIZE - 1;
3127
3128                lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
3129
3130                set_page_extent_mapped(page);
3131
3132                if (nr < cluster->nr &&
3133                    page_start + offset == cluster->boundary[nr]) {
3134                        set_extent_bits(&BTRFS_I(inode)->io_tree,
3135                                        page_start, page_end,
3136                                        EXTENT_BOUNDARY, GFP_NOFS);
3137                        nr++;
3138                }
3139
3140                btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
3141                set_page_dirty(page);
3142
3143                unlock_extent(&BTRFS_I(inode)->io_tree,
3144                              page_start, page_end);
3145                unlock_page(page);
3146                page_cache_release(page);
3147
3148                index++;
3149                balance_dirty_pages_ratelimited(inode->i_mapping);
3150                btrfs_throttle(BTRFS_I(inode)->root);
3151        }
3152        WARN_ON(nr != cluster->nr);
3153out:
3154        kfree(ra);
3155        return ret;
3156}
3157
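/*
 * add a data extent to the current cluster.  the cluster is relocated
 * when the new extent is not contiguous with it or when MAX_EXTENTS
 * boundaries have been collected.
 */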
3158static noinline_for_stack
3159int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
3160                         struct file_extent_cluster *cluster)
3161{
3162        int ret;
3163
3164        if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3165                ret = relocate_file_extent_cluster(inode, cluster);
3166                if (ret)
3167                        return ret;
3168                cluster->nr = 0;
3169        }
3170
3171        if (!cluster->nr)
3172                cluster->start = extent_key->objectid;
3173        else
3174                BUG_ON(cluster->nr >= MAX_EXTENTS);
3175        cluster->end = extent_key->objectid + extent_key->offset - 1;
3176        cluster->boundary[cluster->nr] = extent_key->objectid;
3177        cluster->nr++;
3178
3179        if (cluster->nr >= MAX_EXTENTS) {
3180                ret = relocate_file_extent_cluster(inode, cluster);
3181                if (ret)
3182                        return ret;
3183                cluster->nr = 0;
3184        }
3185        return 0;
3186}
3187
3188#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
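/*
 * walk forward from the current extent item to the first
 * EXTENT_REF_V0 backref and return its ref objectid.  *path_change is
 * set if we had to move to the next leaf.
 */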
3189static int get_ref_objectid_v0(struct reloc_control *rc,
3190                               struct btrfs_path *path,
3191                               struct btrfs_key *extent_key,
3192                               u64 *ref_objectid, int *path_change)
3193{
3194        struct btrfs_key key;
3195        struct extent_buffer *leaf;
3196        struct btrfs_extent_ref_v0 *ref0;
3197        int ret;
3198        int slot;
3199
3200        leaf = path->nodes[0];
3201        slot = path->slots[0];
3202        while (1) {
3203                if (slot >= btrfs_header_nritems(leaf)) {
3204                        ret = btrfs_next_leaf(rc->extent_root, path);
3205                        if (ret < 0)
3206                                return ret;
3207                        BUG_ON(ret > 0);
3208                        leaf = path->nodes[0];
3209                        slot = path->slots[0];
3210                        if (path_change)
3211                                *path_change = 1;
3212                }
3213                btrfs_item_key_to_cpu(leaf, &key, slot);
3214                if (key.objectid != extent_key->objectid)
3215                        return -ENOENT;
3216
3217                if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
3218                        slot++;
3219                        continue;
3220                }
3221                ref0 = btrfs_item_ptr(leaf, slot,
3222                                struct btrfs_extent_ref_v0);
3223                *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
3224                break;
3225        }
3226        return 0;
3227}
3228#endif
3229
3230/*
3231 * helper to add a tree block to the list.
3232 * the main work is determining the generation and level of the block
3233 */
3234static int add_tree_block(struct reloc_control *rc,
3235                          struct btrfs_key *extent_key,
3236                          struct btrfs_path *path,
3237                          struct rb_root *blocks)
3238{
3239        struct extent_buffer *eb;
3240        struct btrfs_extent_item *ei;
3241        struct btrfs_tree_block_info *bi;
3242        struct tree_block *block;
3243        struct rb_node *rb_node;
3244        u32 item_size;
3245        int level = -1;
3246        int generation;
3247
3248        eb =  path->nodes[0];
3249        item_size = btrfs_item_size_nr(eb, path->slots[0]);
3250
3251        if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3252            item_size >= sizeof(*ei) + sizeof(*bi)) {
3253                ei = btrfs_item_ptr(eb, path->slots[0],
3254                                struct btrfs_extent_item);
3255                if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3256                        bi = (struct btrfs_tree_block_info *)(ei + 1);
3257                        level = btrfs_tree_block_level(eb, bi);
3258                } else {
3259                        level = (int)extent_key->offset;
3260                }
3261                generation = btrfs_extent_generation(eb, ei);
3262        } else {
3263#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3264                u64 ref_owner;
3265                int ret;
3266
3267                BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3268                ret = get_ref_objectid_v0(rc, path, extent_key,
3269                                          &ref_owner, NULL);
3270                if (ret < 0)
3271                        return ret;
3272                BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
3273                level = (int)ref_owner;
3274                /* FIXME: get real generation */
3275                generation = 0;
3276#else
3277                BUG();
3278#endif
3279        }
3280
3281        btrfs_release_path(path);
3282
3283        BUG_ON(level == -1);
3284
3285        block = kmalloc(sizeof(*block), GFP_NOFS);
3286        if (!block)
3287                return -ENOMEM;
3288
3289        block->bytenr = extent_key->objectid;
3290        block->key.objectid = rc->extent_root->leafsize;
3291        block->key.offset = generation;
3292        block->level = level;
3293        block->key_ready = 0;
3294
3295        rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
3296        if (rb_node)
3297                backref_tree_panic(rb_node, -EEXIST, block->bytenr);
3298
3299        return 0;
3300}
3301
3302/*
3303 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3304 */
3305static int __add_tree_block(struct reloc_control *rc,
3306                            u64 bytenr, u32 blocksize,
3307                            struct rb_root *blocks)
3308{
3309        struct btrfs_path *path;
3310        struct btrfs_key key;
3311        int ret;
3312        bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
3313                                        SKINNY_METADATA);
3314
3315        if (tree_block_processed(bytenr, blocksize, rc))
3316                return 0;
3317
3318        if (tree_search(blocks, bytenr))
3319                return 0;
3320
3321        path = btrfs_alloc_path();
3322        if (!path)
3323                return -ENOMEM;
3324again:
3325        key.objectid = bytenr;
3326        if (skinny) {
3327                key.type = BTRFS_METADATA_ITEM_KEY;
3328                key.offset = (u64)-1;
3329        } else {
3330                key.type = BTRFS_EXTENT_ITEM_KEY;
3331                key.offset = blocksize;
3332        }
3333
3334        path->search_commit_root = 1;
3335        path->skip_locking = 1;
3336        ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3337        if (ret < 0)
3338                goto out;
3339
3340        if (ret > 0 && skinny) {
3341                if (path->slots[0]) {
3342                        path->slots[0]--;
3343                        btrfs_item_key_to_cpu(path->nodes[0], &key,
3344                                              path->slots[0]);
3345                        if (key.objectid == bytenr &&
3346                            (key.type == BTRFS_METADATA_ITEM_KEY ||
3347                             (key.type == BTRFS_EXTENT_ITEM_KEY &&
3348                              key.offset == blocksize)))
3349                                ret = 0;
3350                }
3351
3352                if (ret) {
3353                        skinny = false;
3354                        btrfs_release_path(path);
3355                        goto again;
3356                }
3357        }
3358        BUG_ON(ret);
3359
3360        ret = add_tree_block(rc, &key, path, blocks);
3361out:
3362        btrfs_free_path(path);
3363        return ret;
3364}
3365
3366/*
3367 * helper to check whether the block uses full backrefs for the pointers in it
3368 */
3369static int block_use_full_backref(struct reloc_control *rc,
3370                                  struct extent_buffer *eb)
3371{
3372        u64 flags;
3373        int ret;
3374
3375        if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
3376            btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
3377                return 1;
3378
3379        ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
3380                                       eb->start, btrfs_header_level(eb), 1,
3381                                       NULL, &flags);
3382        BUG_ON(ret);
3383
3384        if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
3385                ret = 1;
3386        else
3387                ret = 0;
3388        return ret;
3389}
3390
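/*
 * truncate the contents of a free space cache inode.  if no inode is
 * given, it is looked up by its inode number in the tree root first.
 */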
3391static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3392                                    struct inode *inode, u64 ino)
3393{
3394        struct btrfs_key key;
3395        struct btrfs_path *path;
3396        struct btrfs_root *root = fs_info->tree_root;
3397        struct btrfs_trans_handle *trans;
3398        int ret = 0;
3399
3400        if (inode)
3401                goto truncate;
3402
3403        key.objectid = ino;
3404        key.type = BTRFS_INODE_ITEM_KEY;
3405        key.offset = 0;
3406
3407        inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3408        if (IS_ERR(inode) || is_bad_inode(inode)) {
3409                if (!IS_ERR(inode))
3410                        iput(inode);
3411                return -ENOENT;
3412        }
3413
3414truncate:
3415        ret = btrfs_check_trunc_cache_free_space(root,
3416                                                 &fs_info->global_block_rsv);
3417        if (ret)
3418                goto out;
3419
3420        path = btrfs_alloc_path();
3421        if (!path) {
3422                ret = -ENOMEM;
3423                goto out;
3424        }
3425
3426        trans = btrfs_join_transaction(root);
3427        if (IS_ERR(trans)) {
3428                btrfs_free_path(path);
3429                ret = PTR_ERR(trans);
3430                goto out;
3431        }
3432
3433        ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
3434
3435        btrfs_free_path(path);
3436        btrfs_end_transaction(trans, root);
3437        btrfs_btree_balance_dirty(root);
3438out:
3439        iput(inode);
3440        return ret;
3441}
3442
3443/*
3444 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
3445 * this function scans the fs tree to find tree blocks that reference the data extent
3446 */
3447static int find_data_references(struct reloc_control *rc,
3448                                struct btrfs_key *extent_key,
3449                                struct extent_buffer *leaf,
3450                                struct btrfs_extent_data_ref *ref,
3451                                struct rb_root *blocks)
3452{
3453        struct btrfs_path *path;
3454        struct tree_block *block;
3455        struct btrfs_root *root;
3456        struct btrfs_file_extent_item *fi;
3457        struct rb_node *rb_node;
3458        struct btrfs_key key;
3459        u64 ref_root;
3460        u64 ref_objectid;
3461        u64 ref_offset;
3462        u32 ref_count;
3463        u32 nritems;
3464        int err = 0;
3465        int added = 0;
3466        int counted;
3467        int ret;
3468
3469        ref_root = btrfs_extent_data_ref_root(leaf, ref);
3470        ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
3471        ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
3472        ref_count = btrfs_extent_data_ref_count(leaf, ref);
3473
3474        /*
3475         * This is an extent belonging to the free space cache; let's just delete
3476         * it and redo the search.
3477         */
3478        if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3479                ret = delete_block_group_cache(rc->extent_root->fs_info,
3480                                               NULL, ref_objectid);
3481                if (ret != -ENOENT)
3482                        return ret;
3483                ret = 0;
3484        }
3485
3486        path = btrfs_alloc_path();
3487        if (!path)
3488                return -ENOMEM;
3489        path->reada = 1;
3490
3491        root = read_fs_root(rc->extent_root->fs_info, ref_root);
3492        if (IS_ERR(root)) {
3493                err = PTR_ERR(root);
3494                goto out;
3495        }
3496
3497        key.objectid = ref_objectid;
3498        key.type = BTRFS_EXTENT_DATA_KEY;
3499        if (ref_offset > ((u64)-1 << 32))
3500                key.offset = 0;
3501        else
3502                key.offset = ref_offset;
3503
3504        path->search_commit_root = 1;
3505        path->skip_locking = 1;
3506        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3507        if (ret < 0) {
3508                err = ret;
3509                goto out;
3510        }
3511
3512        leaf = path->nodes[0];
3513        nritems = btrfs_header_nritems(leaf);
3514        /*
3515         * the references in tree blocks that use full backrefs
3516         * are not counted here
3517         */
3518        if (block_use_full_backref(rc, leaf))
3519                counted = 0;
3520        else
3521                counted = 1;
3522        rb_node = tree_search(blocks, leaf->start);
3523        if (rb_node) {
3524                if (counted)
3525                        added = 1;
3526                else
3527                        path->slots[0] = nritems;
3528        }
3529
3530        while (ref_count > 0) {
3531                while (path->slots[0] >= nritems) {
3532                        ret = btrfs_next_leaf(root, path);
3533                        if (ret < 0) {
3534                                err = ret;
3535                                goto out;
3536                        }
3537                        if (ret > 0) {
3538                                WARN_ON(1);
3539                                goto out;
3540                        }
3541
3542                        leaf = path->nodes[0];
3543                        nritems = btrfs_header_nritems(leaf);
3544                        added = 0;
3545
3546                        if (block_use_full_backref(rc, leaf))
3547                                counted = 0;
3548                        else
3549                                counted = 1;
3550                        rb_node = tree_search(blocks, leaf->start);
3551                        if (rb_node) {
3552                                if (counted)
3553                                        added = 1;
3554                                else
3555                                        path->slots[0] = nritems;
3556                        }
3557                }
3558
3559                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3560                if (key.objectid != ref_objectid ||
3561                    key.type != BTRFS_EXTENT_DATA_KEY) {
3562                        WARN_ON(1);
3563                        break;
3564                }
3565
3566                fi = btrfs_item_ptr(leaf, path->slots[0],
3567                                    struct btrfs_file_extent_item);
3568
3569                if (btrfs_file_extent_type(leaf, fi) ==
3570                    BTRFS_FILE_EXTENT_INLINE)
3571                        goto next;
3572
3573                if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3574                    extent_key->objectid)
3575                        goto next;
3576
3577                key.offset -= btrfs_file_extent_offset(leaf, fi);
3578                if (key.offset != ref_offset)
3579                        goto next;
3580
3581                if (counted)
3582                        ref_count--;
3583                if (added)
3584                        goto next;
3585
3586                if (!tree_block_processed(leaf->start, leaf->len, rc)) {
3587                        block = kmalloc(sizeof(*block), GFP_NOFS);
3588                        if (!block) {
3589                                err = -ENOMEM;
3590                                break;
3591                        }
3592                        block->bytenr = leaf->start;
3593                        btrfs_item_key_to_cpu(leaf, &block->key, 0);
3594                        block->level = 0;
3595                        block->key_ready = 1;
3596                        rb_node = tree_insert(blocks, block->bytenr,
3597                                              &block->rb_node);
3598                        if (rb_node)
3599                                backref_tree_panic(rb_node, -EEXIST,
3600                                                   block->bytenr);
3601                }
3602                if (counted)
3603                        added = 1;
3604                else
3605                        path->slots[0] = nritems;
3606next:
3607                path->slots[0]++;
3608
3609        }
3610out:
3611        btrfs_free_path(path);
3612        return err;
3613}
3614
3615/*
3616 * helper to find all tree blocks that reference a given data extent
3617 */
3618static noinline_for_stack
3619int add_data_references(struct reloc_control *rc,
3620                        struct btrfs_key *extent_key,
3621                        struct btrfs_path *path,
3622                        struct rb_root *blocks)
3623{
3624        struct btrfs_key key;
3625        struct extent_buffer *eb;
3626        struct btrfs_extent_data_ref *dref;
3627        struct btrfs_extent_inline_ref *iref;
3628        unsigned long ptr;
3629        unsigned long end;
3630        u32 blocksize = btrfs_level_size(rc->extent_root, 0);
3631        int ret;
3632        int err = 0;
3633
3634        eb = path->nodes[0];
3635        ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3636        end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3637#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3638        if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3639                ptr = end;
3640        else
3641#endif
3642                ptr += sizeof(struct btrfs_extent_item);
3643
3644        while (ptr < end) {
3645                iref = (struct btrfs_extent_inline_ref *)ptr;
3646                key.type = btrfs_extent_inline_ref_type(eb, iref);
3647                if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3648                        key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3649                        ret = __add_tree_block(rc, key.offset, blocksize,
3650                                               blocks);
3651                } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3652                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3653                        ret = find_data_references(rc, extent_key,
3654                                                   eb, dref, blocks);
3655                } else {
3656                        BUG();
3657                }
3658                ptr += btrfs_extent_inline_ref_size(key.type);
3659        }
3660        WARN_ON(ptr > end);
3661
3662        while (1) {
3663                cond_resched();
3664                eb = path->nodes[0];
3665                if (path->slots[0] >= btrfs_header_nritems(eb)) {
3666                        ret = btrfs_next_leaf(rc->extent_root, path);
3667                        if (ret < 0) {
3668                                err = ret;
3669                                break;
3670                        }
3671                        if (ret > 0)
3672                                break;
3673                        eb = path->nodes[0];
3674                }
3675
3676                btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3677                if (key.objectid != extent_key->objectid)
3678                        break;
3679
3680#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3681                if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3682                    key.type == BTRFS_EXTENT_REF_V0_KEY) {
3683#else
3684                BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3685                if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3686#endif
3687                        ret = __add_tree_block(rc, key.offset, blocksize,
3688                                               blocks);
3689                } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3690                        dref = btrfs_item_ptr(eb, path->slots[0],
3691                                              struct btrfs_extent_data_ref);
3692                        ret = find_data_references(rc, extent_key,
3693                                                   eb, dref, blocks);
3694                } else {
3695                        ret = 0;
3696                }
3697                if (ret) {
3698                        err = ret;
3699                        break;
3700                }
3701                path->slots[0]++;
3702        }
3703        btrfs_release_path(path);
3704        if (err)
3705                free_block_list(blocks);
3706        return err;
3707}
3708
3709/*
3710 * helper to find the next unprocessed extent
3711 */
3712static noinline_for_stack
3713int find_next_extent(struct btrfs_trans_handle *trans,
3714                     struct reloc_control *rc, struct btrfs_path *path,
3715                     struct btrfs_key *extent_key)
3716{
3717        struct btrfs_key key;
3718        struct extent_buffer *leaf;
3719        u64 start, end, last;
3720        int ret;
3721
3722        last = rc->block_group->key.objectid + rc->block_group->key.offset;
3723        while (1) {
3724                cond_resched();
3725                if (rc->search_start >= last) {
3726                        ret = 1;
3727                        break;
3728                }
3729
3730                key.objectid = rc->search_start;
3731                key.type = BTRFS_EXTENT_ITEM_KEY;
3732                key.offset = 0;
3733
3734                path->search_commit_root = 1;
3735                path->skip_locking = 1;
3736                ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3737                                        0, 0);
3738                if (ret < 0)
3739                        break;
3740next:
3741                leaf = path->nodes[0];
3742                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3743                        ret = btrfs_next_leaf(rc->extent_root, path);
3744                        if (ret != 0)
3745                                break;
3746                        leaf = path->nodes[0];
3747                }
3748
3749                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3750                if (key.objectid >= last) {
3751                        ret = 1;
3752                        break;
3753                }
3754
3755                if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3756                    key.type != BTRFS_METADATA_ITEM_KEY) {
3757                        path->slots[0]++;
3758                        goto next;
3759                }
3760
3761                if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3762                    key.objectid + key.offset <= rc->search_start) {
3763                        path->slots[0]++;
3764                        goto next;
3765                }
3766
3767                if (key.type == BTRFS_METADATA_ITEM_KEY &&
3768                    key.objectid + rc->extent_root->leafsize <=
3769                    rc->search_start) {
3770                        path->slots[0]++;
3771                        goto next;
3772                }
3773
3774                ret = find_first_extent_bit(&rc->processed_blocks,
3775                                            key.objectid, &start, &end,
3776                                            EXTENT_DIRTY, NULL);
3777
3778                if (ret == 0 && start <= key.objectid) {
3779                        btrfs_release_path(path);
3780                        rc->search_start = end + 1;
3781                } else {
3782                        if (key.type == BTRFS_EXTENT_ITEM_KEY)
3783                                rc->search_start = key.objectid + key.offset;
3784                        else
3785                                rc->search_start = key.objectid +
3786                                        rc->extent_root->leafsize;
3787                        memcpy(extent_key, &key, sizeof(key));
3788                        return 0;
3789                }
3790        }
3791        btrfs_release_path(path);
3792        return ret;
3793}
3794
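/*
 * install rc as the active reloc control.  fs_info->reloc_ctl is
 * protected by reloc_mutex.
 */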
3795static void set_reloc_control(struct reloc_control *rc)
3796{
3797        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3798
3799        mutex_lock(&fs_info->reloc_mutex);
3800        fs_info->reloc_ctl = rc;
3801        mutex_unlock(&fs_info->reloc_mutex);
3802}
3803
3804static void unset_reloc_control(struct reloc_control *rc)
3805{
3806        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3807
3808        mutex_lock(&fs_info->reloc_mutex);
3809        fs_info->reloc_ctl = NULL;
3810        mutex_unlock(&fs_info->reloc_mutex);
3811}
3812
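/*
 * sanity check the extent flags: data and tree block flags are
 * mutually exclusive and full backref only makes sense for tree
 * blocks.  returns 1 if the combination is invalid.
 */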
3813static int check_extent_flags(u64 flags)
3814{
3815        if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3816            (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3817                return 1;
3818        if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3819            !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3820                return 1;
3821        if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3822            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3823                return 1;
3824        return 0;
3825}
3826
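/*
 * set up everything needed to relocate a block group: allocate and
 * fill the block reservation used for the reloc trees, reset the
 * per-pass state in the reloc control, install it and commit the
 * running transaction.
 */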
3827static noinline_for_stack
3828int prepare_to_relocate(struct reloc_control *rc)
3829{
3830        struct btrfs_trans_handle *trans;
3831        int ret;
3832
3833        rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
3834                                              BTRFS_BLOCK_RSV_TEMP);
3835        if (!rc->block_rsv)
3836                return -ENOMEM;
3837
3838        /*
3839         * reserve some space for creating reloc trees.
3840         * btrfs_init_reloc_root will use this reservation when
3841         * there is no reservation in the transaction handle.
3842         */
3843        ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3844                                  rc->extent_root->nodesize * 256,
3845                                  BTRFS_RESERVE_FLUSH_ALL);
3846        if (ret)
3847                return ret;
3848
3849        memset(&rc->cluster, 0, sizeof(rc->cluster));
3850        rc->search_start = rc->block_group->key.objectid;
3851        rc->extents_found = 0;
3852        rc->nodes_relocated = 0;
3853        rc->merging_rsv_size = 0;
3854
3855        rc->create_reloc_tree = 1;
3856        set_reloc_control(rc);
3857
3858        trans = btrfs_join_transaction(rc->extent_root);
3859        if (IS_ERR(trans)) {
3860                unset_reloc_control(rc);
3861                /*
3862                 * the extent tree is not a ref_cow tree and has no reloc_root to
3863                 * clean up.  Callers are responsible for freeing the block rsv
3864                 * allocated above.
3865                 */
3866                return PTR_ERR(trans);
3867        }
3868        btrfs_commit_transaction(trans, rc->extent_root);
3869        return 0;
3870}
3871
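/*
 * main loop for relocating a block group: walk all extents in the
 * group, relocate tree blocks (and, in the UPDATE_DATA_PTRS stage,
 * the tree blocks referencing data extents), cluster and copy data
 * extents, then merge the reloc trees back into the fs trees.
 */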
3872static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3873{
3874        struct rb_root blocks = RB_ROOT;
3875        struct btrfs_key key;
3876        struct btrfs_trans_handle *trans = NULL;
3877        struct btrfs_path *path;
3878        struct btrfs_extent_item *ei;
3879        u64 flags;
3880        u32 item_size;
3881        int ret;
3882        int err = 0;
3883        int progress = 0;
3884
3885        path = btrfs_alloc_path();
3886        if (!path)
3887                return -ENOMEM;
3888        path->reada = 1;
3889
3890        ret = prepare_to_relocate(rc);
3891        if (ret) {
3892                err = ret;
3893                goto out_free;
3894        }
3895
3896        while (1) {
3897                progress++;
3898                trans = btrfs_start_transaction(rc->extent_root, 0);
3899                if (IS_ERR(trans)) {
3900                        err = PTR_ERR(trans);
3901                        trans = NULL;
3902                        break;
3903                }
3904restart:
3905                if (update_backref_cache(trans, &rc->backref_cache)) {
3906                        btrfs_end_transaction(trans, rc->extent_root);
3907                        continue;
3908                }
3909
3910                ret = find_next_extent(trans, rc, path, &key);
3911                if (ret < 0)
3912                        err = ret;
3913                if (ret != 0)
3914                        break;
3915
3916                rc->extents_found++;
3917
3918                ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3919                                    struct btrfs_extent_item);
3920                item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
3921                if (item_size >= sizeof(*ei)) {
3922                        flags = btrfs_extent_flags(path->nodes[0], ei);
3923                        ret = check_extent_flags(flags);
3924                        BUG_ON(ret);
3925
3926                } else {
3927#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3928                        u64 ref_owner;
3929                        int path_change = 0;
3930
3931                        BUG_ON(item_size !=
3932                               sizeof(struct btrfs_extent_item_v0));
3933                        ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
3934                                                  &path_change);
3935                        if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
3936                                flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
3937                        else
3938                                flags = BTRFS_EXTENT_FLAG_DATA;
3939
3940                        if (path_change) {
3941                                btrfs_release_path(path);
3942
3943                                path->search_commit_root = 1;
3944                                path->skip_locking = 1;
3945                                ret = btrfs_search_slot(NULL, rc->extent_root,
3946                                                        &key, path, 0, 0);
3947                                if (ret < 0) {
3948                                        err = ret;
3949                                        break;
3950                                }
3951                                BUG_ON(ret > 0);
3952                        }
3953#else
3954                        BUG();
3955#endif
3956                }
3957
3958                if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3959                        ret = add_tree_block(rc, &key, path, &blocks);
3960                } else if (rc->stage == UPDATE_DATA_PTRS &&
3961                           (flags & BTRFS_EXTENT_FLAG_DATA)) {
3962                        ret = add_data_references(rc, &key, path, &blocks);
3963                } else {
3964                        btrfs_release_path(path);
3965                        ret = 0;
3966                }
3967                if (ret < 0) {
3968                        err = ret;
3969                        break;
3970                }
3971
3972                if (!RB_EMPTY_ROOT(&blocks)) {
3973                        ret = relocate_tree_blocks(trans, rc, &blocks);
3974                        if (ret < 0) {
3975                                if (ret != -EAGAIN) {
3976                                        err = ret;
3977                                        break;
3978                                }
3979                                rc->extents_found--;
3980                                rc->search_start = key.objectid;
3981                        }
3982                }
3983
3984                ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
3985                if (ret < 0) {
3986                        if (ret != -ENOSPC) {
3987                                err = ret;
3988                                WARN_ON(1);
3989                                break;
3990                        }
3991                        rc->commit_transaction = 1;
3992                }
3993
3994                if (rc->commit_transaction) {
3995                        rc->commit_transaction = 0;
3996                        ret = btrfs_commit_transaction(trans, rc->extent_root);
3997                        BUG_ON(ret);
3998                } else {
3999                        btrfs_end_transaction_throttle(trans, rc->extent_root);
4000                        btrfs_btree_balance_dirty(rc->extent_root);
4001                }
4002                trans = NULL;
4003
4004                if (rc->stage == MOVE_DATA_EXTENTS &&
4005                    (flags & BTRFS_EXTENT_FLAG_DATA)) {
4006                        rc->found_file_extent = 1;
4007                        ret = relocate_data_extent(rc->data_inode,
4008                                                   &key, &rc->cluster);
4009                        if (ret < 0) {
4010                                err = ret;
4011                                break;
4012                        }
4013                }
4014        }
4015        if (trans && progress && err == -ENOSPC) {
4016                ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
4017                                              rc->block_group->flags);
4018                if (ret == 0) {
4019                        err = 0;
4020                        progress = 0;
4021                        goto restart;
4022                }
4023        }
4024
4025        btrfs_release_path(path);
4026        clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
4027                          GFP_NOFS);
4028
4029        if (trans) {
4030                btrfs_end_transaction_throttle(trans, rc->extent_root);
4031                btrfs_btree_balance_dirty(rc->extent_root);
4032        }
4033
4034        if (!err) {
4035                ret = relocate_file_extent_cluster(rc->data_inode,
4036                                                   &rc->cluster);
4037                if (ret < 0)
4038                        err = ret;
4039        }
4040
4041        rc->create_reloc_tree = 0;
4042        set_reloc_control(rc);
4043
4044        backref_cache_cleanup(&rc->backref_cache);
4045        btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
4046
4047        err = prepare_to_merge(rc, err);
4048
4049        merge_reloc_roots(rc);
4050
4051        rc->merge_reloc_tree = 0;
4052        unset_reloc_control(rc);
4053        btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
4054
4055        /* get rid of pinned extents */
4056        trans = btrfs_join_transaction(rc->extent_root);
4057        if (IS_ERR(trans))
4058                err = PTR_ERR(trans);
4059        else
4060                btrfs_commit_transaction(trans, rc->extent_root);
4061out_free:
4062        btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
4063        btrfs_free_path(path);
4064        return err;
4065}
4066
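/*
 * insert a bare inode item for the data relocation inode: a regular
 * file with the prealloc and nocompress flags set and a link count
 * of zero.
 */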
4067static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
4068                                 struct btrfs_root *root, u64 objectid)
4069{
4070        struct btrfs_path *path;
4071        struct btrfs_inode_item *item;
4072        struct extent_buffer *leaf;
4073        int ret;
4074
4075        path = btrfs_alloc_path();
4076        if (!path)
4077                return -ENOMEM;
4078
4079        ret = btrfs_insert_empty_inode(trans, root, path, objectid);
4080        if (ret)
4081                goto out;
4082
4083        leaf = path->nodes[0];
4084        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
4085        memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
4086        btrfs_set_inode_generation(leaf, item, 1);
4087        btrfs_set_inode_size(leaf, item, 0);
4088        btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
4089        btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
4090                                          BTRFS_INODE_PREALLOC);
4091        btrfs_mark_buffer_dirty(leaf);
4092        btrfs_release_path(path);
4093out:
4094        btrfs_free_path(path);
4095        return ret;
4096}
4097
4098/*
4099 * helper to create an inode for data relocation.
4100 * the inode lives in the data relocation tree and its link count is 0
4101 */
4102static noinline_for_stack
4103struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
4104                                 struct btrfs_block_group_cache *group)
4105{
4106        struct inode *inode = NULL;
4107        struct btrfs_trans_handle *trans;
4108        struct btrfs_root *root;
4109        struct btrfs_key key;
4110        u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
4111        int err = 0;
4112
4113        root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4114        if (IS_ERR(root))
4115                return ERR_CAST(root);
4116
4117        trans = btrfs_start_transaction(root, 6);
4118        if (IS_ERR(trans))
4119                return ERR_CAST(trans);
4120
4121        err = btrfs_find_free_objectid(root, &objectid);
4122        if (err)
4123                goto out;
4124
4125        err = __insert_orphan_inode(trans, root, objectid);
4126        BUG_ON(err);
4127
4128        key.objectid = objectid;
4129        key.type = BTRFS_INODE_ITEM_KEY;
4130        key.offset = 0;
4131        inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
4132        BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
4133        BTRFS_I(inode)->index_cnt = group->key.objectid;
4134
4135        err = btrfs_orphan_add(trans, inode);
4136out:
4137        btrfs_end_transaction(trans, root);
4138        btrfs_btree_balance_dirty(root);
4139        if (err) {
4140                if (inode)
4141                        iput(inode);
4142                inode = ERR_PTR(err);
4143        }
4144        return inode;
4145}
4146
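/*
 * allocate and initialize a reloc control structure
 */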
4147static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
4148{
4149        struct reloc_control *rc;
4150
4151        rc = kzalloc(sizeof(*rc), GFP_NOFS);
4152        if (!rc)
4153                return NULL;
4154
4155        INIT_LIST_HEAD(&rc->reloc_roots);
4156        backref_cache_init(&rc->backref_cache);
4157        mapping_tree_init(&rc->reloc_root_tree);
4158        extent_io_tree_init(&rc->processed_blocks,
4159                            fs_info->btree_inode->i_mapping);
4160        return rc;
4161}
4162
4163/*
4164 * function to relocate all extents in a block group.
4165 */
4166int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4167{
4168        struct btrfs_fs_info *fs_info = extent_root->fs_info;
4169        struct reloc_control *rc;
4170        struct inode *inode;
4171        struct btrfs_path *path;
4172        int ret;
4173        int rw = 0;
4174        int err = 0;
4175
4176        rc = alloc_reloc_control(fs_info);
4177        if (!rc)
4178                return -ENOMEM;
4179
4180        rc->extent_root = extent_root;
4181
4182        rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
4183        BUG_ON(!rc->block_group);
4184
4185        if (!rc->block_group->ro) {
4186                ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
4187                if (ret) {
4188                        err = ret;
4189                        goto out;
4190                }
4191                rw = 1;
4192        }
4193
4194        path = btrfs_alloc_path();
4195        if (!path) {
4196                err = -ENOMEM;
4197                goto out;
4198        }
4199
4200        inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
4201                                        path);
4202        btrfs_free_path(path);
4203
4204        if (!IS_ERR(inode))
4205                ret = delete_block_group_cache(fs_info, inode, 0);
4206        else
4207                ret = PTR_ERR(inode);
4208
4209        if (ret && ret != -ENOENT) {
4210                err = ret;
4211                goto out;
4212        }
4213
4214        rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4215        if (IS_ERR(rc->data_inode)) {
4216                err = PTR_ERR(rc->data_inode);
4217                rc->data_inode = NULL;
4218                goto out;
4219        }
4220
4221        printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
4222               (unsigned long long)rc->block_group->key.objectid,
4223               (unsigned long long)rc->block_group->flags);
4224
4225        ret = btrfs_start_all_delalloc_inodes(fs_info, 0);
4226        if (ret < 0) {
4227                err = ret;
4228                goto out;
4229        }
4230        btrfs_wait_all_ordered_extents(fs_info, 0);
4231
4232        while (1) {
4233                mutex_lock(&fs_info->cleaner_mutex);
4234                ret = relocate_block_group(rc);
4235                mutex_unlock(&fs_info->cleaner_mutex);
4236                if (ret < 0) {
4237                        err = ret;
4238                        goto out;
4239                }
4240
4241                if (rc->extents_found == 0)
4242                        break;
4243
4244                printk(KERN_INFO "btrfs: found %llu extents\n",
4245                        (unsigned long long)rc->extents_found);
4246
4247                if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
4248                        btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
4249                        invalidate_mapping_pages(rc->data_inode->i_mapping,
4250                                                 0, -1);
4251                        rc->stage = UPDATE_DATA_PTRS;
4252                }
4253        }
4254
4255        filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
4256                                     rc->block_group->key.objectid,
4257                                     rc->block_group->key.objectid +
4258                                     rc->block_group->key.offset - 1);
4259
4260        WARN_ON(rc->block_group->pinned > 0);
4261        WARN_ON(rc->block_group->reserved > 0);
4262        WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
4263out:
4264        if (err && rw)
4265                btrfs_set_block_group_rw(extent_root, rc->block_group);
4266        iput(rc->data_inode);
4267        btrfs_put_block_group(rc->block_group);
4268        kfree(rc);
4269        return err;
4270}
4271
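/*
 * a reloc root whose fs root no longer exists is useless: clear its
 * drop progress and set its refs to zero so it gets cleaned up.
 */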
4272static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4273{
4274        struct btrfs_trans_handle *trans;
4275        int ret, err;
4276
4277        trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
4278        if (IS_ERR(trans))
4279                return PTR_ERR(trans);
4280
4281        memset(&root->root_item.drop_progress, 0,
4282                sizeof(root->root_item.drop_progress));
4283        root->root_item.drop_level = 0;
4284        btrfs_set_root_refs(&root->root_item, 0);
4285        ret = btrfs_update_root(trans, root->fs_info->tree_root,
4286                                &root->root_key, &root->root_item);
4287
4288        err = btrfs_end_transaction(trans, root->fs_info->tree_root);
4289        if (err)
4290                return err;
4291        return ret;
4292}
4293
4294/*
4295 * recover relocation interrupted by a system crash.
4296 *
4297 * this function resumes merging reloc trees with their corresponding fs trees,
4298 * which is important for preserving the sharing of tree blocks
4299 */
4300int btrfs_recover_relocation(struct btrfs_root *root)
4301{
4302        LIST_HEAD(reloc_roots);
4303        struct btrfs_key key;
4304        struct btrfs_root *fs_root;
4305        struct btrfs_root *reloc_root;
4306        struct btrfs_path *path;
4307        struct extent_buffer *leaf;
4308        struct reloc_control *rc = NULL;
4309        struct btrfs_trans_handle *trans;
4310        int ret;
4311        int err = 0;
4312
4313        path = btrfs_alloc_path();
4314        if (!path)
4315                return -ENOMEM;
4316        path->reada = -1;
4317
4318        key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4319        key.type = BTRFS_ROOT_ITEM_KEY;
4320        key.offset = (u64)-1;
4321
4322        while (1) {
4323                ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
4324                                        path, 0, 0);
4325                if (ret < 0) {
4326                        err = ret;
4327                        goto out;
4328                }
4329                if (ret > 0) {
4330                        if (path->slots[0] == 0)
4331                                break;
4332                        path->slots[0]--;
4333                }
4334                leaf = path->nodes[0];
4335                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4336                btrfs_release_path(path);
4337
4338                if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4339                    key.type != BTRFS_ROOT_ITEM_KEY)
4340                        break;
4341
4342                reloc_root = btrfs_read_fs_root(root, &key);
4343                if (IS_ERR(reloc_root)) {
4344                        err = PTR_ERR(reloc_root);
4345                        goto out;
4346                }
4347
4348                list_add(&reloc_root->root_list, &reloc_roots);
4349
4350                if (btrfs_root_refs(&reloc_root->root_item) > 0) {
4351                        fs_root = read_fs_root(root->fs_info,
4352                                               reloc_root->root_key.offset);
4353                        if (IS_ERR(fs_root)) {
4354                                ret = PTR_ERR(fs_root);
4355                                if (ret != -ENOENT) {
4356                                        err = ret;
4357                                        goto out;
4358                                }
4359                                ret = mark_garbage_root(reloc_root);
4360                                if (ret < 0) {
4361                                        err = ret;
4362                                        goto out;
4363                                }
4364                        }
4365                }
4366
4367                if (key.offset == 0)
4368                        break;
4369
4370                key.offset--;
4371        }
4372        btrfs_release_path(path);
4373
4374        if (list_empty(&reloc_roots))
4375                goto out;
4376
4377        rc = alloc_reloc_control(root->fs_info);
4378        if (!rc) {
4379                err = -ENOMEM;
4380                goto out;
4381        }
4382
4383        rc->extent_root = root->fs_info->extent_root;
4384
4385        set_reloc_control(rc);
4386
4387        trans = btrfs_join_transaction(rc->extent_root);
4388        if (IS_ERR(trans)) {
4389                unset_reloc_control(rc);
4390                err = PTR_ERR(trans);
4391                goto out_free;
4392        }
4393
4394        rc->merge_reloc_tree = 1;
4395
4396        while (!list_empty(&reloc_roots)) {
4397                reloc_root = list_entry(reloc_roots.next,
4398                                        struct btrfs_root, root_list);
4399                list_del(&reloc_root->root_list);
4400
4401                if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4402                        list_add_tail(&reloc_root->root_list,
4403                                      &rc->reloc_roots);
4404                        continue;
4405                }
4406
4407                fs_root = read_fs_root(root->fs_info,
4408                                       reloc_root->root_key.offset);
4409                if (IS_ERR(fs_root)) {
4410                        err = PTR_ERR(fs_root);
4411                        goto out_free;
4412                }
4413
4414                err = __add_reloc_root(reloc_root);
4415                BUG_ON(err < 0); /* -ENOMEM or logic error */
4416                fs_root->reloc_root = reloc_root;
4417        }
4418
4419        err = btrfs_commit_transaction(trans, rc->extent_root);
4420        if (err)
4421                goto out_free;
4422
4423        merge_reloc_roots(rc);
4424
4425        unset_reloc_control(rc);
4426
4427        trans = btrfs_join_transaction(rc->extent_root);
4428        if (IS_ERR(trans))
4429                err = PTR_ERR(trans);
4430        else
4431                err = btrfs_commit_transaction(trans, rc->extent_root);
4432out_free:
4433        kfree(rc);
4434out:
4435        if (!list_empty(&reloc_roots))
4436                free_reloc_roots(&reloc_roots);
4437
4438        btrfs_free_path(path);
4439
4440        if (err == 0) {
4441                /* cleanup orphan inode in data relocation tree */
4442                fs_root = read_fs_root(root->fs_info,
4443                                       BTRFS_DATA_RELOC_TREE_OBJECTID);
4444                if (IS_ERR(fs_root))
4445                        err = PTR_ERR(fs_root);
4446                else
4447                        err = btrfs_orphan_cleanup(fs_root);
4448        }
4449        return err;
4450}
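/*
 * Illustrative sketch only: the recovery above is only meaningful on a
 * read-write mount, and in this kernel it is normally driven from
 * open_ctree() in disk-io.c.  The helper below is hypothetical (its name and
 * the reduced error handling are assumptions, not existing btrfs code) and
 * merely shows that calling convention.
 */
static inline int btrfs_recover_relocation_on_mount(struct super_block *sb,
                                                    struct btrfs_root *tree_root)
{
        int ret = 0;

        /* relocation recovery rewrites metadata, so skip read-only mounts */
        if (!(sb->s_flags & MS_RDONLY))
                ret = btrfs_recover_relocation(tree_root);

        if (ret < 0)
                printk(KERN_WARNING "btrfs: failed to recover relocation\n");
        return ret;
}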
4451
4452/*
4453 * helper to add ordered checksums for data relocation.
4454 *
4455 * cloning the existing checksums properly handles nodatasum extents and
4456 * saves the CPU time of re-calculating them (worked example below).
4457 */
4458int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4459{
4460        struct btrfs_ordered_sum *sums;
4461        struct btrfs_ordered_extent *ordered;
4462        struct btrfs_root *root = BTRFS_I(inode)->root;
4463        int ret;
4464        u64 disk_bytenr;
4465        LIST_HEAD(list);
4466
4467        ordered = btrfs_lookup_ordered_extent(inode, file_pos);
4468        BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
4469
4470        disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
4471        ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
4472                                       disk_bytenr + len - 1, &list, 0);
4473        if (ret)
4474                goto out;
4475
4476        disk_bytenr = ordered->start;
4477        while (!list_empty(&list)) {
4478                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4479                list_del_init(&sums->list);
4480
4481                sums->bytenr = disk_bytenr;
4482                disk_bytenr += sums->len;
4483
4484                btrfs_add_ordered_sum(inode, ordered, sums);
4485        }
4486out:
4487        btrfs_put_ordered_extent(ordered);
4488        return ret;
4489}
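/*
 * Worked example for the bytenr remapping above, with made-up numbers: the
 * data relocation inode stores the start of the block group being relocated
 * in BTRFS_I(inode)->index_cnt, and extent data is staged in that inode at
 * file offset (old disk bytenr - block group start).  So if the group starts
 * at 1G and a 128K extent lived at disk bytenr 1G + 4M, it sits at
 * file_pos == 4M, and file_pos + index_cnt == 1G + 4M recovers the old disk
 * bytenr used to look up the existing csums.  Each cloned btrfs_ordered_sum
 * is then rebased onto ordered->start, the disk bytenr of the newly
 * allocated extent, advancing by sums->len per item.
 */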
4490
4491void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4492                           struct btrfs_root *root, struct extent_buffer *buf,
4493                           struct extent_buffer *cow)
4494{
4495        struct reloc_control *rc;
4496        struct backref_node *node;
4497        int first_cow = 0;
4498        int level;
4499        int ret;
4500
4501        rc = root->fs_info->reloc_ctl;
4502        if (!rc)
4503                return;
4504
4505        BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4506               root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4507
4508        level = btrfs_header_level(buf);
4509        if (btrfs_header_generation(buf) <=
4510            btrfs_root_last_snapshot(&root->root_item))
4511                first_cow = 1;
4512
4513        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4514            rc->create_reloc_tree) {
4515                WARN_ON(!first_cow && level == 0);
4516
4517                node = rc->backref_cache.path[level];
4518                BUG_ON(node->bytenr != buf->start &&
4519                       node->new_bytenr != buf->start);
4520
4521                drop_node_buffer(node);
4522                extent_buffer_get(cow);
4523                node->eb = cow;
4524                node->new_bytenr = cow->start;
4525
4526                if (!node->pending) {
4527                        list_move_tail(&node->list,
4528                                       &rc->backref_cache.pending[level]);
4529                        node->pending = 1;
4530                }
4531
4532                if (first_cow)
4533                        __mark_block_processed(rc, node);
4534
4535                if (first_cow && level > 0)
4536                        rc->nodes_relocated += buf->len;
4537        }
4538
4539        if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
4540                ret = replace_file_extents(trans, rc, root, cow);
4541                BUG_ON(ret);
4542        }
4543}
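/*
 * Note on the COW hook above: whenever a reloc tree block is cowed while the
 * reloc trees are being built, the matching backref cache node is repointed
 * at the new copy and queued on the pending list so upper level pointers can
 * be fixed up later.  The first_cow test follows the usual COW-reference
 * rule: a buffer whose generation is not newer than the root's last_snapshot
 * may still be shared, so this is the block's first COW since the last
 * snapshot point; only then is the block marked processed and counted in
 * rc->nodes_relocated, which feeds the reservation accounting in
 * btrfs_reloc_pre_snapshot() below.
 */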
4544
4545/*
4546 * called before creating a snapshot. it calculates the metadata
4547 * reservation required for relocating tree blocks in the snapshot
4548 */
4549void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
4550                              struct btrfs_pending_snapshot *pending,
4551                              u64 *bytes_to_reserve)
4552{
4553        struct btrfs_root *root;
4554        struct reloc_control *rc;
4555
4556        root = pending->root;
4557        if (!root->reloc_root)
4558                return;
4559
4560        rc = root->fs_info->reloc_ctl;
4561        if (!rc->merge_reloc_tree)
4562                return;
4563
4564        root = root->reloc_root;
4565        BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4566        /*
4567         * relocation is in the stage of merging trees. in the worst
4568         * case the space used by merging a reloc tree is twice the
4569         * size of the relocated tree nodes: half for cowing the
4570         * reloc tree, half for cowing the fs tree. the space used
4571         * by cowing the reloc tree is freed once that tree is
4572         * dropped, but if we create a snapshot, cowing the fs tree
4573         * may use more space than it frees, so we need to reserve
4574         * extra space (see the worked example after this function).
4575         */
4576        *bytes_to_reserve += rc->nodes_relocated;
4577}
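/*
 * Worked example for the reservation above, with made-up numbers: suppose 8M
 * worth of tree nodes have been relocated so far, i.e. rc->nodes_relocated
 * is 8M.  Merging may then COW up to 8M in the reloc tree plus 8M in the fs
 * tree; the reloc tree half comes back when that tree is dropped, but a
 * snapshot taken during the merge keeps the old fs tree blocks referenced,
 * so the pending snapshot reserves an extra 8M here and hands it over to
 * rc->block_rsv in btrfs_reloc_post_snapshot() below.
 */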
4578
4579/*
4580 * called after the snapshot is created. migrate the block reservation
4581 * and create a reloc root for the newly created snapshot
4582 */
4583int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4584                               struct btrfs_pending_snapshot *pending)
4585{
4586        struct btrfs_root *root = pending->root;
4587        struct btrfs_root *reloc_root;
4588        struct btrfs_root *new_root;
4589        struct reloc_control *rc;
4590        int ret;
4591
4592        if (!root->reloc_root)
4593                return 0;
4594
4595        rc = root->fs_info->reloc_ctl;
4596        rc->merging_rsv_size += rc->nodes_relocated;
4597
4598        if (rc->merge_reloc_tree) {
4599                ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4600                                              rc->block_rsv,
4601                                              rc->nodes_relocated);
4602                if (ret)
4603                        return ret;
4604        }
4605
4606        new_root = pending->snap;
4607        reloc_root = create_reloc_root(trans, root->reloc_root,
4608                                       new_root->root_key.objectid);
4609        if (IS_ERR(reloc_root))
4610                return PTR_ERR(reloc_root);
4611
4612        ret = __add_reloc_root(reloc_root);
4613        BUG_ON(ret < 0);
4614        new_root->reloc_root = reloc_root;
4615
4616        if (rc->create_reloc_tree)
4617                ret = clone_backref_node(trans, rc, root, reloc_root);
4618        return ret;
4619}
4620