linux/fs/btrfs/relocation.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2009 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 021110-1307, USA.
  17 */
  18
  19#include <linux/sched.h>
  20#include <linux/pagemap.h>
  21#include <linux/writeback.h>
  22#include <linux/blkdev.h>
  23#include <linux/rbtree.h>
  24#include <linux/slab.h>
  25#include "ctree.h"
  26#include "disk-io.h"
  27#include "transaction.h"
  28#include "volumes.h"
  29#include "locking.h"
  30#include "btrfs_inode.h"
  31#include "async-thread.h"
  32#include "free-space-cache.h"
  33#include "inode-map.h"
  34#include "qgroup.h"
  35
  36/*
  37 * backref_node, mapping_node and tree_block start with this
  38 */
  39struct tree_entry {
  40        struct rb_node rb_node;
  41        u64 bytenr;
  42};
  43
  44/*
  45 * present a tree block in the backref cache
  46 */
  47struct backref_node {
  48        struct rb_node rb_node;
  49        u64 bytenr;
  50
  51        u64 new_bytenr;
  52        /* objectid of tree block owner, can be not uptodate */
  53        u64 owner;
  54        /* link to pending, changed or detached list */
  55        struct list_head list;
  56        /* list of upper level blocks reference this block */
  57        struct list_head upper;
  58        /* list of child blocks in the cache */
  59        struct list_head lower;
  60        /* NULL if this node is not tree root */
  61        struct btrfs_root *root;
  62        /* extent buffer got by COW the block */
  63        struct extent_buffer *eb;
  64        /* level of tree block */
  65        unsigned int level:8;
  66        /* is the block in non-reference counted tree */
  67        unsigned int cowonly:1;
  68        /* 1 if no child node in the cache */
  69        unsigned int lowest:1;
  70        /* is the extent buffer locked */
  71        unsigned int locked:1;
  72        /* has the block been processed */
  73        unsigned int processed:1;
  74        /* have backrefs of this block been checked */
  75        unsigned int checked:1;
  76        /*
  77         * 1 if corresponding block has been cowed but some upper
  78         * level block pointers may not point to the new location
  79         */
  80        unsigned int pending:1;
  81        /*
  82         * 1 if the backref node isn't connected to any other
  83         * backref node.
  84         */
  85        unsigned int detached:1;
  86};
  87
  88/*
  89 * present a block pointer in the backref cache
  90 */
  91struct backref_edge {
  92        struct list_head list[2];
  93        struct backref_node *node[2];
  94};
  95
  96#define LOWER   0
  97#define UPPER   1
  98#define RELOCATION_RESERVED_NODES       256
  99
 100struct backref_cache {
 101        /* red black tree of all backref nodes in the cache */
 102        struct rb_root rb_root;
 103        /* for passing backref nodes to btrfs_reloc_cow_block */
 104        struct backref_node *path[BTRFS_MAX_LEVEL];
 105        /*
 106         * list of blocks that have been cowed but some block
 107         * pointers in upper level blocks may not reflect the
 108         * new location
 109         */
 110        struct list_head pending[BTRFS_MAX_LEVEL];
 111        /* list of backref nodes with no child node */
 112        struct list_head leaves;
 113        /* list of blocks that have been cowed in current transaction */
 114        struct list_head changed;
 115        /* list of detached backref node. */
 116        struct list_head detached;
 117
 118        u64 last_trans;
 119
 120        int nr_nodes;
 121        int nr_edges;
 122};
 123
 124/*
 125 * map address of tree root to tree
 126 */
 127struct mapping_node {
 128        struct rb_node rb_node;
 129        u64 bytenr;
 130        void *data;
 131};
 132
 133struct mapping_tree {
 134        struct rb_root rb_root;
 135        spinlock_t lock;
 136};
 137
 138/*
 139 * present a tree block to process
 140 */
 141struct tree_block {
 142        struct rb_node rb_node;
 143        u64 bytenr;
 144        struct btrfs_key key;
 145        unsigned int level:8;
 146        unsigned int key_ready:1;
 147};
 148
 149#define MAX_EXTENTS 128
 150
 151struct file_extent_cluster {
 152        u64 start;
 153        u64 end;
 154        u64 boundary[MAX_EXTENTS];
 155        unsigned int nr;
 156};
 157
 158struct reloc_control {
 159        /* block group to relocate */
 160        struct btrfs_block_group_cache *block_group;
 161        /* extent tree */
 162        struct btrfs_root *extent_root;
 163        /* inode for moving data */
 164        struct inode *data_inode;
 165
 166        struct btrfs_block_rsv *block_rsv;
 167
 168        struct backref_cache backref_cache;
 169
 170        struct file_extent_cluster cluster;
 171        /* tree blocks have been processed */
 172        struct extent_io_tree processed_blocks;
 173        /* map start of tree root to corresponding reloc tree */
 174        struct mapping_tree reloc_root_tree;
 175        /* list of reloc trees */
 176        struct list_head reloc_roots;
 177        /* size of metadata reservation for merging reloc trees */
 178        u64 merging_rsv_size;
 179        /* size of relocated tree nodes */
 180        u64 nodes_relocated;
 181        /* reserved size for block group relocation*/
 182        u64 reserved_bytes;
 183
 184        u64 search_start;
 185        u64 extents_found;
 186
 187        unsigned int stage:8;
 188        unsigned int create_reloc_tree:1;
 189        unsigned int merge_reloc_tree:1;
 190        unsigned int found_file_extent:1;
 191};
 192
 193/* stages of data relocation */
 194#define MOVE_DATA_EXTENTS       0
 195#define UPDATE_DATA_PTRS        1
 196
 197static void remove_backref_node(struct backref_cache *cache,
 198                                struct backref_node *node);
 199static void __mark_block_processed(struct reloc_control *rc,
 200                                   struct backref_node *node);
 201
 202static void mapping_tree_init(struct mapping_tree *tree)
 203{
 204        tree->rb_root = RB_ROOT;
 205        spin_lock_init(&tree->lock);
 206}
 207
 208static void backref_cache_init(struct backref_cache *cache)
 209{
 210        int i;
 211        cache->rb_root = RB_ROOT;
 212        for (i = 0; i < BTRFS_MAX_LEVEL; i++)
 213                INIT_LIST_HEAD(&cache->pending[i]);
 214        INIT_LIST_HEAD(&cache->changed);
 215        INIT_LIST_HEAD(&cache->detached);
 216        INIT_LIST_HEAD(&cache->leaves);
 217}
 218
 219static void backref_cache_cleanup(struct backref_cache *cache)
 220{
 221        struct backref_node *node;
 222        int i;
 223
 224        while (!list_empty(&cache->detached)) {
 225                node = list_entry(cache->detached.next,
 226                                  struct backref_node, list);
 227                remove_backref_node(cache, node);
 228        }
 229
 230        while (!list_empty(&cache->leaves)) {
 231                node = list_entry(cache->leaves.next,
 232                                  struct backref_node, lower);
 233                remove_backref_node(cache, node);
 234        }
 235
 236        cache->last_trans = 0;
 237
 238        for (i = 0; i < BTRFS_MAX_LEVEL; i++)
 239                ASSERT(list_empty(&cache->pending[i]));
 240        ASSERT(list_empty(&cache->changed));
 241        ASSERT(list_empty(&cache->detached));
 242        ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
 243        ASSERT(!cache->nr_nodes);
 244        ASSERT(!cache->nr_edges);
 245}
 246
 247static struct backref_node *alloc_backref_node(struct backref_cache *cache)
 248{
 249        struct backref_node *node;
 250
 251        node = kzalloc(sizeof(*node), GFP_NOFS);
 252        if (node) {
 253                INIT_LIST_HEAD(&node->list);
 254                INIT_LIST_HEAD(&node->upper);
 255                INIT_LIST_HEAD(&node->lower);
 256                RB_CLEAR_NODE(&node->rb_node);
 257                cache->nr_nodes++;
 258        }
 259        return node;
 260}
 261
 262static void free_backref_node(struct backref_cache *cache,
 263                              struct backref_node *node)
 264{
 265        if (node) {
 266                cache->nr_nodes--;
 267                kfree(node);
 268        }
 269}
 270
 271static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
 272{
 273        struct backref_edge *edge;
 274
 275        edge = kzalloc(sizeof(*edge), GFP_NOFS);
 276        if (edge)
 277                cache->nr_edges++;
 278        return edge;
 279}
 280
 281static void free_backref_edge(struct backref_cache *cache,
 282                              struct backref_edge *edge)
 283{
 284        if (edge) {
 285                cache->nr_edges--;
 286                kfree(edge);
 287        }
 288}
 289
 290static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
 291                                   struct rb_node *node)
 292{
 293        struct rb_node **p = &root->rb_node;
 294        struct rb_node *parent = NULL;
 295        struct tree_entry *entry;
 296
 297        while (*p) {
 298                parent = *p;
 299                entry = rb_entry(parent, struct tree_entry, rb_node);
 300
 301                if (bytenr < entry->bytenr)
 302                        p = &(*p)->rb_left;
 303                else if (bytenr > entry->bytenr)
 304                        p = &(*p)->rb_right;
 305                else
 306                        return parent;
 307        }
 308
 309        rb_link_node(node, parent, p);
 310        rb_insert_color(node, root);
 311        return NULL;
 312}
 313
 314static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
 315{
 316        struct rb_node *n = root->rb_node;
 317        struct tree_entry *entry;
 318
 319        while (n) {
 320                entry = rb_entry(n, struct tree_entry, rb_node);
 321
 322                if (bytenr < entry->bytenr)
 323                        n = n->rb_left;
 324                else if (bytenr > entry->bytenr)
 325                        n = n->rb_right;
 326                else
 327                        return n;
 328        }
 329        return NULL;
 330}
 331
 332static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
 333{
 334
 335        struct btrfs_fs_info *fs_info = NULL;
 336        struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
 337                                              rb_node);
 338        if (bnode->root)
 339                fs_info = bnode->root->fs_info;
 340        btrfs_panic(fs_info, errno,
 341                    "Inconsistency in backref cache found at offset %llu",
 342                    bytenr);
 343}
 344
 345/*
 346 * walk up backref nodes until reach node presents tree root
 347 */
 348static struct backref_node *walk_up_backref(struct backref_node *node,
 349                                            struct backref_edge *edges[],
 350                                            int *index)
 351{
 352        struct backref_edge *edge;
 353        int idx = *index;
 354
 355        while (!list_empty(&node->upper)) {
 356                edge = list_entry(node->upper.next,
 357                                  struct backref_edge, list[LOWER]);
 358                edges[idx++] = edge;
 359                node = edge->node[UPPER];
 360        }
 361        BUG_ON(node->detached);
 362        *index = idx;
 363        return node;
 364}
 365
 366/*
 367 * walk down backref nodes to find start of next reference path
 368 */
 369static struct backref_node *walk_down_backref(struct backref_edge *edges[],
 370                                              int *index)
 371{
 372        struct backref_edge *edge;
 373        struct backref_node *lower;
 374        int idx = *index;
 375
 376        while (idx > 0) {
 377                edge = edges[idx - 1];
 378                lower = edge->node[LOWER];
 379                if (list_is_last(&edge->list[LOWER], &lower->upper)) {
 380                        idx--;
 381                        continue;
 382                }
 383                edge = list_entry(edge->list[LOWER].next,
 384                                  struct backref_edge, list[LOWER]);
 385                edges[idx - 1] = edge;
 386                *index = idx;
 387                return edge->node[UPPER];
 388        }
 389        *index = 0;
 390        return NULL;
 391}
 392
 393static void unlock_node_buffer(struct backref_node *node)
 394{
 395        if (node->locked) {
 396                btrfs_tree_unlock(node->eb);
 397                node->locked = 0;
 398        }
 399}
 400
 401static void drop_node_buffer(struct backref_node *node)
 402{
 403        if (node->eb) {
 404                unlock_node_buffer(node);
 405                free_extent_buffer(node->eb);
 406                node->eb = NULL;
 407        }
 408}
 409
 410static void drop_backref_node(struct backref_cache *tree,
 411                              struct backref_node *node)
 412{
 413        BUG_ON(!list_empty(&node->upper));
 414
 415        drop_node_buffer(node);
 416        list_del(&node->list);
 417        list_del(&node->lower);
 418        if (!RB_EMPTY_NODE(&node->rb_node))
 419                rb_erase(&node->rb_node, &tree->rb_root);
 420        free_backref_node(tree, node);
 421}
 422
 423/*
 424 * remove a backref node from the backref cache
 425 */
 426static void remove_backref_node(struct backref_cache *cache,
 427                                struct backref_node *node)
 428{
 429        struct backref_node *upper;
 430        struct backref_edge *edge;
 431
 432        if (!node)
 433                return;
 434
 435        BUG_ON(!node->lowest && !node->detached);
 436        while (!list_empty(&node->upper)) {
 437                edge = list_entry(node->upper.next, struct backref_edge,
 438                                  list[LOWER]);
 439                upper = edge->node[UPPER];
 440                list_del(&edge->list[LOWER]);
 441                list_del(&edge->list[UPPER]);
 442                free_backref_edge(cache, edge);
 443
 444                if (RB_EMPTY_NODE(&upper->rb_node)) {
 445                        BUG_ON(!list_empty(&node->upper));
 446                        drop_backref_node(cache, node);
 447                        node = upper;
 448                        node->lowest = 1;
 449                        continue;
 450                }
 451                /*
 452                 * add the node to leaf node list if no other
 453                 * child block cached.
 454                 */
 455                if (list_empty(&upper->lower)) {
 456                        list_add_tail(&upper->lower, &cache->leaves);
 457                        upper->lowest = 1;
 458                }
 459        }
 460
 461        drop_backref_node(cache, node);
 462}
 463
 464static void update_backref_node(struct backref_cache *cache,
 465                                struct backref_node *node, u64 bytenr)
 466{
 467        struct rb_node *rb_node;
 468        rb_erase(&node->rb_node, &cache->rb_root);
 469        node->bytenr = bytenr;
 470        rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
 471        if (rb_node)
 472                backref_tree_panic(rb_node, -EEXIST, bytenr);
 473}
 474
 475/*
 476 * update backref cache after a transaction commit
 477 */
 478static int update_backref_cache(struct btrfs_trans_handle *trans,
 479                                struct backref_cache *cache)
 480{
 481        struct backref_node *node;
 482        int level = 0;
 483
 484        if (cache->last_trans == 0) {
 485                cache->last_trans = trans->transid;
 486                return 0;
 487        }
 488
 489        if (cache->last_trans == trans->transid)
 490                return 0;
 491
 492        /*
 493         * detached nodes are used to avoid unnecessary backref
 494         * lookup. transaction commit changes the extent tree.
 495         * so the detached nodes are no longer useful.
 496         */
 497        while (!list_empty(&cache->detached)) {
 498                node = list_entry(cache->detached.next,
 499                                  struct backref_node, list);
 500                remove_backref_node(cache, node);
 501        }
 502
 503        while (!list_empty(&cache->changed)) {
 504                node = list_entry(cache->changed.next,
 505                                  struct backref_node, list);
 506                list_del_init(&node->list);
 507                BUG_ON(node->pending);
 508                update_backref_node(cache, node, node->new_bytenr);
 509        }
 510
 511        /*
 512         * some nodes can be left in the pending list if there were
 513         * errors during processing the pending nodes.
 514         */
 515        for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
 516                list_for_each_entry(node, &cache->pending[level], list) {
 517                        BUG_ON(!node->pending);
 518                        if (node->bytenr == node->new_bytenr)
 519                                continue;
 520                        update_backref_node(cache, node, node->new_bytenr);
 521                }
 522        }
 523
 524        cache->last_trans = 0;
 525        return 1;
 526}
 527
 528
 529static int should_ignore_root(struct btrfs_root *root)
 530{
 531        struct btrfs_root *reloc_root;
 532
 533        if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
 534                return 0;
 535
 536        reloc_root = root->reloc_root;
 537        if (!reloc_root)
 538                return 0;
 539
 540        if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
 541            root->fs_info->running_transaction->transid - 1)
 542                return 0;
 543        /*
 544         * if there is reloc tree and it was created in previous
 545         * transaction backref lookup can find the reloc tree,
 546         * so backref node for the fs tree root is useless for
 547         * relocation.
 548         */
 549        return 1;
 550}
 551/*
 552 * find reloc tree by address of tree root
 553 */
 554static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
 555                                          u64 bytenr)
 556{
 557        struct rb_node *rb_node;
 558        struct mapping_node *node;
 559        struct btrfs_root *root = NULL;
 560
 561        spin_lock(&rc->reloc_root_tree.lock);
 562        rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
 563        if (rb_node) {
 564                node = rb_entry(rb_node, struct mapping_node, rb_node);
 565                root = (struct btrfs_root *)node->data;
 566        }
 567        spin_unlock(&rc->reloc_root_tree.lock);
 568        return root;
 569}
 570
 571static int is_cowonly_root(u64 root_objectid)
 572{
 573        if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
 574            root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
 575            root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
 576            root_objectid == BTRFS_DEV_TREE_OBJECTID ||
 577            root_objectid == BTRFS_TREE_LOG_OBJECTID ||
 578            root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
 579            root_objectid == BTRFS_UUID_TREE_OBJECTID ||
 580            root_objectid == BTRFS_QUOTA_TREE_OBJECTID ||
 581            root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
 582                return 1;
 583        return 0;
 584}
 585
 586static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
 587                                        u64 root_objectid)
 588{
 589        struct btrfs_key key;
 590
 591        key.objectid = root_objectid;
 592        key.type = BTRFS_ROOT_ITEM_KEY;
 593        if (is_cowonly_root(root_objectid))
 594                key.offset = 0;
 595        else
 596                key.offset = (u64)-1;
 597
 598        return btrfs_get_fs_root(fs_info, &key, false);
 599}
 600
 601#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 602static noinline_for_stack
 603struct btrfs_root *find_tree_root(struct reloc_control *rc,
 604                                  struct extent_buffer *leaf,
 605                                  struct btrfs_extent_ref_v0 *ref0)
 606{
 607        struct btrfs_root *root;
 608        u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
 609        u64 generation = btrfs_ref_generation_v0(leaf, ref0);
 610
 611        BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
 612
 613        root = read_fs_root(rc->extent_root->fs_info, root_objectid);
 614        BUG_ON(IS_ERR(root));
 615
 616        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
 617            generation != btrfs_root_generation(&root->root_item))
 618                return NULL;
 619
 620        return root;
 621}
 622#endif
 623
 624static noinline_for_stack
 625int find_inline_backref(struct extent_buffer *leaf, int slot,
 626                        unsigned long *ptr, unsigned long *end)
 627{
 628        struct btrfs_key key;
 629        struct btrfs_extent_item *ei;
 630        struct btrfs_tree_block_info *bi;
 631        u32 item_size;
 632
 633        btrfs_item_key_to_cpu(leaf, &key, slot);
 634
 635        item_size = btrfs_item_size_nr(leaf, slot);
 636#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 637        if (item_size < sizeof(*ei)) {
 638                WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
 639                return 1;
 640        }
 641#endif
 642        ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
 643        WARN_ON(!(btrfs_extent_flags(leaf, ei) &
 644                  BTRFS_EXTENT_FLAG_TREE_BLOCK));
 645
 646        if (key.type == BTRFS_EXTENT_ITEM_KEY &&
 647            item_size <= sizeof(*ei) + sizeof(*bi)) {
 648                WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
 649                return 1;
 650        }
 651        if (key.type == BTRFS_METADATA_ITEM_KEY &&
 652            item_size <= sizeof(*ei)) {
 653                WARN_ON(item_size < sizeof(*ei));
 654                return 1;
 655        }
 656
 657        if (key.type == BTRFS_EXTENT_ITEM_KEY) {
 658                bi = (struct btrfs_tree_block_info *)(ei + 1);
 659                *ptr = (unsigned long)(bi + 1);
 660        } else {
 661                *ptr = (unsigned long)(ei + 1);
 662        }
 663        *end = (unsigned long)ei + item_size;
 664        return 0;
 665}
 666
 667/*
 668 * build backref tree for a given tree block. root of the backref tree
 669 * corresponds the tree block, leaves of the backref tree correspond
 670 * roots of b-trees that reference the tree block.
 671 *
 672 * the basic idea of this function is check backrefs of a given block
 673 * to find upper level blocks that reference the block, and then check
 674 * backrefs of these upper level blocks recursively. the recursion stop
 675 * when tree root is reached or backrefs for the block is cached.
 676 *
 677 * NOTE: if we find backrefs for a block are cached, we know backrefs
 678 * for all upper level blocks that directly/indirectly reference the
 679 * block are also cached.
 680 */
 681static noinline_for_stack
 682struct backref_node *build_backref_tree(struct reloc_control *rc,
 683                                        struct btrfs_key *node_key,
 684                                        int level, u64 bytenr)
 685{
 686        struct backref_cache *cache = &rc->backref_cache;
 687        struct btrfs_path *path1;
 688        struct btrfs_path *path2;
 689        struct extent_buffer *eb;
 690        struct btrfs_root *root;
 691        struct backref_node *cur;
 692        struct backref_node *upper;
 693        struct backref_node *lower;
 694        struct backref_node *node = NULL;
 695        struct backref_node *exist = NULL;
 696        struct backref_edge *edge;
 697        struct rb_node *rb_node;
 698        struct btrfs_key key;
 699        unsigned long end;
 700        unsigned long ptr;
 701        LIST_HEAD(list);
 702        LIST_HEAD(useless);
 703        int cowonly;
 704        int ret;
 705        int err = 0;
 706        bool need_check = true;
 707
 708        path1 = btrfs_alloc_path();
 709        path2 = btrfs_alloc_path();
 710        if (!path1 || !path2) {
 711                err = -ENOMEM;
 712                goto out;
 713        }
 714        path1->reada = READA_FORWARD;
 715        path2->reada = READA_FORWARD;
 716
 717        node = alloc_backref_node(cache);
 718        if (!node) {
 719                err = -ENOMEM;
 720                goto out;
 721        }
 722
 723        node->bytenr = bytenr;
 724        node->level = level;
 725        node->lowest = 1;
 726        cur = node;
 727again:
 728        end = 0;
 729        ptr = 0;
 730        key.objectid = cur->bytenr;
 731        key.type = BTRFS_METADATA_ITEM_KEY;
 732        key.offset = (u64)-1;
 733
 734        path1->search_commit_root = 1;
 735        path1->skip_locking = 1;
 736        ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
 737                                0, 0);
 738        if (ret < 0) {
 739                err = ret;
 740                goto out;
 741        }
 742        ASSERT(ret);
 743        ASSERT(path1->slots[0]);
 744
 745        path1->slots[0]--;
 746
 747        WARN_ON(cur->checked);
 748        if (!list_empty(&cur->upper)) {
 749                /*
 750                 * the backref was added previously when processing
 751                 * backref of type BTRFS_TREE_BLOCK_REF_KEY
 752                 */
 753                ASSERT(list_is_singular(&cur->upper));
 754                edge = list_entry(cur->upper.next, struct backref_edge,
 755                                  list[LOWER]);
 756                ASSERT(list_empty(&edge->list[UPPER]));
 757                exist = edge->node[UPPER];
 758                /*
 759                 * add the upper level block to pending list if we need
 760                 * check its backrefs
 761                 */
 762                if (!exist->checked)
 763                        list_add_tail(&edge->list[UPPER], &list);
 764        } else {
 765                exist = NULL;
 766        }
 767
 768        while (1) {
 769                cond_resched();
 770                eb = path1->nodes[0];
 771
 772                if (ptr >= end) {
 773                        if (path1->slots[0] >= btrfs_header_nritems(eb)) {
 774                                ret = btrfs_next_leaf(rc->extent_root, path1);
 775                                if (ret < 0) {
 776                                        err = ret;
 777                                        goto out;
 778                                }
 779                                if (ret > 0)
 780                                        break;
 781                                eb = path1->nodes[0];
 782                        }
 783
 784                        btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
 785                        if (key.objectid != cur->bytenr) {
 786                                WARN_ON(exist);
 787                                break;
 788                        }
 789
 790                        if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 791                            key.type == BTRFS_METADATA_ITEM_KEY) {
 792                                ret = find_inline_backref(eb, path1->slots[0],
 793                                                          &ptr, &end);
 794                                if (ret)
 795                                        goto next;
 796                        }
 797                }
 798
 799                if (ptr < end) {
 800                        /* update key for inline back ref */
 801                        struct btrfs_extent_inline_ref *iref;
 802                        iref = (struct btrfs_extent_inline_ref *)ptr;
 803                        key.type = btrfs_extent_inline_ref_type(eb, iref);
 804                        key.offset = btrfs_extent_inline_ref_offset(eb, iref);
 805                        WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
 806                                key.type != BTRFS_SHARED_BLOCK_REF_KEY);
 807                }
 808
 809                if (exist &&
 810                    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
 811                      exist->owner == key.offset) ||
 812                     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
 813                      exist->bytenr == key.offset))) {
 814                        exist = NULL;
 815                        goto next;
 816                }
 817
 818#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 819                if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
 820                    key.type == BTRFS_EXTENT_REF_V0_KEY) {
 821                        if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
 822                                struct btrfs_extent_ref_v0 *ref0;
 823                                ref0 = btrfs_item_ptr(eb, path1->slots[0],
 824                                                struct btrfs_extent_ref_v0);
 825                                if (key.objectid == key.offset) {
 826                                        root = find_tree_root(rc, eb, ref0);
 827                                        if (root && !should_ignore_root(root))
 828                                                cur->root = root;
 829                                        else
 830                                                list_add(&cur->list, &useless);
 831                                        break;
 832                                }
 833                                if (is_cowonly_root(btrfs_ref_root_v0(eb,
 834                                                                      ref0)))
 835                                        cur->cowonly = 1;
 836                        }
 837#else
 838                ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY);
 839                if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
 840#endif
 841                        if (key.objectid == key.offset) {
 842                                /*
 843                                 * only root blocks of reloc trees use
 844                                 * backref of this type.
 845                                 */
 846                                root = find_reloc_root(rc, cur->bytenr);
 847                                ASSERT(root);
 848                                cur->root = root;
 849                                break;
 850                        }
 851
 852                        edge = alloc_backref_edge(cache);
 853                        if (!edge) {
 854                                err = -ENOMEM;
 855                                goto out;
 856                        }
 857                        rb_node = tree_search(&cache->rb_root, key.offset);
 858                        if (!rb_node) {
 859                                upper = alloc_backref_node(cache);
 860                                if (!upper) {
 861                                        free_backref_edge(cache, edge);
 862                                        err = -ENOMEM;
 863                                        goto out;
 864                                }
 865                                upper->bytenr = key.offset;
 866                                upper->level = cur->level + 1;
 867                                /*
 868                                 *  backrefs for the upper level block isn't
 869                                 *  cached, add the block to pending list
 870                                 */
 871                                list_add_tail(&edge->list[UPPER], &list);
 872                        } else {
 873                                upper = rb_entry(rb_node, struct backref_node,
 874                                                 rb_node);
 875                                ASSERT(upper->checked);
 876                                INIT_LIST_HEAD(&edge->list[UPPER]);
 877                        }
 878                        list_add_tail(&edge->list[LOWER], &cur->upper);
 879                        edge->node[LOWER] = cur;
 880                        edge->node[UPPER] = upper;
 881
 882                        goto next;
 883                } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
 884                        goto next;
 885                }
 886
 887                /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
 888                root = read_fs_root(rc->extent_root->fs_info, key.offset);
 889                if (IS_ERR(root)) {
 890                        err = PTR_ERR(root);
 891                        goto out;
 892                }
 893
 894                if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
 895                        cur->cowonly = 1;
 896
 897                if (btrfs_root_level(&root->root_item) == cur->level) {
 898                        /* tree root */
 899                        ASSERT(btrfs_root_bytenr(&root->root_item) ==
 900                               cur->bytenr);
 901                        if (should_ignore_root(root))
 902                                list_add(&cur->list, &useless);
 903                        else
 904                                cur->root = root;
 905                        break;
 906                }
 907
 908                level = cur->level + 1;
 909
 910                /*
 911                 * searching the tree to find upper level blocks
 912                 * reference the block.
 913                 */
 914                path2->search_commit_root = 1;
 915                path2->skip_locking = 1;
 916                path2->lowest_level = level;
 917                ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
 918                path2->lowest_level = 0;
 919                if (ret < 0) {
 920                        err = ret;
 921                        goto out;
 922                }
 923                if (ret > 0 && path2->slots[level] > 0)
 924                        path2->slots[level]--;
 925
 926                eb = path2->nodes[level];
 927                if (btrfs_node_blockptr(eb, path2->slots[level]) !=
 928                    cur->bytenr) {
 929                        btrfs_err(root->fs_info,
 930        "couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
 931                                  cur->bytenr, level - 1, root->objectid,
 932                                  node_key->objectid, node_key->type,
 933                                  node_key->offset);
 934                        err = -ENOENT;
 935                        goto out;
 936                }
 937                lower = cur;
 938                need_check = true;
 939                for (; level < BTRFS_MAX_LEVEL; level++) {
 940                        if (!path2->nodes[level]) {
 941                                ASSERT(btrfs_root_bytenr(&root->root_item) ==
 942                                       lower->bytenr);
 943                                if (should_ignore_root(root))
 944                                        list_add(&lower->list, &useless);
 945                                else
 946                                        lower->root = root;
 947                                break;
 948                        }
 949
 950                        edge = alloc_backref_edge(cache);
 951                        if (!edge) {
 952                                err = -ENOMEM;
 953                                goto out;
 954                        }
 955
 956                        eb = path2->nodes[level];
 957                        rb_node = tree_search(&cache->rb_root, eb->start);
 958                        if (!rb_node) {
 959                                upper = alloc_backref_node(cache);
 960                                if (!upper) {
 961                                        free_backref_edge(cache, edge);
 962                                        err = -ENOMEM;
 963                                        goto out;
 964                                }
 965                                upper->bytenr = eb->start;
 966                                upper->owner = btrfs_header_owner(eb);
 967                                upper->level = lower->level + 1;
 968                                if (!test_bit(BTRFS_ROOT_REF_COWS,
 969                                              &root->state))
 970                                        upper->cowonly = 1;
 971
 972                                /*
 973                                 * if we know the block isn't shared
 974                                 * we can void checking its backrefs.
 975                                 */
 976                                if (btrfs_block_can_be_shared(root, eb))
 977                                        upper->checked = 0;
 978                                else
 979                                        upper->checked = 1;
 980
 981                                /*
 982                                 * add the block to pending list if we
 983                                 * need check its backrefs, we only do this once
 984                                 * while walking up a tree as we will catch
 985                                 * anything else later on.
 986                                 */
 987                                if (!upper->checked && need_check) {
 988                                        need_check = false;
 989                                        list_add_tail(&edge->list[UPPER],
 990                                                      &list);
 991                                } else {
 992                                        if (upper->checked)
 993                                                need_check = true;
 994                                        INIT_LIST_HEAD(&edge->list[UPPER]);
 995                                }
 996                        } else {
 997                                upper = rb_entry(rb_node, struct backref_node,
 998                                                 rb_node);
 999                                ASSERT(upper->checked);
1000                                INIT_LIST_HEAD(&edge->list[UPPER]);
1001                                if (!upper->owner)
1002                                        upper->owner = btrfs_header_owner(eb);
1003                        }
1004                        list_add_tail(&edge->list[LOWER], &lower->upper);
1005                        edge->node[LOWER] = lower;
1006                        edge->node[UPPER] = upper;
1007
1008                        if (rb_node)
1009                                break;
1010                        lower = upper;
1011                        upper = NULL;
1012                }
1013                btrfs_release_path(path2);
1014next:
1015                if (ptr < end) {
1016                        ptr += btrfs_extent_inline_ref_size(key.type);
1017                        if (ptr >= end) {
1018                                WARN_ON(ptr > end);
1019                                ptr = 0;
1020                                end = 0;
1021                        }
1022                }
1023                if (ptr >= end)
1024                        path1->slots[0]++;
1025        }
1026        btrfs_release_path(path1);
1027
1028        cur->checked = 1;
1029        WARN_ON(exist);
1030
1031        /* the pending list isn't empty, take the first block to process */
1032        if (!list_empty(&list)) {
1033                edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1034                list_del_init(&edge->list[UPPER]);
1035                cur = edge->node[UPPER];
1036                goto again;
1037        }
1038
1039        /*
1040         * everything goes well, connect backref nodes and insert backref nodes
1041         * into the cache.
1042         */
1043        ASSERT(node->checked);
1044        cowonly = node->cowonly;
1045        if (!cowonly) {
1046                rb_node = tree_insert(&cache->rb_root, node->bytenr,
1047                                      &node->rb_node);
1048                if (rb_node)
1049                        backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1050                list_add_tail(&node->lower, &cache->leaves);
1051        }
1052
1053        list_for_each_entry(edge, &node->upper, list[LOWER])
1054                list_add_tail(&edge->list[UPPER], &list);
1055
1056        while (!list_empty(&list)) {
1057                edge = list_entry(list.next, struct backref_edge, list[UPPER]);
1058                list_del_init(&edge->list[UPPER]);
1059                upper = edge->node[UPPER];
1060                if (upper->detached) {
1061                        list_del(&edge->list[LOWER]);
1062                        lower = edge->node[LOWER];
1063                        free_backref_edge(cache, edge);
1064                        if (list_empty(&lower->upper))
1065                                list_add(&lower->list, &useless);
1066                        continue;
1067                }
1068
1069                if (!RB_EMPTY_NODE(&upper->rb_node)) {
1070                        if (upper->lowest) {
1071                                list_del_init(&upper->lower);
1072                                upper->lowest = 0;
1073                        }
1074
1075                        list_add_tail(&edge->list[UPPER], &upper->lower);
1076                        continue;
1077                }
1078
1079                if (!upper->checked) {
1080                        /*
1081                         * Still want to blow up for developers since this is a
1082                         * logic bug.
1083                         */
1084                        ASSERT(0);
1085                        err = -EINVAL;
1086                        goto out;
1087                }
1088                if (cowonly != upper->cowonly) {
1089                        ASSERT(0);
1090                        err = -EINVAL;
1091                        goto out;
1092                }
1093
1094                if (!cowonly) {
1095                        rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1096                                              &upper->rb_node);
1097                        if (rb_node)
1098                                backref_tree_panic(rb_node, -EEXIST,
1099                                                   upper->bytenr);
1100                }
1101
1102                list_add_tail(&edge->list[UPPER], &upper->lower);
1103
1104                list_for_each_entry(edge, &upper->upper, list[LOWER])
1105                        list_add_tail(&edge->list[UPPER], &list);
1106        }
1107        /*
1108         * process useless backref nodes. backref nodes for tree leaves
1109         * are deleted from the cache. backref nodes for upper level
1110         * tree blocks are left in the cache to avoid unnecessary backref
1111         * lookup.
1112         */
1113        while (!list_empty(&useless)) {
1114                upper = list_entry(useless.next, struct backref_node, list);
1115                list_del_init(&upper->list);
1116                ASSERT(list_empty(&upper->upper));
1117                if (upper == node)
1118                        node = NULL;
1119                if (upper->lowest) {
1120                        list_del_init(&upper->lower);
1121                        upper->lowest = 0;
1122                }
1123                while (!list_empty(&upper->lower)) {
1124                        edge = list_entry(upper->lower.next,
1125                                          struct backref_edge, list[UPPER]);
1126                        list_del(&edge->list[UPPER]);
1127                        list_del(&edge->list[LOWER]);
1128                        lower = edge->node[LOWER];
1129                        free_backref_edge(cache, edge);
1130
1131                        if (list_empty(&lower->upper))
1132                                list_add(&lower->list, &useless);
1133                }
1134                __mark_block_processed(rc, upper);
1135                if (upper->level > 0) {
1136                        list_add(&upper->list, &cache->detached);
1137                        upper->detached = 1;
1138                } else {
1139                        rb_erase(&upper->rb_node, &cache->rb_root);
1140                        free_backref_node(cache, upper);
1141                }
1142        }
1143out:
1144        btrfs_free_path(path1);
1145        btrfs_free_path(path2);
1146        if (err) {
1147                while (!list_empty(&useless)) {
1148                        lower = list_entry(useless.next,
1149                                           struct backref_node, list);
1150                        list_del_init(&lower->list);
1151                }
1152                while (!list_empty(&list)) {
1153                        edge = list_first_entry(&list, struct backref_edge,
1154                                                list[UPPER]);
1155                        list_del(&edge->list[UPPER]);
1156                        list_del(&edge->list[LOWER]);
1157                        lower = edge->node[LOWER];
1158                        upper = edge->node[UPPER];
1159                        free_backref_edge(cache, edge);
1160
1161                        /*
1162                         * Lower is no longer linked to any upper backref nodes
1163                         * and isn't in the cache, we can free it ourselves.
1164                         */
1165                        if (list_empty(&lower->upper) &&
1166                            RB_EMPTY_NODE(&lower->rb_node))
1167                                list_add(&lower->list, &useless);
1168
1169                        if (!RB_EMPTY_NODE(&upper->rb_node))
1170                                continue;
1171
1172                        /* Add this guy's upper edges to the list to process */
1173                        list_for_each_entry(edge, &upper->upper, list[LOWER])
1174                                list_add_tail(&edge->list[UPPER], &list);
1175                        if (list_empty(&upper->upper))
1176                                list_add(&upper->list, &useless);
1177                }
1178
1179                while (!list_empty(&useless)) {
1180                        lower = list_entry(useless.next,
1181                                           struct backref_node, list);
1182                        list_del_init(&lower->list);
1183                        if (lower == node)
1184                                node = NULL;
1185                        free_backref_node(cache, lower);
1186                }
1187
1188                free_backref_node(cache, node);
1189                return ERR_PTR(err);
1190        }
1191        ASSERT(!node || !node->detached);
1192        return node;
1193}
1194
1195/*
1196 * helper to add backref node for the newly created snapshot.
1197 * the backref node is created by cloning backref node that
1198 * corresponds to root of source tree
1199 */
1200static int clone_backref_node(struct btrfs_trans_handle *trans,
1201                              struct reloc_control *rc,
1202                              struct btrfs_root *src,
1203                              struct btrfs_root *dest)
1204{
1205        struct btrfs_root *reloc_root = src->reloc_root;
1206        struct backref_cache *cache = &rc->backref_cache;
1207        struct backref_node *node = NULL;
1208        struct backref_node *new_node;
1209        struct backref_edge *edge;
1210        struct backref_edge *new_edge;
1211        struct rb_node *rb_node;
1212
1213        if (cache->last_trans > 0)
1214                update_backref_cache(trans, cache);
1215
1216        rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1217        if (rb_node) {
1218                node = rb_entry(rb_node, struct backref_node, rb_node);
1219                if (node->detached)
1220                        node = NULL;
1221                else
1222                        BUG_ON(node->new_bytenr != reloc_root->node->start);
1223        }
1224
1225        if (!node) {
1226                rb_node = tree_search(&cache->rb_root,
1227                                      reloc_root->commit_root->start);
1228                if (rb_node) {
1229                        node = rb_entry(rb_node, struct backref_node,
1230                                        rb_node);
1231                        BUG_ON(node->detached);
1232                }
1233        }
1234
1235        if (!node)
1236                return 0;
1237
1238        new_node = alloc_backref_node(cache);
1239        if (!new_node)
1240                return -ENOMEM;
1241
1242        new_node->bytenr = dest->node->start;
1243        new_node->level = node->level;
1244        new_node->lowest = node->lowest;
1245        new_node->checked = 1;
1246        new_node->root = dest;
1247
1248        if (!node->lowest) {
1249                list_for_each_entry(edge, &node->lower, list[UPPER]) {
1250                        new_edge = alloc_backref_edge(cache);
1251                        if (!new_edge)
1252                                goto fail;
1253
1254                        new_edge->node[UPPER] = new_node;
1255                        new_edge->node[LOWER] = edge->node[LOWER];
1256                        list_add_tail(&new_edge->list[UPPER],
1257                                      &new_node->lower);
1258                }
1259        } else {
1260                list_add_tail(&new_node->lower, &cache->leaves);
1261        }
1262
1263        rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1264                              &new_node->rb_node);
1265        if (rb_node)
1266                backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
1267
1268        if (!new_node->lowest) {
1269                list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1270                        list_add_tail(&new_edge->list[LOWER],
1271                                      &new_edge->node[LOWER]->upper);
1272                }
1273        }
1274        return 0;
1275fail:
1276        while (!list_empty(&new_node->lower)) {
1277                new_edge = list_entry(new_node->lower.next,
1278                                      struct backref_edge, list[UPPER]);
1279                list_del(&new_edge->list[UPPER]);
1280                free_backref_edge(cache, new_edge);
1281        }
1282        free_backref_node(cache, new_node);
1283        return -ENOMEM;
1284}
1285
1286/*
1287 * helper to add 'address of tree root -> reloc tree' mapping
1288 */
1289static int __must_check __add_reloc_root(struct btrfs_root *root)
1290{
1291        struct btrfs_fs_info *fs_info = root->fs_info;
1292        struct rb_node *rb_node;
1293        struct mapping_node *node;
1294        struct reloc_control *rc = fs_info->reloc_ctl;
1295
1296        node = kmalloc(sizeof(*node), GFP_NOFS);
1297        if (!node)
1298                return -ENOMEM;
1299
1300        node->bytenr = root->node->start;
1301        node->data = root;
1302
1303        spin_lock(&rc->reloc_root_tree.lock);
1304        rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1305                              node->bytenr, &node->rb_node);
1306        spin_unlock(&rc->reloc_root_tree.lock);
1307        if (rb_node) {
1308                btrfs_panic(fs_info, -EEXIST,
1309                            "Duplicate root found for start=%llu while inserting into relocation tree",
1310                            node->bytenr);
1311                kfree(node);
1312                return -EEXIST;
1313        }
1314
1315        list_add_tail(&root->root_list, &rc->reloc_roots);
1316        return 0;
1317}
1318
1319/*
1320 * helper to delete the 'address of tree root -> reloc tree'
1321 * mapping
1322 */
1323static void __del_reloc_root(struct btrfs_root *root)
1324{
1325        struct btrfs_fs_info *fs_info = root->fs_info;
1326        struct rb_node *rb_node;
1327        struct mapping_node *node = NULL;
1328        struct reloc_control *rc = fs_info->reloc_ctl;
1329
1330        spin_lock(&rc->reloc_root_tree.lock);
1331        rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1332                              root->node->start);
1333        if (rb_node) {
1334                node = rb_entry(rb_node, struct mapping_node, rb_node);
1335                rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1336        }
1337        spin_unlock(&rc->reloc_root_tree.lock);
1338
1339        if (!node)
1340                return;
1341        BUG_ON((struct btrfs_root *)node->data != root);
1342
1343        spin_lock(&fs_info->trans_lock);
1344        list_del_init(&root->root_list);
1345        spin_unlock(&fs_info->trans_lock);
1346        kfree(node);
1347}
1348
1349/*
1350 * helper to update the 'address of tree root -> reloc tree'
1351 * mapping
1352 */
1353static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
1354{
1355        struct btrfs_fs_info *fs_info = root->fs_info;
1356        struct rb_node *rb_node;
1357        struct mapping_node *node = NULL;
1358        struct reloc_control *rc = fs_info->reloc_ctl;
1359
1360        spin_lock(&rc->reloc_root_tree.lock);
1361        rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1362                              root->node->start);
1363        if (rb_node) {
1364                node = rb_entry(rb_node, struct mapping_node, rb_node);
1365                rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1366        }
1367        spin_unlock(&rc->reloc_root_tree.lock);
1368
1369        if (!node)
1370                return 0;
1371        BUG_ON((struct btrfs_root *)node->data != root);
1372
1373        spin_lock(&rc->reloc_root_tree.lock);
1374        node->bytenr = new_bytenr;
1375        rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1376                              node->bytenr, &node->rb_node);
1377        spin_unlock(&rc->reloc_root_tree.lock);
1378        if (rb_node)
1379                backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1380        return 0;
1381}
1382
1383static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
1384                                        struct btrfs_root *root, u64 objectid)
1385{
1386        struct btrfs_fs_info *fs_info = root->fs_info;
1387        struct btrfs_root *reloc_root;
1388        struct extent_buffer *eb;
1389        struct btrfs_root_item *root_item;
1390        struct btrfs_key root_key;
1391        int ret;
1392
1393        root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
1394        BUG_ON(!root_item);
1395
1396        root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
1397        root_key.type = BTRFS_ROOT_ITEM_KEY;
1398        root_key.offset = objectid;
1399
1400        if (root->root_key.objectid == objectid) {
1401                u64 commit_root_gen;
1402
1403                /* called by btrfs_init_reloc_root */
1404                ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1405                                      BTRFS_TREE_RELOC_OBJECTID);
1406                BUG_ON(ret);
1407                /*
1408                 * Set the last_snapshot field to the generation of the commit
1409                 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
1410                 * correctly (returns true) when the relocation root is created
1411                 * either inside the critical section of a transaction commit
1412                 * (through transaction.c:qgroup_account_snapshot()) and when
1413                 * it's created before the transaction commit is started.
1414                 */
1415                commit_root_gen = btrfs_header_generation(root->commit_root);
1416                btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
1417        } else {
1418                /*
1419                 * called by btrfs_reloc_post_snapshot_hook.
1420                 * the source tree is a reloc tree, all tree blocks
1421                 * modified after it was created have RELOC flag
1422                 * set in their headers. so it's OK to not update
1423                 * the 'last_snapshot'.
1424                 */
1425                ret = btrfs_copy_root(trans, root, root->node, &eb,
1426                                      BTRFS_TREE_RELOC_OBJECTID);
1427                BUG_ON(ret);
1428        }
1429
1430        memcpy(root_item, &root->root_item, sizeof(*root_item));
1431        btrfs_set_root_bytenr(root_item, eb->start);
1432        btrfs_set_root_level(root_item, btrfs_header_level(eb));
1433        btrfs_set_root_generation(root_item, trans->transid);
1434
1435        if (root->root_key.objectid == objectid) {
1436                btrfs_set_root_refs(root_item, 0);
1437                memset(&root_item->drop_progress, 0,
1438                       sizeof(struct btrfs_disk_key));
1439                root_item->drop_level = 0;
1440        }
1441
1442        btrfs_tree_unlock(eb);
1443        free_extent_buffer(eb);
1444
1445        ret = btrfs_insert_root(trans, fs_info->tree_root,
1446                                &root_key, root_item);
1447        BUG_ON(ret);
1448        kfree(root_item);
1449
1450        reloc_root = btrfs_read_fs_root(fs_info->tree_root, &root_key);
1451        BUG_ON(IS_ERR(reloc_root));
1452        reloc_root->last_trans = trans->transid;
1453        return reloc_root;
1454}
1455
1456/*
1457 * create reloc tree for a given fs tree. reloc tree is just a
1458 * snapshot of the fs tree with special root objectid.
1459 */
1460int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1461                          struct btrfs_root *root)
1462{
1463        struct btrfs_fs_info *fs_info = root->fs_info;
1464        struct btrfs_root *reloc_root;
1465        struct reloc_control *rc = fs_info->reloc_ctl;
1466        struct btrfs_block_rsv *rsv;
1467        int clear_rsv = 0;
1468        int ret;
1469
1470        if (root->reloc_root) {
1471                reloc_root = root->reloc_root;
1472                reloc_root->last_trans = trans->transid;
1473                return 0;
1474        }
1475
1476        if (!rc || !rc->create_reloc_tree ||
1477            root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1478                return 0;
1479
1480        if (!trans->reloc_reserved) {
1481                rsv = trans->block_rsv;
1482                trans->block_rsv = rc->block_rsv;
1483                clear_rsv = 1;
1484        }
1485        reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1486        if (clear_rsv)
1487                trans->block_rsv = rsv;
1488
1489        ret = __add_reloc_root(reloc_root);
1490        BUG_ON(ret < 0);
1491        root->reloc_root = reloc_root;
1492        return 0;
1493}
1494
1495/*
1496 * update root item of reloc tree
1497 */
1498int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1499                            struct btrfs_root *root)
1500{
1501        struct btrfs_fs_info *fs_info = root->fs_info;
1502        struct btrfs_root *reloc_root;
1503        struct btrfs_root_item *root_item;
1504        int ret;
1505
1506        if (!root->reloc_root)
1507                goto out;
1508
1509        reloc_root = root->reloc_root;
1510        root_item = &reloc_root->root_item;
1511
1512        if (fs_info->reloc_ctl->merge_reloc_tree &&
1513            btrfs_root_refs(root_item) == 0) {
1514                root->reloc_root = NULL;
1515                __del_reloc_root(reloc_root);
1516        }
1517
1518        if (reloc_root->commit_root != reloc_root->node) {
1519                btrfs_set_root_node(root_item, reloc_root->node);
1520                free_extent_buffer(reloc_root->commit_root);
1521                reloc_root->commit_root = btrfs_root_node(reloc_root);
1522        }
1523
1524        ret = btrfs_update_root(trans, fs_info->tree_root,
1525                                &reloc_root->root_key, root_item);
1526        BUG_ON(ret);
1527
1528out:
1529        return 0;
1530}
1531
1532/*
1533 * helper to find first cached inode with inode number >= objectid
1534 * in a subvolume
1535 */
1536static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1537{
1538        struct rb_node *node;
1539        struct rb_node *prev;
1540        struct btrfs_inode *entry;
1541        struct inode *inode;
1542
1543        spin_lock(&root->inode_lock);
1544again:
1545        node = root->inode_tree.rb_node;
1546        prev = NULL;
1547        while (node) {
1548                prev = node;
1549                entry = rb_entry(node, struct btrfs_inode, rb_node);
1550
1551                if (objectid < btrfs_ino(&entry->vfs_inode))
1552                        node = node->rb_left;
1553                else if (objectid > btrfs_ino(&entry->vfs_inode))
1554                        node = node->rb_right;
1555                else
1556                        break;
1557        }
1558        if (!node) {
1559                while (prev) {
1560                        entry = rb_entry(prev, struct btrfs_inode, rb_node);
1561                        if (objectid <= btrfs_ino(&entry->vfs_inode)) {
1562                                node = prev;
1563                                break;
1564                        }
1565                        prev = rb_next(prev);
1566                }
1567        }
1568        while (node) {
1569                entry = rb_entry(node, struct btrfs_inode, rb_node);
1570                inode = igrab(&entry->vfs_inode);
1571                if (inode) {
1572                        spin_unlock(&root->inode_lock);
1573                        return inode;
1574                }
1575
1576                objectid = btrfs_ino(&entry->vfs_inode) + 1;
1577                if (cond_resched_lock(&root->inode_lock))
1578                        goto again;
1579
1580                node = rb_next(node);
1581        }
1582        spin_unlock(&root->inode_lock);
1583        return NULL;
1584}
1585
1586static int in_block_group(u64 bytenr,
1587                          struct btrfs_block_group_cache *block_group)
1588{
1589        if (bytenr >= block_group->key.objectid &&
1590            bytenr < block_group->key.objectid + block_group->key.offset)
1591                return 1;
1592        return 0;
1593}
1594
1595/*
1596 * get new location of data
1597 */
1598static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1599                            u64 bytenr, u64 num_bytes)
1600{
1601        struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1602        struct btrfs_path *path;
1603        struct btrfs_file_extent_item *fi;
1604        struct extent_buffer *leaf;
1605        int ret;
1606
1607        path = btrfs_alloc_path();
1608        if (!path)
1609                return -ENOMEM;
1610
1611        bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1612        ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode),
1613                                       bytenr, 0);
1614        if (ret < 0)
1615                goto out;
1616        if (ret > 0) {
1617                ret = -ENOENT;
1618                goto out;
1619        }
1620
1621        leaf = path->nodes[0];
1622        fi = btrfs_item_ptr(leaf, path->slots[0],
1623                            struct btrfs_file_extent_item);
1624
1625        BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1626               btrfs_file_extent_compression(leaf, fi) ||
1627               btrfs_file_extent_encryption(leaf, fi) ||
1628               btrfs_file_extent_other_encoding(leaf, fi));
1629
1630        if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1631                ret = -EINVAL;
1632                goto out;
1633        }
1634
1635        *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1636        ret = 0;
1637out:
1638        btrfs_free_path(path);
1639        return ret;
1640}
1641
1642/*
1643 * update file extent items in the tree leaf to point to
1644 * the new locations.
1645 */
1646static noinline_for_stack
1647int replace_file_extents(struct btrfs_trans_handle *trans,
1648                         struct reloc_control *rc,
1649                         struct btrfs_root *root,
1650                         struct extent_buffer *leaf)
1651{
1652        struct btrfs_fs_info *fs_info = root->fs_info;
1653        struct btrfs_key key;
1654        struct btrfs_file_extent_item *fi;
1655        struct inode *inode = NULL;
1656        u64 parent;
1657        u64 bytenr;
1658        u64 new_bytenr = 0;
1659        u64 num_bytes;
1660        u64 end;
1661        u32 nritems;
1662        u32 i;
1663        int ret = 0;
1664        int first = 1;
1665        int dirty = 0;
1666
1667        if (rc->stage != UPDATE_DATA_PTRS)
1668                return 0;
1669
1670        /* reloc trees always use full backref */
1671        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1672                parent = leaf->start;
1673        else
1674                parent = 0;
1675
1676        nritems = btrfs_header_nritems(leaf);
1677        for (i = 0; i < nritems; i++) {
1678                cond_resched();
1679                btrfs_item_key_to_cpu(leaf, &key, i);
1680                if (key.type != BTRFS_EXTENT_DATA_KEY)
1681                        continue;
1682                fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1683                if (btrfs_file_extent_type(leaf, fi) ==
1684                    BTRFS_FILE_EXTENT_INLINE)
1685                        continue;
1686                bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1687                num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1688                if (bytenr == 0)
1689                        continue;
1690                if (!in_block_group(bytenr, rc->block_group))
1691                        continue;
1692
1693                /*
1694                 * if we are modifying block in fs tree, wait for readpage
1695                 * to complete and drop the extent cache
1696                 */
1697                if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1698                        if (first) {
1699                                inode = find_next_inode(root, key.objectid);
1700                                first = 0;
1701                        } else if (inode && btrfs_ino(inode) < key.objectid) {
1702                                btrfs_add_delayed_iput(inode);
1703                                inode = find_next_inode(root, key.objectid);
1704                        }
1705                        if (inode && btrfs_ino(inode) == key.objectid) {
1706                                end = key.offset +
1707                                      btrfs_file_extent_num_bytes(leaf, fi);
1708                                WARN_ON(!IS_ALIGNED(key.offset,
1709                                                    fs_info->sectorsize));
1710                                WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
1711                                end--;
1712                                ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1713                                                      key.offset, end);
1714                                if (!ret)
1715                                        continue;
1716
1717                                btrfs_drop_extent_cache(inode, key.offset, end,
1718                                                        1);
1719                                unlock_extent(&BTRFS_I(inode)->io_tree,
1720                                              key.offset, end);
1721                        }
1722                }
1723
1724                ret = get_new_location(rc->data_inode, &new_bytenr,
1725                                       bytenr, num_bytes);
1726                if (ret) {
1727                        /*
1728                         * Don't have to abort since we've not changed anything
1729                         * in the file extent yet.
1730                         */
1731                        break;
1732                }
1733
1734                btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1735                dirty = 1;
1736
1737                key.offset -= btrfs_file_extent_offset(leaf, fi);
1738                ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr,
1739                                           num_bytes, parent,
1740                                           btrfs_header_owner(leaf),
1741                                           key.objectid, key.offset);
1742                if (ret) {
1743                        btrfs_abort_transaction(trans, ret);
1744                        break;
1745                }
1746
1747                ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes,
1748                                        parent, btrfs_header_owner(leaf),
1749                                        key.objectid, key.offset);
1750                if (ret) {
1751                        btrfs_abort_transaction(trans, ret);
1752                        break;
1753                }
1754        }
1755        if (dirty)
1756                btrfs_mark_buffer_dirty(leaf);
1757        if (inode)
1758                btrfs_add_delayed_iput(inode);
1759        return ret;
1760}
1761
1762static noinline_for_stack
1763int memcmp_node_keys(struct extent_buffer *eb, int slot,
1764                     struct btrfs_path *path, int level)
1765{
1766        struct btrfs_disk_key key1;
1767        struct btrfs_disk_key key2;
1768        btrfs_node_key(eb, &key1, slot);
1769        btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1770        return memcmp(&key1, &key2, sizeof(key1));
1771}
1772
1773/*
1774 * try to replace tree blocks in fs tree with the new blocks
1775 * in reloc tree. tree blocks haven't been modified since the
1776 * reloc tree was create can be replaced.
1777 *
1778 * if a block was replaced, level of the block + 1 is returned.
1779 * if no block got replaced, 0 is returned. if there are other
1780 * errors, a negative error number is returned.
1781 */
1782static noinline_for_stack
1783int replace_path(struct btrfs_trans_handle *trans,
1784                 struct btrfs_root *dest, struct btrfs_root *src,
1785                 struct btrfs_path *path, struct btrfs_key *next_key,
1786                 int lowest_level, int max_level)
1787{
1788        struct btrfs_fs_info *fs_info = dest->fs_info;
1789        struct extent_buffer *eb;
1790        struct extent_buffer *parent;
1791        struct btrfs_key key;
1792        u64 old_bytenr;
1793        u64 new_bytenr;
1794        u64 old_ptr_gen;
1795        u64 new_ptr_gen;
1796        u64 last_snapshot;
1797        u32 blocksize;
1798        int cow = 0;
1799        int level;
1800        int ret;
1801        int slot;
1802
1803        BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1804        BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1805
1806        last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1807again:
1808        slot = path->slots[lowest_level];
1809        btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1810
1811        eb = btrfs_lock_root_node(dest);
1812        btrfs_set_lock_blocking(eb);
1813        level = btrfs_header_level(eb);
1814
1815        if (level < lowest_level) {
1816                btrfs_tree_unlock(eb);
1817                free_extent_buffer(eb);
1818                return 0;
1819        }
1820
1821        if (cow) {
1822                ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1823                BUG_ON(ret);
1824        }
1825        btrfs_set_lock_blocking(eb);
1826
1827        if (next_key) {
1828                next_key->objectid = (u64)-1;
1829                next_key->type = (u8)-1;
1830                next_key->offset = (u64)-1;
1831        }
1832
1833        parent = eb;
1834        while (1) {
1835                level = btrfs_header_level(parent);
1836                BUG_ON(level < lowest_level);
1837
1838                ret = btrfs_bin_search(parent, &key, level, &slot);
1839                if (ret && slot > 0)
1840                        slot--;
1841
1842                if (next_key && slot + 1 < btrfs_header_nritems(parent))
1843                        btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1844
1845                old_bytenr = btrfs_node_blockptr(parent, slot);
1846                blocksize = fs_info->nodesize;
1847                old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1848
1849                if (level <= max_level) {
1850                        eb = path->nodes[level];
1851                        new_bytenr = btrfs_node_blockptr(eb,
1852                                                        path->slots[level]);
1853                        new_ptr_gen = btrfs_node_ptr_generation(eb,
1854                                                        path->slots[level]);
1855                } else {
1856                        new_bytenr = 0;
1857                        new_ptr_gen = 0;
1858                }
1859
1860                if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
1861                        ret = level;
1862                        break;
1863                }
1864
1865                if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1866                    memcmp_node_keys(parent, slot, path, level)) {
1867                        if (level <= lowest_level) {
1868                                ret = 0;
1869                                break;
1870                        }
1871
1872                        eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen);
1873                        if (IS_ERR(eb)) {
1874                                ret = PTR_ERR(eb);
1875                                break;
1876                        } else if (!extent_buffer_uptodate(eb)) {
1877                                ret = -EIO;
1878                                free_extent_buffer(eb);
1879                                break;
1880                        }
1881                        btrfs_tree_lock(eb);
1882                        if (cow) {
1883                                ret = btrfs_cow_block(trans, dest, eb, parent,
1884                                                      slot, &eb);
1885                                BUG_ON(ret);
1886                        }
1887                        btrfs_set_lock_blocking(eb);
1888
1889                        btrfs_tree_unlock(parent);
1890                        free_extent_buffer(parent);
1891
1892                        parent = eb;
1893                        continue;
1894                }
1895
1896                if (!cow) {
1897                        btrfs_tree_unlock(parent);
1898                        free_extent_buffer(parent);
1899                        cow = 1;
1900                        goto again;
1901                }
1902
1903                btrfs_node_key_to_cpu(path->nodes[level], &key,
1904                                      path->slots[level]);
1905                btrfs_release_path(path);
1906
1907                path->lowest_level = level;
1908                ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1909                path->lowest_level = 0;
1910                BUG_ON(ret);
1911
1912                /*
1913                 * Info qgroup to trace both subtrees.
1914                 *
1915                 * We must trace both trees.
1916                 * 1) Tree reloc subtree
1917                 *    If not traced, we will leak data numbers
1918                 * 2) Fs subtree
1919                 *    If not traced, we will double count old data
1920                 *    and tree block numbers, if current trans doesn't free
1921                 *    data reloc tree inode.
1922                 */
1923                ret = btrfs_qgroup_trace_subtree(trans, src, parent,
1924                                btrfs_header_generation(parent),
1925                                btrfs_header_level(parent));
1926                if (ret < 0)
1927                        break;
1928                ret = btrfs_qgroup_trace_subtree(trans, dest,
1929                                path->nodes[level],
1930                                btrfs_header_generation(path->nodes[level]),
1931                                btrfs_header_level(path->nodes[level]));
1932                if (ret < 0)
1933                        break;
1934
1935                /*
1936                 * swap blocks in fs tree and reloc tree.
1937                 */
1938                btrfs_set_node_blockptr(parent, slot, new_bytenr);
1939                btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1940                btrfs_mark_buffer_dirty(parent);
1941
1942                btrfs_set_node_blockptr(path->nodes[level],
1943                                        path->slots[level], old_bytenr);
1944                btrfs_set_node_ptr_generation(path->nodes[level],
1945                                              path->slots[level], old_ptr_gen);
1946                btrfs_mark_buffer_dirty(path->nodes[level]);
1947
1948                ret = btrfs_inc_extent_ref(trans, fs_info, old_bytenr,
1949                                        blocksize, path->nodes[level]->start,
1950                                        src->root_key.objectid, level - 1, 0);
1951                BUG_ON(ret);
1952                ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr,
1953                                        blocksize, 0, dest->root_key.objectid,
1954                                        level - 1, 0);
1955                BUG_ON(ret);
1956
1957                ret = btrfs_free_extent(trans, fs_info, new_bytenr, blocksize,
1958                                        path->nodes[level]->start,
1959                                        src->root_key.objectid, level - 1, 0);
1960                BUG_ON(ret);
1961
1962                ret = btrfs_free_extent(trans, fs_info, old_bytenr, blocksize,
1963                                        0, dest->root_key.objectid, level - 1,
1964                                        0);
1965                BUG_ON(ret);
1966
1967                btrfs_unlock_up_safe(path, 0);
1968
1969                ret = level;
1970                break;
1971        }
1972        btrfs_tree_unlock(parent);
1973        free_extent_buffer(parent);
1974        return ret;
1975}
1976
1977/*
1978 * helper to find next relocated block in reloc tree
1979 */
1980static noinline_for_stack
1981int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1982                       int *level)
1983{
1984        struct extent_buffer *eb;
1985        int i;
1986        u64 last_snapshot;
1987        u32 nritems;
1988
1989        last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1990
1991        for (i = 0; i < *level; i++) {
1992                free_extent_buffer(path->nodes[i]);
1993                path->nodes[i] = NULL;
1994        }
1995
1996        for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1997                eb = path->nodes[i];
1998                nritems = btrfs_header_nritems(eb);
1999                while (path->slots[i] + 1 < nritems) {
2000                        path->slots[i]++;
2001                        if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
2002                            last_snapshot)
2003                                continue;
2004
2005                        *level = i;
2006                        return 0;
2007                }
2008                free_extent_buffer(path->nodes[i]);
2009                path->nodes[i] = NULL;
2010        }
2011        return 1;
2012}
2013
2014/*
2015 * walk down reloc tree to find relocated block of lowest level
2016 */
2017static noinline_for_stack
2018int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
2019                         int *level)
2020{
2021        struct btrfs_fs_info *fs_info = root->fs_info;
2022        struct extent_buffer *eb = NULL;
2023        int i;
2024        u64 bytenr;
2025        u64 ptr_gen = 0;
2026        u64 last_snapshot;
2027        u32 nritems;
2028
2029        last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2030
2031        for (i = *level; i > 0; i--) {
2032                eb = path->nodes[i];
2033                nritems = btrfs_header_nritems(eb);
2034                while (path->slots[i] < nritems) {
2035                        ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
2036                        if (ptr_gen > last_snapshot)
2037                                break;
2038                        path->slots[i]++;
2039                }
2040                if (path->slots[i] >= nritems) {
2041                        if (i == *level)
2042                                break;
2043                        *level = i + 1;
2044                        return 0;
2045                }
2046                if (i == 1) {
2047                        *level = i;
2048                        return 0;
2049                }
2050
2051                bytenr = btrfs_node_blockptr(eb, path->slots[i]);
2052                eb = read_tree_block(fs_info, bytenr, ptr_gen);
2053                if (IS_ERR(eb)) {
2054                        return PTR_ERR(eb);
2055                } else if (!extent_buffer_uptodate(eb)) {
2056                        free_extent_buffer(eb);
2057                        return -EIO;
2058                }
2059                BUG_ON(btrfs_header_level(eb) != i - 1);
2060                path->nodes[i - 1] = eb;
2061                path->slots[i - 1] = 0;
2062        }
2063        return 1;
2064}
2065
2066/*
2067 * invalidate extent cache for file extents whose key in range of
2068 * [min_key, max_key)
2069 */
2070static int invalidate_extent_cache(struct btrfs_root *root,
2071                                   struct btrfs_key *min_key,
2072                                   struct btrfs_key *max_key)
2073{
2074        struct btrfs_fs_info *fs_info = root->fs_info;
2075        struct inode *inode = NULL;
2076        u64 objectid;
2077        u64 start, end;
2078        u64 ino;
2079
2080        objectid = min_key->objectid;
2081        while (1) {
2082                cond_resched();
2083                iput(inode);
2084
2085                if (objectid > max_key->objectid)
2086                        break;
2087
2088                inode = find_next_inode(root, objectid);
2089                if (!inode)
2090                        break;
2091                ino = btrfs_ino(inode);
2092
2093                if (ino > max_key->objectid) {
2094                        iput(inode);
2095                        break;
2096                }
2097
2098                objectid = ino + 1;
2099                if (!S_ISREG(inode->i_mode))
2100                        continue;
2101
2102                if (unlikely(min_key->objectid == ino)) {
2103                        if (min_key->type > BTRFS_EXTENT_DATA_KEY)
2104                                continue;
2105                        if (min_key->type < BTRFS_EXTENT_DATA_KEY)
2106                                start = 0;
2107                        else {
2108                                start = min_key->offset;
2109                                WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
2110                        }
2111                } else {
2112                        start = 0;
2113                }
2114
2115                if (unlikely(max_key->objectid == ino)) {
2116                        if (max_key->type < BTRFS_EXTENT_DATA_KEY)
2117                                continue;
2118                        if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
2119                                end = (u64)-1;
2120                        } else {
2121                                if (max_key->offset == 0)
2122                                        continue;
2123                                end = max_key->offset;
2124                                WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
2125                                end--;
2126                        }
2127                } else {
2128                        end = (u64)-1;
2129                }
2130
2131                /* the lock_extent waits for readpage to complete */
2132                lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2133                btrfs_drop_extent_cache(inode, start, end, 1);
2134                unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2135        }
2136        return 0;
2137}
2138
2139static int find_next_key(struct btrfs_path *path, int level,
2140                         struct btrfs_key *key)
2141
2142{
2143        while (level < BTRFS_MAX_LEVEL) {
2144                if (!path->nodes[level])
2145                        break;
2146                if (path->slots[level] + 1 <
2147                    btrfs_header_nritems(path->nodes[level])) {
2148                        btrfs_node_key_to_cpu(path->nodes[level], key,
2149                                              path->slots[level] + 1);
2150                        return 0;
2151                }
2152                level++;
2153        }
2154        return 1;
2155}
2156
2157/*
2158 * merge the relocated tree blocks in reloc tree with corresponding
2159 * fs tree.
2160 */
2161static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2162                                               struct btrfs_root *root)
2163{
2164        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2165        LIST_HEAD(inode_list);
2166        struct btrfs_key key;
2167        struct btrfs_key next_key;
2168        struct btrfs_trans_handle *trans = NULL;
2169        struct btrfs_root *reloc_root;
2170        struct btrfs_root_item *root_item;
2171        struct btrfs_path *path;
2172        struct extent_buffer *leaf;
2173        int level;
2174        int max_level;
2175        int replaced = 0;
2176        int ret;
2177        int err = 0;
2178        u32 min_reserved;
2179
2180        path = btrfs_alloc_path();
2181        if (!path)
2182                return -ENOMEM;
2183        path->reada = READA_FORWARD;
2184
2185        reloc_root = root->reloc_root;
2186        root_item = &reloc_root->root_item;
2187
2188        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
2189                level = btrfs_root_level(root_item);
2190                extent_buffer_get(reloc_root->node);
2191                path->nodes[level] = reloc_root->node;
2192                path->slots[level] = 0;
2193        } else {
2194                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
2195
2196                level = root_item->drop_level;
2197                BUG_ON(level == 0);
2198                path->lowest_level = level;
2199                ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
2200                path->lowest_level = 0;
2201                if (ret < 0) {
2202                        btrfs_free_path(path);
2203                        return ret;
2204                }
2205
2206                btrfs_node_key_to_cpu(path->nodes[level], &next_key,
2207                                      path->slots[level]);
2208                WARN_ON(memcmp(&key, &next_key, sizeof(key)));
2209
2210                btrfs_unlock_up_safe(path, 0);
2211        }
2212
2213        min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2214        memset(&next_key, 0, sizeof(next_key));
2215
2216        while (1) {
2217                ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
2218                                             BTRFS_RESERVE_FLUSH_ALL);
2219                if (ret) {
2220                        err = ret;
2221                        goto out;
2222                }
2223                trans = btrfs_start_transaction(root, 0);
2224                if (IS_ERR(trans)) {
2225                        err = PTR_ERR(trans);
2226                        trans = NULL;
2227                        goto out;
2228                }
2229                trans->block_rsv = rc->block_rsv;
2230
2231                replaced = 0;
2232                max_level = level;
2233
2234                ret = walk_down_reloc_tree(reloc_root, path, &level);
2235                if (ret < 0) {
2236                        err = ret;
2237                        goto out;
2238                }
2239                if (ret > 0)
2240                        break;
2241
2242                if (!find_next_key(path, level, &key) &&
2243                    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
2244                        ret = 0;
2245                } else {
2246                        ret = replace_path(trans, root, reloc_root, path,
2247                                           &next_key, level, max_level);
2248                }
2249                if (ret < 0) {
2250                        err = ret;
2251                        goto out;
2252                }
2253
2254                if (ret > 0) {
2255                        level = ret;
2256                        btrfs_node_key_to_cpu(path->nodes[level], &key,
2257                                              path->slots[level]);
2258                        replaced = 1;
2259                }
2260
2261                ret = walk_up_reloc_tree(reloc_root, path, &level);
2262                if (ret > 0)
2263                        break;
2264
2265                BUG_ON(level == 0);
2266                /*
2267                 * save the merging progress in the drop_progress.
2268                 * this is OK since root refs == 1 in this case.
2269                 */
2270                btrfs_node_key(path->nodes[level], &root_item->drop_progress,
2271                               path->slots[level]);
2272                root_item->drop_level = level;
2273
2274                btrfs_end_transaction_throttle(trans);
2275                trans = NULL;
2276
2277                btrfs_btree_balance_dirty(fs_info);
2278
2279                if (replaced && rc->stage == UPDATE_DATA_PTRS)
2280                        invalidate_extent_cache(root, &key, &next_key);
2281        }
2282
2283        /*
2284         * handle the case only one block in the fs tree need to be
2285         * relocated and the block is tree root.
2286         */
2287        leaf = btrfs_lock_root_node(root);
2288        ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
2289        btrfs_tree_unlock(leaf);
2290        free_extent_buffer(leaf);
2291        if (ret < 0)
2292                err = ret;
2293out:
2294        btrfs_free_path(path);
2295
2296        if (err == 0) {
2297                memset(&root_item->drop_progress, 0,
2298                       sizeof(root_item->drop_progress));
2299                root_item->drop_level = 0;
2300                btrfs_set_root_refs(root_item, 0);
2301                btrfs_update_reloc_root(trans, root);
2302        }
2303
2304        if (trans)
2305                btrfs_end_transaction_throttle(trans);
2306
2307        btrfs_btree_balance_dirty(fs_info);
2308
2309        if (replaced && rc->stage == UPDATE_DATA_PTRS)
2310                invalidate_extent_cache(root, &key, &next_key);
2311
2312        return err;
2313}
2314
2315static noinline_for_stack
2316int prepare_to_merge(struct reloc_control *rc, int err)
2317{
2318        struct btrfs_root *root = rc->extent_root;
2319        struct btrfs_fs_info *fs_info = root->fs_info;
2320        struct btrfs_root *reloc_root;
2321        struct btrfs_trans_handle *trans;
2322        LIST_HEAD(reloc_roots);
2323        u64 num_bytes = 0;
2324        int ret;
2325
2326        mutex_lock(&fs_info->reloc_mutex);
2327        rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2328        rc->merging_rsv_size += rc->nodes_relocated * 2;
2329        mutex_unlock(&fs_info->reloc_mutex);
2330
2331again:
2332        if (!err) {
2333                num_bytes = rc->merging_rsv_size;
2334                ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
2335                                          BTRFS_RESERVE_FLUSH_ALL);
2336                if (ret)
2337                        err = ret;
2338        }
2339
2340        trans = btrfs_join_transaction(rc->extent_root);
2341        if (IS_ERR(trans)) {
2342                if (!err)
2343                        btrfs_block_rsv_release(fs_info, rc->block_rsv,
2344                                                num_bytes);
2345                return PTR_ERR(trans);
2346        }
2347
2348        if (!err) {
2349                if (num_bytes != rc->merging_rsv_size) {
2350                        btrfs_end_transaction(trans);
2351                        btrfs_block_rsv_release(fs_info, rc->block_rsv,
2352                                                num_bytes);
2353                        goto again;
2354                }
2355        }
2356
2357        rc->merge_reloc_tree = 1;
2358
2359        while (!list_empty(&rc->reloc_roots)) {
2360                reloc_root = list_entry(rc->reloc_roots.next,
2361                                        struct btrfs_root, root_list);
2362                list_del_init(&reloc_root->root_list);
2363
2364                root = read_fs_root(fs_info, reloc_root->root_key.offset);
2365                BUG_ON(IS_ERR(root));
2366                BUG_ON(root->reloc_root != reloc_root);
2367
2368                /*
2369                 * set reference count to 1, so btrfs_recover_relocation
2370                 * knows it should resumes merging
2371                 */
2372                if (!err)
2373                        btrfs_set_root_refs(&reloc_root->root_item, 1);
2374                btrfs_update_reloc_root(trans, root);
2375
2376                list_add(&reloc_root->root_list, &reloc_roots);
2377        }
2378
2379        list_splice(&reloc_roots, &rc->reloc_roots);
2380
2381        if (!err)
2382                btrfs_commit_transaction(trans);
2383        else
2384                btrfs_end_transaction(trans);
2385        return err;
2386}
2387
2388static noinline_for_stack
2389void free_reloc_roots(struct list_head *list)
2390{
2391        struct btrfs_root *reloc_root;
2392
2393        while (!list_empty(list)) {
2394                reloc_root = list_entry(list->next, struct btrfs_root,
2395                                        root_list);
2396                free_extent_buffer(reloc_root->node);
2397                free_extent_buffer(reloc_root->commit_root);
2398                reloc_root->node = NULL;
2399                reloc_root->commit_root = NULL;
2400                __del_reloc_root(reloc_root);
2401        }
2402}
2403
2404static noinline_for_stack
2405void merge_reloc_roots(struct reloc_control *rc)
2406{
2407        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2408        struct btrfs_root *root;
2409        struct btrfs_root *reloc_root;
2410        LIST_HEAD(reloc_roots);
2411        int found = 0;
2412        int ret = 0;
2413again:
2414        root = rc->extent_root;
2415
2416        /*
2417         * this serializes us with btrfs_record_root_in_transaction,
2418         * we have to make sure nobody is in the middle of
2419         * adding their roots to the list while we are
2420         * doing this splice
2421         */
2422        mutex_lock(&fs_info->reloc_mutex);
2423        list_splice_init(&rc->reloc_roots, &reloc_roots);
2424        mutex_unlock(&fs_info->reloc_mutex);
2425
2426        while (!list_empty(&reloc_roots)) {
2427                found = 1;
2428                reloc_root = list_entry(reloc_roots.next,
2429                                        struct btrfs_root, root_list);
2430
2431                if (btrfs_root_refs(&reloc_root->root_item) > 0) {
2432                        root = read_fs_root(fs_info,
2433                                            reloc_root->root_key.offset);
2434                        BUG_ON(IS_ERR(root));
2435                        BUG_ON(root->reloc_root != reloc_root);
2436
2437                        ret = merge_reloc_root(rc, root);
2438                        if (ret) {
2439                                if (list_empty(&reloc_root->root_list))
2440                                        list_add_tail(&reloc_root->root_list,
2441                                                      &reloc_roots);
2442                                goto out;
2443                        }
2444                } else {
2445                        list_del_init(&reloc_root->root_list);
2446                }
2447
2448                ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2449                if (ret < 0) {
2450                        if (list_empty(&reloc_root->root_list))
2451                                list_add_tail(&reloc_root->root_list,
2452                                              &reloc_roots);
2453                        goto out;
2454                }
2455        }
2456
2457        if (found) {
2458                found = 0;
2459                goto again;
2460        }
2461out:
2462        if (ret) {
2463                btrfs_handle_fs_error(fs_info, ret, NULL);
2464                if (!list_empty(&reloc_roots))
2465                        free_reloc_roots(&reloc_roots);
2466
2467                /* new reloc root may be added */
2468                mutex_lock(&fs_info->reloc_mutex);
2469                list_splice_init(&rc->reloc_roots, &reloc_roots);
2470                mutex_unlock(&fs_info->reloc_mutex);
2471                if (!list_empty(&reloc_roots))
2472                        free_reloc_roots(&reloc_roots);
2473        }
2474
2475        BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2476}
2477
2478static void free_block_list(struct rb_root *blocks)
2479{
2480        struct tree_block *block;
2481        struct rb_node *rb_node;
2482        while ((rb_node = rb_first(blocks))) {
2483                block = rb_entry(rb_node, struct tree_block, rb_node);
2484                rb_erase(rb_node, blocks);
2485                kfree(block);
2486        }
2487}
2488
2489static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2490                                      struct btrfs_root *reloc_root)
2491{
2492        struct btrfs_fs_info *fs_info = reloc_root->fs_info;
2493        struct btrfs_root *root;
2494
2495        if (reloc_root->last_trans == trans->transid)
2496                return 0;
2497
2498        root = read_fs_root(fs_info, reloc_root->root_key.offset);
2499        BUG_ON(IS_ERR(root));
2500        BUG_ON(root->reloc_root != reloc_root);
2501
2502        return btrfs_record_root_in_trans(trans, root);
2503}
2504
2505static noinline_for_stack
2506struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2507                                     struct reloc_control *rc,
2508                                     struct backref_node *node,
2509                                     struct backref_edge *edges[])
2510{
2511        struct backref_node *next;
2512        struct btrfs_root *root;
2513        int index = 0;
2514
2515        next = node;
2516        while (1) {
2517                cond_resched();
2518                next = walk_up_backref(next, edges, &index);
2519                root = next->root;
2520                BUG_ON(!root);
2521                BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
2522
2523                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2524                        record_reloc_root_in_trans(trans, root);
2525                        break;
2526                }
2527
2528                btrfs_record_root_in_trans(trans, root);
2529                root = root->reloc_root;
2530
2531                if (next->new_bytenr != root->node->start) {
2532                        BUG_ON(next->new_bytenr);
2533                        BUG_ON(!list_empty(&next->list));
2534                        next->new_bytenr = root->node->start;
2535                        next->root = root;
2536                        list_add_tail(&next->list,
2537                                      &rc->backref_cache.changed);
2538                        __mark_block_processed(rc, next);
2539                        break;
2540                }
2541
2542                WARN_ON(1);
2543                root = NULL;
2544                next = walk_down_backref(edges, &index);
2545                if (!next || next->level <= node->level)
2546                        break;
2547        }
2548        if (!root)
2549                return NULL;
2550
2551        next = node;
2552        /* setup backref node path for btrfs_reloc_cow_block */
2553        while (1) {
2554                rc->backref_cache.path[next->level] = next;
2555                if (--index < 0)
2556                        break;
2557                next = edges[index]->node[UPPER];
2558        }
2559        return root;
2560}
2561
2562/*
2563 * select a tree root for relocation. return NULL if the block
2564 * is reference counted. we should use do_relocation() in this
2565 * case. return a tree root pointer if the block isn't reference
2566 * counted. return -ENOENT if the block is root of reloc tree.
2567 */
2568static noinline_for_stack
2569struct btrfs_root *select_one_root(struct backref_node *node)
2570{
2571        struct backref_node *next;
2572        struct btrfs_root *root;
2573        struct btrfs_root *fs_root = NULL;
2574        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2575        int index = 0;
2576
2577        next = node;
2578        while (1) {
2579                cond_resched();
2580                next = walk_up_backref(next, edges, &index);
2581                root = next->root;
2582                BUG_ON(!root);
2583
2584                /* no other choice for non-references counted tree */
2585                if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
2586                        return root;
2587
2588                if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2589                        fs_root = root;
2590
2591                if (next != node)
2592                        return NULL;
2593
2594                next = walk_down_backref(edges, &index);
2595                if (!next || next->level <= node->level)
2596                        break;
2597        }
2598
2599        if (!fs_root)
2600                return ERR_PTR(-ENOENT);
2601        return fs_root;
2602}
2603
2604static noinline_for_stack
2605u64 calcu_metadata_size(struct reloc_control *rc,
2606                        struct backref_node *node, int reserve)
2607{
2608        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2609        struct backref_node *next = node;
2610        struct backref_edge *edge;
2611        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2612        u64 num_bytes = 0;
2613        int index = 0;
2614
2615        BUG_ON(reserve && node->processed);
2616
2617        while (next) {
2618                cond_resched();
2619                while (1) {
2620                        if (next->processed && (reserve || next != node))
2621                                break;
2622
2623                        num_bytes += fs_info->nodesize;
2624
2625                        if (list_empty(&next->upper))
2626                                break;
2627
2628                        edge = list_entry(next->upper.next,
2629                                          struct backref_edge, list[LOWER]);
2630                        edges[index++] = edge;
2631                        next = edge->node[UPPER];
2632                }
2633                next = walk_down_backref(edges, &index);
2634        }
2635        return num_bytes;
2636}
2637
2638static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2639                                  struct reloc_control *rc,
2640                                  struct backref_node *node)
2641{
2642        struct btrfs_root *root = rc->extent_root;
2643        struct btrfs_fs_info *fs_info = root->fs_info;
2644        u64 num_bytes;
2645        int ret;
2646        u64 tmp;
2647
2648        num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2649
2650        trans->block_rsv = rc->block_rsv;
2651        rc->reserved_bytes += num_bytes;
2652
2653        /*
2654         * We are under a transaction here so we can only do limited flushing.
2655         * If we get an enospc just kick back -EAGAIN so we know to drop the
2656         * transaction and try to refill when we can flush all the things.
2657         */
2658        ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
2659                                BTRFS_RESERVE_FLUSH_LIMIT);
2660        if (ret) {
2661                tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
2662                while (tmp <= rc->reserved_bytes)
2663                        tmp <<= 1;
2664                /*
2665                 * only one thread can access block_rsv at this point,
2666                 * so we don't need hold lock to protect block_rsv.
2667                 * we expand more reservation size here to allow enough
2668                 * space for relocation and we will return eailer in
2669                 * enospc case.
2670                 */
2671                rc->block_rsv->size = tmp + fs_info->nodesize *
2672                                      RELOCATION_RESERVED_NODES;
2673                return -EAGAIN;
2674        }
2675
2676        return 0;
2677}
2678
2679/*
2680 * relocate a block tree, and then update pointers in upper level
2681 * blocks that reference the block to point to the new location.
2682 *
2683 * if called by link_to_upper, the block has already been relocated.
2684 * in that case this function just updates pointers.
2685 */
2686static int do_relocation(struct btrfs_trans_handle *trans,
2687                         struct reloc_control *rc,
2688                         struct backref_node *node,
2689                         struct btrfs_key *key,
2690                         struct btrfs_path *path, int lowest)
2691{
2692        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2693        struct backref_node *upper;
2694        struct backref_edge *edge;
2695        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2696        struct btrfs_root *root;
2697        struct extent_buffer *eb;
2698        u32 blocksize;
2699        u64 bytenr;
2700        u64 generation;
2701        int slot;
2702        int ret;
2703        int err = 0;
2704
2705        BUG_ON(lowest && node->eb);
2706
2707        path->lowest_level = node->level + 1;
2708        rc->backref_cache.path[node->level] = node;
2709        list_for_each_entry(edge, &node->upper, list[LOWER]) {
2710                cond_resched();
2711
2712                upper = edge->node[UPPER];
2713                root = select_reloc_root(trans, rc, upper, edges);
2714                BUG_ON(!root);
2715
2716                if (upper->eb && !upper->locked) {
2717                        if (!lowest) {
2718                                ret = btrfs_bin_search(upper->eb, key,
2719                                                       upper->level, &slot);
2720                                BUG_ON(ret);
2721                                bytenr = btrfs_node_blockptr(upper->eb, slot);
2722                                if (node->eb->start == bytenr)
2723                                        goto next;
2724                        }
2725                        drop_node_buffer(upper);
2726                }
2727
2728                if (!upper->eb) {
2729                        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2730                        if (ret) {
2731                                if (ret < 0)
2732                                        err = ret;
2733                                else
2734                                        err = -ENOENT;
2735
2736                                btrfs_release_path(path);
2737                                break;
2738                        }
2739
2740                        if (!upper->eb) {
2741                                upper->eb = path->nodes[upper->level];
2742                                path->nodes[upper->level] = NULL;
2743                        } else {
2744                                BUG_ON(upper->eb != path->nodes[upper->level]);
2745                        }
2746
2747                        upper->locked = 1;
2748                        path->locks[upper->level] = 0;
2749
2750                        slot = path->slots[upper->level];
2751                        btrfs_release_path(path);
2752                } else {
2753                        ret = btrfs_bin_search(upper->eb, key, upper->level,
2754                                               &slot);
2755                        BUG_ON(ret);
2756                }
2757
2758                bytenr = btrfs_node_blockptr(upper->eb, slot);
2759                if (lowest) {
2760                        if (bytenr != node->bytenr) {
2761                                btrfs_err(root->fs_info,
2762                "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
2763                                          bytenr, node->bytenr, slot,
2764                                          upper->eb->start);
2765                                err = -EIO;
2766                                goto next;
2767                        }
2768                } else {
2769                        if (node->eb->start == bytenr)
2770                                goto next;
2771                }
2772
2773                blocksize = root->fs_info->nodesize;
2774                generation = btrfs_node_ptr_generation(upper->eb, slot);
2775                eb = read_tree_block(fs_info, bytenr, generation);
2776                if (IS_ERR(eb)) {
2777                        err = PTR_ERR(eb);
2778                        goto next;
2779                } else if (!extent_buffer_uptodate(eb)) {
2780                        free_extent_buffer(eb);
2781                        err = -EIO;
2782                        goto next;
2783                }
2784                btrfs_tree_lock(eb);
2785                btrfs_set_lock_blocking(eb);
2786
2787                if (!node->eb) {
2788                        ret = btrfs_cow_block(trans, root, eb, upper->eb,
2789                                              slot, &eb);
2790                        btrfs_tree_unlock(eb);
2791                        free_extent_buffer(eb);
2792                        if (ret < 0) {
2793                                err = ret;
2794                                goto next;
2795                        }
2796                        BUG_ON(node->eb != eb);
2797                } else {
2798                        btrfs_set_node_blockptr(upper->eb, slot,
2799                                                node->eb->start);
2800                        btrfs_set_node_ptr_generation(upper->eb, slot,
2801                                                      trans->transid);
2802                        btrfs_mark_buffer_dirty(upper->eb);
2803
2804                        ret = btrfs_inc_extent_ref(trans, root->fs_info,
2805                                                node->eb->start, blocksize,
2806                                                upper->eb->start,
2807                                                btrfs_header_owner(upper->eb),
2808                                                node->level, 0);
2809                        BUG_ON(ret);
2810
2811                        ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2812                        BUG_ON(ret);
2813                }
2814next:
2815                if (!upper->pending)
2816                        drop_node_buffer(upper);
2817                else
2818                        unlock_node_buffer(upper);
2819                if (err)
2820                        break;
2821        }
2822
2823        if (!err && node->pending) {
2824                drop_node_buffer(node);
2825                list_move_tail(&node->list, &rc->backref_cache.changed);
2826                node->pending = 0;
2827        }
2828
2829        path->lowest_level = 0;
2830        BUG_ON(err == -ENOSPC);
2831        return err;
2832}
2833
2834static int link_to_upper(struct btrfs_trans_handle *trans,
2835                         struct reloc_control *rc,
2836                         struct backref_node *node,
2837                         struct btrfs_path *path)
2838{
2839        struct btrfs_key key;
2840
2841        btrfs_node_key_to_cpu(node->eb, &key, 0);
2842        return do_relocation(trans, rc, node, &key, path, 0);
2843}
2844
2845static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2846                                struct reloc_control *rc,
2847                                struct btrfs_path *path, int err)
2848{
2849        LIST_HEAD(list);
2850        struct backref_cache *cache = &rc->backref_cache;
2851        struct backref_node *node;
2852        int level;
2853        int ret;
2854
2855        for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2856                while (!list_empty(&cache->pending[level])) {
2857                        node = list_entry(cache->pending[level].next,
2858                                          struct backref_node, list);
2859                        list_move_tail(&node->list, &list);
2860                        BUG_ON(!node->pending);
2861
2862                        if (!err) {
2863                                ret = link_to_upper(trans, rc, node, path);
2864                                if (ret < 0)
2865                                        err = ret;
2866                        }
2867                }
2868                list_splice_init(&list, &cache->pending[level]);
2869        }
2870        return err;
2871}
2872
2873static void mark_block_processed(struct reloc_control *rc,
2874                                 u64 bytenr, u32 blocksize)
2875{
2876        set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2877                        EXTENT_DIRTY);
2878}
2879
2880static void __mark_block_processed(struct reloc_control *rc,
2881                                   struct backref_node *node)
2882{
2883        u32 blocksize;
2884        if (node->level == 0 ||
2885            in_block_group(node->bytenr, rc->block_group)) {
2886                blocksize = rc->extent_root->fs_info->nodesize;
2887                mark_block_processed(rc, node->bytenr, blocksize);
2888        }
2889        node->processed = 1;
2890}
2891
2892/*
2893 * mark a block and all blocks directly/indirectly reference the block
2894 * as processed.
2895 */
2896static void update_processed_blocks(struct reloc_control *rc,
2897                                    struct backref_node *node)
2898{
2899        struct backref_node *next = node;
2900        struct backref_edge *edge;
2901        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2902        int index = 0;
2903
2904        while (next) {
2905                cond_resched();
2906                while (1) {
2907                        if (next->processed)
2908                                break;
2909
2910                        __mark_block_processed(rc, next);
2911
2912                        if (list_empty(&next->upper))
2913                                break;
2914
2915                        edge = list_entry(next->upper.next,
2916                                          struct backref_edge, list[LOWER]);
2917                        edges[index++] = edge;
2918                        next = edge->node[UPPER];
2919                }
2920                next = walk_down_backref(edges, &index);
2921        }
2922}
2923
2924static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
2925{
2926        u32 blocksize = rc->extent_root->fs_info->nodesize;
2927
2928        if (test_range_bit(&rc->processed_blocks, bytenr,
2929                           bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2930                return 1;
2931        return 0;
2932}
2933
2934static int get_tree_block_key(struct btrfs_fs_info *fs_info,
2935                              struct tree_block *block)
2936{
2937        struct extent_buffer *eb;
2938
2939        BUG_ON(block->key_ready);
2940        eb = read_tree_block(fs_info, block->bytenr, block->key.offset);
2941        if (IS_ERR(eb)) {
2942                return PTR_ERR(eb);
2943        } else if (!extent_buffer_uptodate(eb)) {
2944                free_extent_buffer(eb);
2945                return -EIO;
2946        }
2947        WARN_ON(btrfs_header_level(eb) != block->level);
2948        if (block->level == 0)
2949                btrfs_item_key_to_cpu(eb, &block->key, 0);
2950        else
2951                btrfs_node_key_to_cpu(eb, &block->key, 0);
2952        free_extent_buffer(eb);
2953        block->key_ready = 1;
2954        return 0;
2955}
2956
2957/*
2958 * helper function to relocate a tree block
2959 */
2960static int relocate_tree_block(struct btrfs_trans_handle *trans,
2961                                struct reloc_control *rc,
2962                                struct backref_node *node,
2963                                struct btrfs_key *key,
2964                                struct btrfs_path *path)
2965{
2966        struct btrfs_root *root;
2967        int ret = 0;
2968
2969        if (!node)
2970                return 0;
2971
2972        BUG_ON(node->processed);
2973        root = select_one_root(node);
2974        if (root == ERR_PTR(-ENOENT)) {
2975                update_processed_blocks(rc, node);
2976                goto out;
2977        }
2978
2979        if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
2980                ret = reserve_metadata_space(trans, rc, node);
2981                if (ret)
2982                        goto out;
2983        }
2984
2985        if (root) {
2986                if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
2987                        BUG_ON(node->new_bytenr);
2988                        BUG_ON(!list_empty(&node->list));
2989                        btrfs_record_root_in_trans(trans, root);
2990                        root = root->reloc_root;
2991                        node->new_bytenr = root->node->start;
2992                        node->root = root;
2993                        list_add_tail(&node->list, &rc->backref_cache.changed);
2994                } else {
2995                        path->lowest_level = node->level;
2996                        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2997                        btrfs_release_path(path);
2998                        if (ret > 0)
2999                                ret = 0;
3000                }
3001                if (!ret)
3002                        update_processed_blocks(rc, node);
3003        } else {
3004                ret = do_relocation(trans, rc, node, key, path, 1);
3005        }
3006out:
3007        if (ret || node->level == 0 || node->cowonly)
3008                remove_backref_node(&rc->backref_cache, node);
3009        return ret;
3010}
3011
3012/*
3013 * relocate a list of blocks
3014 */
3015static noinline_for_stack
3016int relocate_tree_blocks(struct btrfs_trans_handle *trans,
3017                         struct reloc_control *rc, struct rb_root *blocks)
3018{
3019        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3020        struct backref_node *node;
3021        struct btrfs_path *path;
3022        struct tree_block *block;
3023        struct rb_node *rb_node;
3024        int ret;
3025        int err = 0;
3026
3027        path = btrfs_alloc_path();
3028        if (!path) {
3029                err = -ENOMEM;
3030                goto out_free_blocks;
3031        }
3032
3033        rb_node = rb_first(blocks);
3034        while (rb_node) {
3035                block = rb_entry(rb_node, struct tree_block, rb_node);
3036                if (!block->key_ready)
3037                        readahead_tree_block(fs_info, block->bytenr);
3038                rb_node = rb_next(rb_node);
3039        }
3040
3041        rb_node = rb_first(blocks);
3042        while (rb_node) {
3043                block = rb_entry(rb_node, struct tree_block, rb_node);
3044                if (!block->key_ready) {
3045                        err = get_tree_block_key(fs_info, block);
3046                        if (err)
3047                                goto out_free_path;
3048                }
3049                rb_node = rb_next(rb_node);
3050        }
3051
3052        rb_node = rb_first(blocks);
3053        while (rb_node) {
3054                block = rb_entry(rb_node, struct tree_block, rb_node);
3055
3056                node = build_backref_tree(rc, &block->key,
3057                                          block->level, block->bytenr);
3058                if (IS_ERR(node)) {
3059                        err = PTR_ERR(node);
3060                        goto out;
3061                }
3062
3063                ret = relocate_tree_block(trans, rc, node, &block->key,
3064                                          path);
3065                if (ret < 0) {
3066                        if (ret != -EAGAIN || rb_node == rb_first(blocks))
3067                                err = ret;
3068                        goto out;
3069                }
3070                rb_node = rb_next(rb_node);
3071        }
3072out:
3073        err = finish_pending_nodes(trans, rc, path, err);
3074
3075out_free_path:
3076        btrfs_free_path(path);
3077out_free_blocks:
3078        free_block_list(blocks);
3079        return err;
3080}
3081
3082static noinline_for_stack
3083int prealloc_file_extent_cluster(struct inode *inode,
3084                                 struct file_extent_cluster *cluster)
3085{
3086        u64 alloc_hint = 0;
3087        u64 start;
3088        u64 end;
3089        u64 offset = BTRFS_I(inode)->index_cnt;
3090        u64 num_bytes;
3091        int nr = 0;
3092        int ret = 0;
3093        u64 prealloc_start = cluster->start - offset;
3094        u64 prealloc_end = cluster->end - offset;
3095        u64 cur_offset;
3096
3097        BUG_ON(cluster->start != cluster->boundary[0]);
3098        inode_lock(inode);
3099
3100        ret = btrfs_check_data_free_space(inode, prealloc_start,
3101                                          prealloc_end + 1 - prealloc_start);
3102        if (ret)
3103                goto out;
3104
3105        cur_offset = prealloc_start;
3106        while (nr < cluster->nr) {
3107                start = cluster->boundary[nr] - offset;
3108                if (nr + 1 < cluster->nr)
3109                        end = cluster->boundary[nr + 1] - 1 - offset;
3110                else
3111                        end = cluster->end - offset;
3112
3113                lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3114                num_bytes = end + 1 - start;
3115                if (cur_offset < start)
3116                        btrfs_free_reserved_data_space(inode, cur_offset,
3117                                        start - cur_offset);
3118                ret = btrfs_prealloc_file_range(inode, 0, start,
3119                                                num_bytes, num_bytes,
3120                                                end + 1, &alloc_hint);
3121                cur_offset = end + 1;
3122                unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3123                if (ret)
3124                        break;
3125                nr++;
3126        }
3127        if (cur_offset < prealloc_end)
3128                btrfs_free_reserved_data_space(inode, cur_offset,
3129                                       prealloc_end + 1 - cur_offset);
3130out:
3131        inode_unlock(inode);
3132        return ret;
3133}
3134
3135static noinline_for_stack
3136int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
3137                         u64 block_start)
3138{
3139        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3140        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3141        struct extent_map *em;
3142        int ret = 0;
3143
3144        em = alloc_extent_map();
3145        if (!em)
3146                return -ENOMEM;
3147
3148        em->start = start;
3149        em->len = end + 1 - start;
3150        em->block_len = em->len;
3151        em->block_start = block_start;
3152        em->bdev = fs_info->fs_devices->latest_bdev;
3153        set_bit(EXTENT_FLAG_PINNED, &em->flags);
3154
3155        lock_extent(&BTRFS_I(inode)->io_tree, start, end);
3156        while (1) {
3157                write_lock(&em_tree->lock);
3158                ret = add_extent_mapping(em_tree, em, 0);
3159                write_unlock(&em_tree->lock);
3160                if (ret != -EEXIST) {
3161                        free_extent_map(em);
3162                        break;
3163                }
3164                btrfs_drop_extent_cache(inode, start, end, 0);
3165        }
3166        unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
3167        return ret;
3168}
3169
3170static int relocate_file_extent_cluster(struct inode *inode,
3171                                        struct file_extent_cluster *cluster)
3172{
3173        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3174        u64 page_start;
3175        u64 page_end;
3176        u64 offset = BTRFS_I(inode)->index_cnt;
3177        unsigned long index;
3178        unsigned long last_index;
3179        struct page *page;
3180        struct file_ra_state *ra;
3181        gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
3182        int nr = 0;
3183        int ret = 0;
3184
3185        if (!cluster->nr)
3186                return 0;
3187
3188        ra = kzalloc(sizeof(*ra), GFP_NOFS);
3189        if (!ra)
3190                return -ENOMEM;
3191
3192        ret = prealloc_file_extent_cluster(inode, cluster);
3193        if (ret)
3194                goto out;
3195
3196        file_ra_state_init(ra, inode->i_mapping);
3197
3198        ret = setup_extent_mapping(inode, cluster->start - offset,
3199                                   cluster->end - offset, cluster->start);
3200        if (ret)
3201                goto out;
3202
3203        index = (cluster->start - offset) >> PAGE_SHIFT;
3204        last_index = (cluster->end - offset) >> PAGE_SHIFT;
3205        while (index <= last_index) {
3206                ret = btrfs_delalloc_reserve_metadata(inode, PAGE_SIZE);
3207                if (ret)
3208                        goto out;
3209
3210                page = find_lock_page(inode->i_mapping, index);
3211                if (!page) {
3212                        page_cache_sync_readahead(inode->i_mapping,
3213                                                  ra, NULL, index,
3214                                                  last_index + 1 - index);
3215                        page = find_or_create_page(inode->i_mapping, index,
3216                                                   mask);
3217                        if (!page) {
3218                                btrfs_delalloc_release_metadata(inode,
3219                                                        PAGE_SIZE);
3220                                ret = -ENOMEM;
3221                                goto out;
3222                        }
3223                }
3224
3225                if (PageReadahead(page)) {
3226                        page_cache_async_readahead(inode->i_mapping,
3227                                                   ra, NULL, page, index,
3228                                                   last_index + 1 - index);
3229                }
3230
3231                if (!PageUptodate(page)) {
3232                        btrfs_readpage(NULL, page);
3233                        lock_page(page);
3234                        if (!PageUptodate(page)) {
3235                                unlock_page(page);
3236                                put_page(page);
3237                                btrfs_delalloc_release_metadata(inode,
3238                                                        PAGE_SIZE);
3239                                ret = -EIO;
3240                                goto out;
3241                        }
3242                }
3243
3244                page_start = page_offset(page);
3245                page_end = page_start + PAGE_SIZE - 1;
3246
3247                lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
3248
3249                set_page_extent_mapped(page);
3250
3251                if (nr < cluster->nr &&
3252                    page_start + offset == cluster->boundary[nr]) {
3253                        set_extent_bits(&BTRFS_I(inode)->io_tree,
3254                                        page_start, page_end,
3255                                        EXTENT_BOUNDARY);
3256                        nr++;
3257                }
3258
3259                btrfs_set_extent_delalloc(inode, page_start, page_end, NULL, 0);
3260                set_page_dirty(page);
3261
3262                unlock_extent(&BTRFS_I(inode)->io_tree,
3263                              page_start, page_end);
3264                unlock_page(page);
3265                put_page(page);
3266
3267                index++;
3268                balance_dirty_pages_ratelimited(inode->i_mapping);
3269                btrfs_throttle(fs_info);
3270        }
3271        WARN_ON(nr != cluster->nr);
3272out:
3273        kfree(ra);
3274        return ret;
3275}
3276
3277static noinline_for_stack
3278int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
3279                         struct file_extent_cluster *cluster)
3280{
3281        int ret;
3282
3283        if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3284                ret = relocate_file_extent_cluster(inode, cluster);
3285                if (ret)
3286                        return ret;
3287                cluster->nr = 0;
3288        }
3289
3290        if (!cluster->nr)
3291                cluster->start = extent_key->objectid;
3292        else
3293                BUG_ON(cluster->nr >= MAX_EXTENTS);
3294        cluster->end = extent_key->objectid + extent_key->offset - 1;
3295        cluster->boundary[cluster->nr] = extent_key->objectid;
3296        cluster->nr++;
3297
3298        if (cluster->nr >= MAX_EXTENTS) {
3299                ret = relocate_file_extent_cluster(inode, cluster);
3300                if (ret)
3301                        return ret;
3302                cluster->nr = 0;
3303        }
3304        return 0;
3305}
3306
3307#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3308static int get_ref_objectid_v0(struct reloc_control *rc,
3309                               struct btrfs_path *path,
3310                               struct btrfs_key *extent_key,
3311                               u64 *ref_objectid, int *path_change)
3312{
3313        struct btrfs_key key;
3314        struct extent_buffer *leaf;
3315        struct btrfs_extent_ref_v0 *ref0;
3316        int ret;
3317        int slot;
3318
3319        leaf = path->nodes[0];
3320        slot = path->slots[0];
3321        while (1) {
3322                if (slot >= btrfs_header_nritems(leaf)) {
3323                        ret = btrfs_next_leaf(rc->extent_root, path);
3324                        if (ret < 0)
3325                                return ret;
3326                        BUG_ON(ret > 0);
3327                        leaf = path->nodes[0];
3328                        slot = path->slots[0];
3329                        if (path_change)
3330                                *path_change = 1;
3331                }
3332                btrfs_item_key_to_cpu(leaf, &key, slot);
3333                if (key.objectid != extent_key->objectid)
3334                        return -ENOENT;
3335
3336                if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
3337                        slot++;
3338                        continue;
3339                }
3340                ref0 = btrfs_item_ptr(leaf, slot,
3341                                struct btrfs_extent_ref_v0);
3342                *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
3343                break;
3344        }
3345        return 0;
3346}
3347#endif
3348
3349/*
3350 * helper to add a tree block to the list.
3351 * the major work is getting the generation and level of the block
3352 */
3353static int add_tree_block(struct reloc_control *rc,
3354                          struct btrfs_key *extent_key,
3355                          struct btrfs_path *path,
3356                          struct rb_root *blocks)
3357{
3358        struct extent_buffer *eb;
3359        struct btrfs_extent_item *ei;
3360        struct btrfs_tree_block_info *bi;
3361        struct tree_block *block;
3362        struct rb_node *rb_node;
3363        u32 item_size;
3364        int level = -1;
3365        u64 generation;
3366
3367        eb =  path->nodes[0];
3368        item_size = btrfs_item_size_nr(eb, path->slots[0]);
3369
3370        if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3371            item_size >= sizeof(*ei) + sizeof(*bi)) {
3372                ei = btrfs_item_ptr(eb, path->slots[0],
3373                                struct btrfs_extent_item);
3374                if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3375                        bi = (struct btrfs_tree_block_info *)(ei + 1);
3376                        level = btrfs_tree_block_level(eb, bi);
3377                } else {
3378                        level = (int)extent_key->offset;
3379                }
3380                generation = btrfs_extent_generation(eb, ei);
3381        } else {
3382#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3383                u64 ref_owner;
3384                int ret;
3385
3386                BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3387                ret = get_ref_objectid_v0(rc, path, extent_key,
3388                                          &ref_owner, NULL);
3389                if (ret < 0)
3390                        return ret;
3391                BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
3392                level = (int)ref_owner;
3393                /* FIXME: get real generation */
3394                generation = 0;
3395#else
3396                BUG();
3397#endif
3398        }
3399
3400        btrfs_release_path(path);
3401
3402        BUG_ON(level == -1);
3403
3404        block = kmalloc(sizeof(*block), GFP_NOFS);
3405        if (!block)
3406                return -ENOMEM;
3407
3408        block->bytenr = extent_key->objectid;
3409        block->key.objectid = rc->extent_root->fs_info->nodesize;
3410        block->key.offset = generation;
3411        block->level = level;
3412        block->key_ready = 0;
3413
3414        rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
3415        if (rb_node)
3416                backref_tree_panic(rb_node, -EEXIST, block->bytenr);
3417
3418        return 0;
3419}
3420
3421/*
3422 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3423 */
3424static int __add_tree_block(struct reloc_control *rc,
3425                            u64 bytenr, u32 blocksize,
3426                            struct rb_root *blocks)
3427{
3428        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3429        struct btrfs_path *path;
3430        struct btrfs_key key;
3431        int ret;
3432        bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3433
3434        if (tree_block_processed(bytenr, rc))
3435                return 0;
3436
3437        if (tree_search(blocks, bytenr))
3438                return 0;
3439
3440        path = btrfs_alloc_path();
3441        if (!path)
3442                return -ENOMEM;
3443again:
3444        key.objectid = bytenr;
3445        if (skinny) {
3446                key.type = BTRFS_METADATA_ITEM_KEY;
3447                key.offset = (u64)-1;
3448        } else {
3449                key.type = BTRFS_EXTENT_ITEM_KEY;
3450                key.offset = blocksize;
3451        }
3452
3453        path->search_commit_root = 1;
3454        path->skip_locking = 1;
3455        ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3456        if (ret < 0)
3457                goto out;
3458
3459        if (ret > 0 && skinny) {
3460                if (path->slots[0]) {
3461                        path->slots[0]--;
3462                        btrfs_item_key_to_cpu(path->nodes[0], &key,
3463                                              path->slots[0]);
3464                        if (key.objectid == bytenr &&
3465                            (key.type == BTRFS_METADATA_ITEM_KEY ||
3466                             (key.type == BTRFS_EXTENT_ITEM_KEY &&
3467                              key.offset == blocksize)))
3468                                ret = 0;
3469                }
3470
3471                if (ret) {
3472                        skinny = false;
3473                        btrfs_release_path(path);
3474                        goto again;
3475                }
3476        }
3477        BUG_ON(ret);
3478
3479        ret = add_tree_block(rc, &key, path, blocks);
3480out:
3481        btrfs_free_path(path);
3482        return ret;
3483}
3484
3485/*
3486 * helper to check if the block use full backrefs for pointers in it
3487 */
3488static int block_use_full_backref(struct reloc_control *rc,
3489                                  struct extent_buffer *eb)
3490{
3491        u64 flags;
3492        int ret;
3493
3494        if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
3495            btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
3496                return 1;
3497
3498        ret = btrfs_lookup_extent_info(NULL, rc->extent_root->fs_info,
3499                                       eb->start, btrfs_header_level(eb), 1,
3500                                       NULL, &flags);
3501        BUG_ON(ret);
3502
3503        if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
3504                ret = 1;
3505        else
3506                ret = 0;
3507        return ret;
3508}
3509
3510static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3511                                    struct btrfs_block_group_cache *block_group,
3512                                    struct inode *inode,
3513                                    u64 ino)
3514{
3515        struct btrfs_key key;
3516        struct btrfs_root *root = fs_info->tree_root;
3517        struct btrfs_trans_handle *trans;
3518        int ret = 0;
3519
3520        if (inode)
3521                goto truncate;
3522
3523        key.objectid = ino;
3524        key.type = BTRFS_INODE_ITEM_KEY;
3525        key.offset = 0;
3526
3527        inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3528        if (IS_ERR(inode) || is_bad_inode(inode)) {
3529                if (!IS_ERR(inode))
3530                        iput(inode);
3531                return -ENOENT;
3532        }
3533
3534truncate:
3535        ret = btrfs_check_trunc_cache_free_space(fs_info,
3536                                                 &fs_info->global_block_rsv);
3537        if (ret)
3538                goto out;
3539
3540        trans = btrfs_join_transaction(root);
3541        if (IS_ERR(trans)) {
3542                ret = PTR_ERR(trans);
3543                goto out;
3544        }
3545
3546        ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode);
3547
3548        btrfs_end_transaction(trans);
3549        btrfs_btree_balance_dirty(fs_info);
3550out:
3551        iput(inode);
3552        return ret;
3553}
3554
3555/*
3556 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
3557 * this function scans fs tree to find blocks reference the data extent
3558 */
3559static int find_data_references(struct reloc_control *rc,
3560                                struct btrfs_key *extent_key,
3561                                struct extent_buffer *leaf,
3562                                struct btrfs_extent_data_ref *ref,
3563                                struct rb_root *blocks)
3564{
3565        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3566        struct btrfs_path *path;
3567        struct tree_block *block;
3568        struct btrfs_root *root;
3569        struct btrfs_file_extent_item *fi;
3570        struct rb_node *rb_node;
3571        struct btrfs_key key;
3572        u64 ref_root;
3573        u64 ref_objectid;
3574        u64 ref_offset;
3575        u32 ref_count;
3576        u32 nritems;
3577        int err = 0;
3578        int added = 0;
3579        int counted;
3580        int ret;
3581
3582        ref_root = btrfs_extent_data_ref_root(leaf, ref);
3583        ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
3584        ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
3585        ref_count = btrfs_extent_data_ref_count(leaf, ref);
3586
3587        /*
3588         * This is an extent belonging to the free space cache, lets just delete
3589         * it and redo the search.
3590         */
3591        if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3592                ret = delete_block_group_cache(fs_info, rc->block_group,
3593                                               NULL, ref_objectid);
3594                if (ret != -ENOENT)
3595                        return ret;
3596                ret = 0;
3597        }
3598
3599        path = btrfs_alloc_path();
3600        if (!path)
3601                return -ENOMEM;
3602        path->reada = READA_FORWARD;
3603
3604        root = read_fs_root(fs_info, ref_root);
3605        if (IS_ERR(root)) {
3606                err = PTR_ERR(root);
3607                goto out;
3608        }
3609
3610        key.objectid = ref_objectid;
3611        key.type = BTRFS_EXTENT_DATA_KEY;
3612        if (ref_offset > ((u64)-1 << 32))
3613                key.offset = 0;
3614        else
3615                key.offset = ref_offset;
3616
3617        path->search_commit_root = 1;
3618        path->skip_locking = 1;
3619        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3620        if (ret < 0) {
3621                err = ret;
3622                goto out;
3623        }
3624
3625        leaf = path->nodes[0];
3626        nritems = btrfs_header_nritems(leaf);
3627        /*
3628         * the references in tree blocks that use full backrefs
3629         * are not counted in
3630         */
3631        if (block_use_full_backref(rc, leaf))
3632                counted = 0;
3633        else
3634                counted = 1;
3635        rb_node = tree_search(blocks, leaf->start);
3636        if (rb_node) {
3637                if (counted)
3638                        added = 1;
3639                else
3640                        path->slots[0] = nritems;
3641        }
3642
3643        while (ref_count > 0) {
3644                while (path->slots[0] >= nritems) {
3645                        ret = btrfs_next_leaf(root, path);
3646                        if (ret < 0) {
3647                                err = ret;
3648                                goto out;
3649                        }
3650                        if (WARN_ON(ret > 0))
3651                                goto out;
3652
3653                        leaf = path->nodes[0];
3654                        nritems = btrfs_header_nritems(leaf);
3655                        added = 0;
3656
3657                        if (block_use_full_backref(rc, leaf))
3658                                counted = 0;
3659                        else
3660                                counted = 1;
3661                        rb_node = tree_search(blocks, leaf->start);
3662                        if (rb_node) {
3663                                if (counted)
3664                                        added = 1;
3665                                else
3666                                        path->slots[0] = nritems;
3667                        }
3668                }
3669
3670                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3671                if (WARN_ON(key.objectid != ref_objectid ||
3672                    key.type != BTRFS_EXTENT_DATA_KEY))
3673                        break;
3674
3675                fi = btrfs_item_ptr(leaf, path->slots[0],
3676                                    struct btrfs_file_extent_item);
3677
3678                if (btrfs_file_extent_type(leaf, fi) ==
3679                    BTRFS_FILE_EXTENT_INLINE)
3680                        goto next;
3681
3682                if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3683                    extent_key->objectid)
3684                        goto next;
3685
3686                key.offset -= btrfs_file_extent_offset(leaf, fi);
3687                if (key.offset != ref_offset)
3688                        goto next;
3689
3690                if (counted)
3691                        ref_count--;
3692                if (added)
3693                        goto next;
3694
3695                if (!tree_block_processed(leaf->start, rc)) {
3696                        block = kmalloc(sizeof(*block), GFP_NOFS);
3697                        if (!block) {
3698                                err = -ENOMEM;
3699                                break;
3700                        }
3701                        block->bytenr = leaf->start;
3702                        btrfs_item_key_to_cpu(leaf, &block->key, 0);
3703                        block->level = 0;
3704                        block->key_ready = 1;
3705                        rb_node = tree_insert(blocks, block->bytenr,
3706                                              &block->rb_node);
3707                        if (rb_node)
3708                                backref_tree_panic(rb_node, -EEXIST,
3709                                                   block->bytenr);
3710                }
3711                if (counted)
3712                        added = 1;
3713                else
3714                        path->slots[0] = nritems;
3715next:
3716                path->slots[0]++;
3717
3718        }
3719out:
3720        btrfs_free_path(path);
3721        return err;
3722}
3723
3724/*
3725 * helper to find all tree blocks that reference a given data extent
3726 */
3727static noinline_for_stack
3728int add_data_references(struct reloc_control *rc,
3729                        struct btrfs_key *extent_key,
3730                        struct btrfs_path *path,
3731                        struct rb_root *blocks)
3732{
3733        struct btrfs_key key;
3734        struct extent_buffer *eb;
3735        struct btrfs_extent_data_ref *dref;
3736        struct btrfs_extent_inline_ref *iref;
3737        unsigned long ptr;
3738        unsigned long end;
3739        u32 blocksize = rc->extent_root->fs_info->nodesize;
3740        int ret = 0;
3741        int err = 0;
3742
3743        eb = path->nodes[0];
3744        ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3745        end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3746#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3747        if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3748                ptr = end;
3749        else
3750#endif
3751                ptr += sizeof(struct btrfs_extent_item);
3752
3753        while (ptr < end) {
3754                iref = (struct btrfs_extent_inline_ref *)ptr;
3755                key.type = btrfs_extent_inline_ref_type(eb, iref);
3756                if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3757                        key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3758                        ret = __add_tree_block(rc, key.offset, blocksize,
3759                                               blocks);
3760                } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3761                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3762                        ret = find_data_references(rc, extent_key,
3763                                                   eb, dref, blocks);
3764                } else {
3765                        BUG();
3766                }
3767                if (ret) {
3768                        err = ret;
3769                        goto out;
3770                }
3771                ptr += btrfs_extent_inline_ref_size(key.type);
3772        }
3773        WARN_ON(ptr > end);
3774
3775        while (1) {
3776                cond_resched();
3777                eb = path->nodes[0];
3778                if (path->slots[0] >= btrfs_header_nritems(eb)) {
3779                        ret = btrfs_next_leaf(rc->extent_root, path);
3780                        if (ret < 0) {
3781                                err = ret;
3782                                break;
3783                        }
3784                        if (ret > 0)
3785                                break;
3786                        eb = path->nodes[0];
3787                }
3788
3789                btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3790                if (key.objectid != extent_key->objectid)
3791                        break;
3792
3793#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3794                if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3795                    key.type == BTRFS_EXTENT_REF_V0_KEY) {
3796#else
3797                BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3798                if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3799#endif
3800                        ret = __add_tree_block(rc, key.offset, blocksize,
3801                                               blocks);
3802                } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3803                        dref = btrfs_item_ptr(eb, path->slots[0],
3804                                              struct btrfs_extent_data_ref);
3805                        ret = find_data_references(rc, extent_key,
3806                                                   eb, dref, blocks);
3807                } else {
3808                        ret = 0;
3809                }
3810                if (ret) {
3811                        err = ret;
3812                        break;
3813                }
3814                path->slots[0]++;
3815        }
3816out:
3817        btrfs_release_path(path);
3818        if (err)
3819                free_block_list(blocks);
3820        return err;
3821}
3822
3823/*
3824 * helper to find next unprocessed extent
3825 */
3826static noinline_for_stack
3827int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3828                     struct btrfs_key *extent_key)
3829{
3830        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3831        struct btrfs_key key;
3832        struct extent_buffer *leaf;
3833        u64 start, end, last;
3834        int ret;
3835
3836        last = rc->block_group->key.objectid + rc->block_group->key.offset;
3837        while (1) {
3838                cond_resched();
3839                if (rc->search_start >= last) {
3840                        ret = 1;
3841                        break;
3842                }
3843
3844                key.objectid = rc->search_start;
3845                key.type = BTRFS_EXTENT_ITEM_KEY;
3846                key.offset = 0;
3847
3848                path->search_commit_root = 1;
3849                path->skip_locking = 1;
3850                ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3851                                        0, 0);
3852                if (ret < 0)
3853                        break;
3854next:
3855                leaf = path->nodes[0];
3856                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3857                        ret = btrfs_next_leaf(rc->extent_root, path);
3858                        if (ret != 0)
3859                                break;
3860                        leaf = path->nodes[0];
3861                }
3862
3863                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3864                if (key.objectid >= last) {
3865                        ret = 1;
3866                        break;
3867                }
3868
3869                if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3870                    key.type != BTRFS_METADATA_ITEM_KEY) {
3871                        path->slots[0]++;
3872                        goto next;
3873                }
3874
3875                if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3876                    key.objectid + key.offset <= rc->search_start) {
3877                        path->slots[0]++;
3878                        goto next;
3879                }
3880
3881                if (key.type == BTRFS_METADATA_ITEM_KEY &&
3882                    key.objectid + fs_info->nodesize <=
3883                    rc->search_start) {
3884                        path->slots[0]++;
3885                        goto next;
3886                }
3887
3888                ret = find_first_extent_bit(&rc->processed_blocks,
3889                                            key.objectid, &start, &end,
3890                                            EXTENT_DIRTY, NULL);
3891
3892                if (ret == 0 && start <= key.objectid) {
3893                        btrfs_release_path(path);
3894                        rc->search_start = end + 1;
3895                } else {
3896                        if (key.type == BTRFS_EXTENT_ITEM_KEY)
3897                                rc->search_start = key.objectid + key.offset;
3898                        else
3899                                rc->search_start = key.objectid +
3900                                        fs_info->nodesize;
3901                        memcpy(extent_key, &key, sizeof(key));
3902                        return 0;
3903                }
3904        }
3905        btrfs_release_path(path);
3906        return ret;
3907}
3908
3909static void set_reloc_control(struct reloc_control *rc)
3910{
3911        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3912
3913        mutex_lock(&fs_info->reloc_mutex);
3914        fs_info->reloc_ctl = rc;
3915        mutex_unlock(&fs_info->reloc_mutex);
3916}
3917
3918static void unset_reloc_control(struct reloc_control *rc)
3919{
3920        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3921
3922        mutex_lock(&fs_info->reloc_mutex);
3923        fs_info->reloc_ctl = NULL;
3924        mutex_unlock(&fs_info->reloc_mutex);
3925}
3926
3927static int check_extent_flags(u64 flags)
3928{
3929        if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3930            (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3931                return 1;
3932        if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3933            !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3934                return 1;
3935        if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3936            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3937                return 1;
3938        return 0;
3939}
3940
3941static noinline_for_stack
3942int prepare_to_relocate(struct reloc_control *rc)
3943{
3944        struct btrfs_trans_handle *trans;
3945        int ret;
3946
3947        rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
3948                                              BTRFS_BLOCK_RSV_TEMP);
3949        if (!rc->block_rsv)
3950                return -ENOMEM;
3951
3952        memset(&rc->cluster, 0, sizeof(rc->cluster));
3953        rc->search_start = rc->block_group->key.objectid;
3954        rc->extents_found = 0;
3955        rc->nodes_relocated = 0;
3956        rc->merging_rsv_size = 0;
3957        rc->reserved_bytes = 0;
3958        rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
3959                              RELOCATION_RESERVED_NODES;
3960        ret = btrfs_block_rsv_refill(rc->extent_root,
3961                                     rc->block_rsv, rc->block_rsv->size,
3962                                     BTRFS_RESERVE_FLUSH_ALL);
3963        if (ret)
3964                return ret;
3965
3966        rc->create_reloc_tree = 1;
3967        set_reloc_control(rc);
3968
3969        trans = btrfs_join_transaction(rc->extent_root);
3970        if (IS_ERR(trans)) {
3971                unset_reloc_control(rc);
3972                /*
3973                 * extent tree is not a ref_cow tree and has no reloc_root to
3974                 * cleanup.  And callers are responsible to free the above
3975                 * block rsv.
3976                 */
3977                return PTR_ERR(trans);
3978        }
3979        btrfs_commit_transaction(trans);
3980        return 0;
3981}
3982
3983static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3984{
3985        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3986        struct rb_root blocks = RB_ROOT;
3987        struct btrfs_key key;
3988        struct btrfs_trans_handle *trans = NULL;
3989        struct btrfs_path *path;
3990        struct btrfs_extent_item *ei;
3991        u64 flags;
3992        u32 item_size;
3993        int ret;
3994        int err = 0;
3995        int progress = 0;
3996
3997        path = btrfs_alloc_path();
3998        if (!path)
3999                return -ENOMEM;
4000        path->reada = READA_FORWARD;
4001
4002        ret = prepare_to_relocate(rc);
4003        if (ret) {
4004                err = ret;
4005                goto out_free;
4006        }
4007
4008        while (1) {
4009                rc->reserved_bytes = 0;
4010                ret = btrfs_block_rsv_refill(rc->extent_root,
4011                                        rc->block_rsv, rc->block_rsv->size,
4012                                        BTRFS_RESERVE_FLUSH_ALL);
4013                if (ret) {
4014                        err = ret;
4015                        break;
4016                }
4017                progress++;
4018                trans = btrfs_start_transaction(rc->extent_root, 0);
4019                if (IS_ERR(trans)) {
4020                        err = PTR_ERR(trans);
4021                        trans = NULL;
4022                        break;
4023                }
4024restart:
4025                if (update_backref_cache(trans, &rc->backref_cache)) {
4026                        btrfs_end_transaction(trans);
4027                        continue;
4028                }
4029
4030                ret = find_next_extent(rc, path, &key);
4031                if (ret < 0)
4032                        err = ret;
4033                if (ret != 0)
4034                        break;
4035
4036                rc->extents_found++;
4037
4038                ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
4039                                    struct btrfs_extent_item);
4040                item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
4041                if (item_size >= sizeof(*ei)) {
4042                        flags = btrfs_extent_flags(path->nodes[0], ei);
4043                        ret = check_extent_flags(flags);
4044                        BUG_ON(ret);
4045
4046                } else {
4047#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4048                        u64 ref_owner;
4049                        int path_change = 0;
4050
4051                        BUG_ON(item_size !=
4052                               sizeof(struct btrfs_extent_item_v0));
4053                        ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
4054                                                  &path_change);
4055                        if (ret < 0) {
4056                                err = ret;
4057                                break;
4058                        }
4059                        if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
4060                                flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
4061                        else
4062                                flags = BTRFS_EXTENT_FLAG_DATA;
4063
4064                        if (path_change) {
4065                                btrfs_release_path(path);
4066
4067                                path->search_commit_root = 1;
4068                                path->skip_locking = 1;
4069                                ret = btrfs_search_slot(NULL, rc->extent_root,
4070                                                        &key, path, 0, 0);
4071                                if (ret < 0) {
4072                                        err = ret;
4073                                        break;
4074                                }
4075                                BUG_ON(ret > 0);
4076                        }
4077#else
4078                        BUG();
4079#endif
4080                }
4081
4082                if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
4083                        ret = add_tree_block(rc, &key, path, &blocks);
4084                } else if (rc->stage == UPDATE_DATA_PTRS &&
4085                           (flags & BTRFS_EXTENT_FLAG_DATA)) {
4086                        ret = add_data_references(rc, &key, path, &blocks);
4087                } else {
4088                        btrfs_release_path(path);
4089                        ret = 0;
4090                }
4091                if (ret < 0) {
4092                        err = ret;
4093                        break;
4094                }
4095
4096                if (!RB_EMPTY_ROOT(&blocks)) {
4097                        ret = relocate_tree_blocks(trans, rc, &blocks);
4098                        if (ret < 0) {
4099                                /*
4100                                 * if we fail to relocate tree blocks, force to update
4101                                 * backref cache when committing transaction.
4102                                 */
4103                                rc->backref_cache.last_trans = trans->transid - 1;
4104
4105                                if (ret != -EAGAIN) {
4106                                        err = ret;
4107                                        break;
4108                                }
4109                                rc->extents_found--;
4110                                rc->search_start = key.objectid;
4111                        }
4112                }
4113
4114                btrfs_end_transaction_throttle(trans);
4115                btrfs_btree_balance_dirty(fs_info);
4116                trans = NULL;
4117
4118                if (rc->stage == MOVE_DATA_EXTENTS &&
4119                    (flags & BTRFS_EXTENT_FLAG_DATA)) {
4120                        rc->found_file_extent = 1;
4121                        ret = relocate_data_extent(rc->data_inode,
4122                                                   &key, &rc->cluster);
4123                        if (ret < 0) {
4124                                err = ret;
4125                                break;
4126                        }
4127                }
4128        }
4129        if (trans && progress && err == -ENOSPC) {
4130                ret = btrfs_force_chunk_alloc(trans, fs_info,
4131                                              rc->block_group->flags);
4132                if (ret == 1) {
4133                        err = 0;
4134                        progress = 0;
4135                        goto restart;
4136                }
4137        }
4138
4139        btrfs_release_path(path);
4140        clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
4141
4142        if (trans) {
4143                btrfs_end_transaction_throttle(trans);
4144                btrfs_btree_balance_dirty(fs_info);
4145        }
4146
4147        if (!err) {
4148                ret = relocate_file_extent_cluster(rc->data_inode,
4149                                                   &rc->cluster);
4150                if (ret < 0)
4151                        err = ret;
4152        }
4153
4154        rc->create_reloc_tree = 0;
4155        set_reloc_control(rc);
4156
4157        backref_cache_cleanup(&rc->backref_cache);
4158        btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
4159
4160        err = prepare_to_merge(rc, err);
4161
4162        merge_reloc_roots(rc);
4163
4164        rc->merge_reloc_tree = 0;
4165        unset_reloc_control(rc);
4166        btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1);
4167
4168        /* get rid of pinned extents */
4169        trans = btrfs_join_transaction(rc->extent_root);
4170        if (IS_ERR(trans)) {
4171                err = PTR_ERR(trans);
4172                goto out_free;
4173        }
4174        btrfs_commit_transaction(trans);
4175out_free:
4176        btrfs_free_block_rsv(fs_info, rc->block_rsv);
4177        btrfs_free_path(path);
4178        return err;
4179}
4180
4181static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
4182                                 struct btrfs_root *root, u64 objectid)
4183{
4184        struct btrfs_path *path;
4185        struct btrfs_inode_item *item;
4186        struct extent_buffer *leaf;
4187        int ret;
4188
4189        path = btrfs_alloc_path();
4190        if (!path)
4191                return -ENOMEM;
4192
4193        ret = btrfs_insert_empty_inode(trans, root, path, objectid);
4194        if (ret)
4195                goto out;
4196
4197        leaf = path->nodes[0];
4198        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
4199        memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
4200        btrfs_set_inode_generation(leaf, item, 1);
4201        btrfs_set_inode_size(leaf, item, 0);
4202        btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
4203        btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
4204                                          BTRFS_INODE_PREALLOC);
4205        btrfs_mark_buffer_dirty(leaf);
4206out:
4207        btrfs_free_path(path);
4208        return ret;
4209}
4210
4211/*
4212 * helper to create inode for data relocation.
4213 * the inode is in data relocation tree and its link count is 0
4214 */
4215static noinline_for_stack
4216struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
4217                                 struct btrfs_block_group_cache *group)
4218{
4219        struct inode *inode = NULL;
4220        struct btrfs_trans_handle *trans;
4221        struct btrfs_root *root;
4222        struct btrfs_key key;
4223        u64 objectid;
4224        int err = 0;
4225
4226        root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4227        if (IS_ERR(root))
4228                return ERR_CAST(root);
4229
4230        trans = btrfs_start_transaction(root, 6);
4231        if (IS_ERR(trans))
4232                return ERR_CAST(trans);
4233
4234        err = btrfs_find_free_objectid(root, &objectid);
4235        if (err)
4236                goto out;
4237
4238        err = __insert_orphan_inode(trans, root, objectid);
4239        BUG_ON(err);
4240
4241        key.objectid = objectid;
4242        key.type = BTRFS_INODE_ITEM_KEY;
4243        key.offset = 0;
4244        inode = btrfs_iget(fs_info->sb, &key, root, NULL);
4245        BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
4246        BTRFS_I(inode)->index_cnt = group->key.objectid;
4247
4248        err = btrfs_orphan_add(trans, inode);
4249out:
4250        btrfs_end_transaction(trans);
4251        btrfs_btree_balance_dirty(fs_info);
4252        if (err) {
4253                if (inode)
4254                        iput(inode);
4255                inode = ERR_PTR(err);
4256        }
4257        return inode;
4258}
4259
4260static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
4261{
4262        struct reloc_control *rc;
4263
4264        rc = kzalloc(sizeof(*rc), GFP_NOFS);
4265        if (!rc)
4266                return NULL;
4267
4268        INIT_LIST_HEAD(&rc->reloc_roots);
4269        backref_cache_init(&rc->backref_cache);
4270        mapping_tree_init(&rc->reloc_root_tree);
4271        extent_io_tree_init(&rc->processed_blocks,
4272                            fs_info->btree_inode->i_mapping);
4273        return rc;
4274}
4275
4276/*
4277 * Print the block group being relocated
4278 */
4279static void describe_relocation(struct btrfs_fs_info *fs_info,
4280                                struct btrfs_block_group_cache *block_group)
4281{
4282        char buf[128];          /* prefixed by a '|' that'll be dropped */
4283        u64 flags = block_group->flags;
4284
4285        /* Shouldn't happen */
4286        if (!flags) {
4287                strcpy(buf, "|NONE");
4288        } else {
4289                char *bp = buf;
4290
4291#define DESCRIBE_FLAG(f, d) \
4292                if (flags & BTRFS_BLOCK_GROUP_##f) { \
4293                        bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \
4294                        flags &= ~BTRFS_BLOCK_GROUP_##f; \
4295                }
4296                DESCRIBE_FLAG(DATA,     "data");
4297                DESCRIBE_FLAG(SYSTEM,   "system");
4298                DESCRIBE_FLAG(METADATA, "metadata");
4299                DESCRIBE_FLAG(RAID0,    "raid0");
4300                DESCRIBE_FLAG(RAID1,    "raid1");
4301                DESCRIBE_FLAG(DUP,      "dup");
4302                DESCRIBE_FLAG(RAID10,   "raid10");
4303                DESCRIBE_FLAG(RAID5,    "raid5");
4304                DESCRIBE_FLAG(RAID6,    "raid6");
4305                if (flags)
4306                        snprintf(buf, buf - bp + sizeof(buf), "|0x%llx", flags);
4307#undef DESCRIBE_FLAG
4308        }
4309
4310        btrfs_info(fs_info,
4311                   "relocating block group %llu flags %s",
4312                   block_group->key.objectid, buf + 1);
4313}
4314
4315/*
4316 * function to relocate all extents in a block group.
4317 */
4318int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
4319{
4320        struct btrfs_root *extent_root = fs_info->extent_root;
4321        struct reloc_control *rc;
4322        struct inode *inode;
4323        struct btrfs_path *path;
4324        int ret;
4325        int rw = 0;
4326        int err = 0;
4327
4328        rc = alloc_reloc_control(fs_info);
4329        if (!rc)
4330                return -ENOMEM;
4331
4332        rc->extent_root = extent_root;
4333
4334        rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
4335        BUG_ON(!rc->block_group);
4336
4337        ret = btrfs_inc_block_group_ro(extent_root, rc->block_group);
4338        if (ret) {
4339                err = ret;
4340                goto out;
4341        }
4342        rw = 1;
4343
4344        path = btrfs_alloc_path();
4345        if (!path) {
4346                err = -ENOMEM;
4347                goto out;
4348        }
4349
4350        inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
4351                                        path);
4352        btrfs_free_path(path);
4353
4354        if (!IS_ERR(inode))
4355                ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
4356        else
4357                ret = PTR_ERR(inode);
4358
4359        if (ret && ret != -ENOENT) {
4360                err = ret;
4361                goto out;
4362        }
4363
4364        rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4365        if (IS_ERR(rc->data_inode)) {
4366                err = PTR_ERR(rc->data_inode);
4367                rc->data_inode = NULL;
4368                goto out;
4369        }
4370
4371        describe_relocation(fs_info, rc->block_group);
4372
4373        btrfs_wait_block_group_reservations(rc->block_group);
4374        btrfs_wait_nocow_writers(rc->block_group);
4375        btrfs_wait_ordered_roots(fs_info, -1,
4376                                 rc->block_group->key.objectid,
4377                                 rc->block_group->key.offset);
4378
4379        while (1) {
4380                mutex_lock(&fs_info->cleaner_mutex);
4381                ret = relocate_block_group(rc);
4382                mutex_unlock(&fs_info->cleaner_mutex);
4383                if (ret < 0) {
4384                        err = ret;
4385                        goto out;
4386                }
4387
4388                if (rc->extents_found == 0)
4389                        break;
4390
4391                btrfs_info(fs_info, "found %llu extents", rc->extents_found);
4392
4393                if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
4394                        ret = btrfs_wait_ordered_range(rc->data_inode, 0,
4395                                                       (u64)-1);
4396                        if (ret) {
4397                                err = ret;
4398                                goto out;
4399                        }
4400                        invalidate_mapping_pages(rc->data_inode->i_mapping,
4401                                                 0, -1);
4402                        rc->stage = UPDATE_DATA_PTRS;
4403                }
4404        }
4405
4406        WARN_ON(rc->block_group->pinned > 0);
4407        WARN_ON(rc->block_group->reserved > 0);
4408        WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
4409out:
4410        if (err && rw)
4411                btrfs_dec_block_group_ro(rc->block_group);
4412        iput(rc->data_inode);
4413        btrfs_put_block_group(rc->block_group);
4414        kfree(rc);
4415        return err;
4416}
4417
4418static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4419{
4420        struct btrfs_fs_info *fs_info = root->fs_info;
4421        struct btrfs_trans_handle *trans;
4422        int ret, err;
4423
4424        trans = btrfs_start_transaction(fs_info->tree_root, 0);
4425        if (IS_ERR(trans))
4426                return PTR_ERR(trans);
4427
4428        memset(&root->root_item.drop_progress, 0,
4429                sizeof(root->root_item.drop_progress));
4430        root->root_item.drop_level = 0;
4431        btrfs_set_root_refs(&root->root_item, 0);
4432        ret = btrfs_update_root(trans, fs_info->tree_root,
4433                                &root->root_key, &root->root_item);
4434
4435        err = btrfs_end_transaction(trans);
4436        if (err)
4437                return err;
4438        return ret;
4439}
4440
4441/*
4442 * recover relocation interrupted by system crash.
4443 *
4444 * this function resumes merging reloc trees with corresponding fs trees.
4445 * this is important for keeping the sharing of tree blocks
4446 */
4447int btrfs_recover_relocation(struct btrfs_root *root)
4448{
4449        struct btrfs_fs_info *fs_info = root->fs_info;
4450        LIST_HEAD(reloc_roots);
4451        struct btrfs_key key;
4452        struct btrfs_root *fs_root;
4453        struct btrfs_root *reloc_root;
4454        struct btrfs_path *path;
4455        struct extent_buffer *leaf;
4456        struct reloc_control *rc = NULL;
4457        struct btrfs_trans_handle *trans;
4458        int ret;
4459        int err = 0;
4460
4461        path = btrfs_alloc_path();
4462        if (!path)
4463                return -ENOMEM;
4464        path->reada = READA_BACK;
4465
4466        key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4467        key.type = BTRFS_ROOT_ITEM_KEY;
4468        key.offset = (u64)-1;
4469
4470        while (1) {
4471                ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
4472                                        path, 0, 0);
4473                if (ret < 0) {
4474                        err = ret;
4475                        goto out;
4476                }
4477                if (ret > 0) {
4478                        if (path->slots[0] == 0)
4479                                break;
4480                        path->slots[0]--;
4481                }
4482                leaf = path->nodes[0];
4483                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4484                btrfs_release_path(path);
4485
4486                if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4487                    key.type != BTRFS_ROOT_ITEM_KEY)
4488                        break;
4489
4490                reloc_root = btrfs_read_fs_root(root, &key);
4491                if (IS_ERR(reloc_root)) {
4492                        err = PTR_ERR(reloc_root);
4493                        goto out;
4494                }
4495
4496                list_add(&reloc_root->root_list, &reloc_roots);
4497
4498                if (btrfs_root_refs(&reloc_root->root_item) > 0) {
4499                        fs_root = read_fs_root(fs_info,
4500                                               reloc_root->root_key.offset);
4501                        if (IS_ERR(fs_root)) {
4502                                ret = PTR_ERR(fs_root);
4503                                if (ret != -ENOENT) {
4504                                        err = ret;
4505                                        goto out;
4506                                }
4507                                ret = mark_garbage_root(reloc_root);
4508                                if (ret < 0) {
4509                                        err = ret;
4510                                        goto out;
4511                                }
4512                        }
4513                }
4514
4515                if (key.offset == 0)
4516                        break;
4517
4518                key.offset--;
4519        }
4520        btrfs_release_path(path);
4521
4522        if (list_empty(&reloc_roots))
4523                goto out;
4524
4525        rc = alloc_reloc_control(fs_info);
4526        if (!rc) {
4527                err = -ENOMEM;
4528                goto out;
4529        }
4530
4531        rc->extent_root = fs_info->extent_root;
4532
4533        set_reloc_control(rc);
4534
4535        trans = btrfs_join_transaction(rc->extent_root);
4536        if (IS_ERR(trans)) {
4537                unset_reloc_control(rc);
4538                err = PTR_ERR(trans);
4539                goto out_free;
4540        }
4541
4542        rc->merge_reloc_tree = 1;
4543
4544        while (!list_empty(&reloc_roots)) {
4545                reloc_root = list_entry(reloc_roots.next,
4546                                        struct btrfs_root, root_list);
4547                list_del(&reloc_root->root_list);
4548
4549                if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4550                        list_add_tail(&reloc_root->root_list,
4551                                      &rc->reloc_roots);
4552                        continue;
4553                }
4554
4555                fs_root = read_fs_root(fs_info, reloc_root->root_key.offset);
4556                if (IS_ERR(fs_root)) {
4557                        err = PTR_ERR(fs_root);
4558                        goto out_free;
4559                }
4560
4561                err = __add_reloc_root(reloc_root);
4562                BUG_ON(err < 0); /* -ENOMEM or logic error */
4563                fs_root->reloc_root = reloc_root;
4564        }
4565
4566        err = btrfs_commit_transaction(trans);
4567        if (err)
4568                goto out_free;
4569
4570        merge_reloc_roots(rc);
4571
4572        unset_reloc_control(rc);
4573
4574        trans = btrfs_join_transaction(rc->extent_root);
4575        if (IS_ERR(trans)) {
4576                err = PTR_ERR(trans);
4577                goto out_free;
4578        }
4579        err = btrfs_commit_transaction(trans);
4580out_free:
4581        kfree(rc);
4582out:
4583        if (!list_empty(&reloc_roots))
4584                free_reloc_roots(&reloc_roots);
4585
4586        btrfs_free_path(path);
4587
4588        if (err == 0) {
4589                /* cleanup orphan inode in data relocation tree */
4590                fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
4591                if (IS_ERR(fs_root))
4592                        err = PTR_ERR(fs_root);
4593                else
4594                        err = btrfs_orphan_cleanup(fs_root);
4595        }
4596        return err;
4597}
4598
4599/*
4600 * helper to add ordered checksum for data relocation.
4601 *
4602 * cloning checksum properly handles the nodatasum extents.
4603 * it also saves CPU time to re-calculate the checksum.
4604 */
4605int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4606{
4607        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4608        struct btrfs_ordered_sum *sums;
4609        struct btrfs_ordered_extent *ordered;
4610        int ret;
4611        u64 disk_bytenr;
4612        u64 new_bytenr;
4613        LIST_HEAD(list);
4614
4615        ordered = btrfs_lookup_ordered_extent(inode, file_pos);
4616        BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
4617
4618        disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
4619        ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
4620                                       disk_bytenr + len - 1, &list, 0);
4621        if (ret)
4622                goto out;
4623
4624        while (!list_empty(&list)) {
4625                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
4626                list_del_init(&sums->list);
4627
4628                /*
4629                 * We need to offset the new_bytenr based on where the csum is.
4630                 * We need to do this because we will read in entire prealloc
4631                 * extents but we may have written to say the middle of the
4632                 * prealloc extent, so we need to make sure the csum goes with
4633                 * the right disk offset.
4634                 *
4635                 * We can do this because the data reloc inode refers strictly
4636                 * to the on disk bytes, so we don't have to worry about
4637                 * disk_len vs real len like with real inodes since it's all
4638                 * disk length.
4639                 */
4640                new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
4641                sums->bytenr = new_bytenr;
4642
4643                btrfs_add_ordered_sum(inode, ordered, sums);
4644        }
4645out:
4646        btrfs_put_ordered_extent(ordered);
4647        return ret;
4648}
4649
4650int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4651                          struct btrfs_root *root, struct extent_buffer *buf,
4652                          struct extent_buffer *cow)
4653{
4654        struct btrfs_fs_info *fs_info = root->fs_info;
4655        struct reloc_control *rc;
4656        struct backref_node *node;
4657        int first_cow = 0;
4658        int level;
4659        int ret = 0;
4660
4661        rc = fs_info->reloc_ctl;
4662        if (!rc)
4663                return 0;
4664
4665        BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4666               root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4667
4668        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
4669                if (buf == root->node)
4670                        __update_reloc_root(root, cow->start);
4671        }
4672
4673        level = btrfs_header_level(buf);
4674        if (btrfs_header_generation(buf) <=
4675            btrfs_root_last_snapshot(&root->root_item))
4676                first_cow = 1;
4677
4678        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4679            rc->create_reloc_tree) {
4680                WARN_ON(!first_cow && level == 0);
4681
4682                node = rc->backref_cache.path[level];
4683                BUG_ON(node->bytenr != buf->start &&
4684                       node->new_bytenr != buf->start);
4685
4686                drop_node_buffer(node);
4687                extent_buffer_get(cow);
4688                node->eb = cow;
4689                node->new_bytenr = cow->start;
4690
4691                if (!node->pending) {
4692                        list_move_tail(&node->list,
4693                                       &rc->backref_cache.pending[level]);
4694                        node->pending = 1;
4695                }
4696
4697                if (first_cow)
4698                        __mark_block_processed(rc, node);
4699
4700                if (first_cow && level > 0)
4701                        rc->nodes_relocated += buf->len;
4702        }
4703
4704        if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
4705                ret = replace_file_extents(trans, rc, root, cow);
4706        return ret;
4707}
4708
4709/*
4710 * called before creating snapshot. it calculates metadata reservation
4711 * required for relocating tree blocks in the snapshot
4712 */
4713void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
4714                              u64 *bytes_to_reserve)
4715{
4716        struct btrfs_root *root;
4717        struct reloc_control *rc;
4718
4719        root = pending->root;
4720        if (!root->reloc_root)
4721                return;
4722
4723        rc = root->fs_info->reloc_ctl;
4724        if (!rc->merge_reloc_tree)
4725                return;
4726
4727        root = root->reloc_root;
4728        BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4729        /*
4730         * relocation is in the stage of merging trees. the space
4731         * used by merging a reloc tree is twice the size of
4732         * relocated tree nodes in the worst case. half for cowing
4733         * the reloc tree, half for cowing the fs tree. the space
4734         * used by cowing the reloc tree will be freed after the
4735         * tree is dropped. if we create snapshot, cowing the fs
4736         * tree may use more space than it frees. so we need
4737         * reserve extra space.
4738         */
4739        *bytes_to_reserve += rc->nodes_relocated;
4740}
4741
4742/*
4743 * called after snapshot is created. migrate block reservation
4744 * and create reloc root for the newly created snapshot
4745 */
4746int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4747                               struct btrfs_pending_snapshot *pending)
4748{
4749        struct btrfs_root *root = pending->root;
4750        struct btrfs_root *reloc_root;
4751        struct btrfs_root *new_root;
4752        struct reloc_control *rc;
4753        int ret;
4754
4755        if (!root->reloc_root)
4756                return 0;
4757
4758        rc = root->fs_info->reloc_ctl;
4759        rc->merging_rsv_size += rc->nodes_relocated;
4760
4761        if (rc->merge_reloc_tree) {
4762                ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4763                                              rc->block_rsv,
4764                                              rc->nodes_relocated, 1);
4765                if (ret)
4766                        return ret;
4767        }
4768
4769        new_root = pending->snap;
4770        reloc_root = create_reloc_root(trans, root->reloc_root,
4771                                       new_root->root_key.objectid);
4772        if (IS_ERR(reloc_root))
4773                return PTR_ERR(reloc_root);
4774
4775        ret = __add_reloc_root(reloc_root);
4776        BUG_ON(ret < 0);
4777        new_root->reloc_root = reloc_root;
4778
4779        if (rc->create_reloc_tree)
4780                ret = clone_backref_node(trans, rc, root, reloc_root);
4781        return ret;
4782}
4783