linux/fs/btrfs/relocation.c
   1/*
   2 * Copyright (C) 2009 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 021110-1307, USA.
  17 */
  18
  19#include <linux/sched.h>
  20#include <linux/pagemap.h>
  21#include <linux/writeback.h>
  22#include <linux/blkdev.h>
  23#include <linux/rbtree.h>
  24#include "ctree.h"
  25#include "disk-io.h"
  26#include "transaction.h"
  27#include "volumes.h"
  28#include "locking.h"
  29#include "btrfs_inode.h"
  30#include "async-thread.h"
  31
  32/*
  33 * backref_node, mapping_node and tree_block start with this
  34 */
  35struct tree_entry {
  36        struct rb_node rb_node;
  37        u64 bytenr;
  38};
  39
  40/*
   41 * represents a tree block in the backref cache
  42 */
  43struct backref_node {
  44        struct rb_node rb_node;
  45        u64 bytenr;
  46        /* objectid tree block owner */
  47        u64 owner;
   48        /* list of upper level blocks that reference this block */
  49        struct list_head upper;
  50        /* list of child blocks in the cache */
  51        struct list_head lower;
  52        /* NULL if this node is not tree root */
  53        struct btrfs_root *root;
   54        /* extent buffer obtained by COWing the block */
  55        struct extent_buffer *eb;
  56        /* level of tree block */
  57        unsigned int level:8;
  58        /* 1 if the block is root of old snapshot */
  59        unsigned int old_root:1;
  60        /* 1 if no child blocks in the cache */
  61        unsigned int lowest:1;
  62        /* is the extent buffer locked */
  63        unsigned int locked:1;
  64        /* has the block been processed */
  65        unsigned int processed:1;
  66        /* have backrefs of this block been checked */
  67        unsigned int checked:1;
  68};
  69
  70/*
   71 * represents a block pointer in the backref cache
  72 */
  73struct backref_edge {
  74        struct list_head list[2];
  75        struct backref_node *node[2];
  76        u64 blockptr;
  77};
  78
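     /*
      * note: an edge links a lower level block (the child) to an upper
      * level block (the parent). node[LOWER]/node[UPPER] point at the two
      * backref_nodes; list[LOWER] chains the edge into the child's ->upper
      * list and list[UPPER] chains it into the parent's ->lower list.
      */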
  79#define LOWER   0
  80#define UPPER   1
  81
  82struct backref_cache {
  83        /* red black tree of all backref nodes in the cache */
  84        struct rb_root rb_root;
  85        /* list of backref nodes with no child block in the cache */
  86        struct list_head pending[BTRFS_MAX_LEVEL];
  87        spinlock_t lock;
  88};
  89
  90/*
  91 * map address of tree root to tree
  92 */
  93struct mapping_node {
  94        struct rb_node rb_node;
  95        u64 bytenr;
  96        void *data;
  97};
  98
  99struct mapping_tree {
 100        struct rb_root rb_root;
 101        spinlock_t lock;
 102};
 103
 104/*
  105 * represents a tree block to process
 106 */
 107struct tree_block {
 108        struct rb_node rb_node;
 109        u64 bytenr;
 110        struct btrfs_key key;
 111        unsigned int level:8;
 112        unsigned int key_ready:1;
 113};
 114
 115/* inode vector */
 116#define INODEVEC_SIZE 16
 117
 118struct inodevec {
 119        struct list_head list;
 120        struct inode *inode[INODEVEC_SIZE];
 121        int nr;
 122};
 123
 124#define MAX_EXTENTS 128
 125
 126struct file_extent_cluster {
 127        u64 start;
 128        u64 end;
 129        u64 boundary[MAX_EXTENTS];
 130        unsigned int nr;
 131};
 132
 133struct reloc_control {
 134        /* block group to relocate */
 135        struct btrfs_block_group_cache *block_group;
 136        /* extent tree */
 137        struct btrfs_root *extent_root;
 138        /* inode for moving data */
 139        struct inode *data_inode;
 140        struct btrfs_workers workers;
  141        /* tree blocks that have been processed */
 142        struct extent_io_tree processed_blocks;
 143        /* map start of tree root to corresponding reloc tree */
 144        struct mapping_tree reloc_root_tree;
 145        /* list of reloc trees */
 146        struct list_head reloc_roots;
 147        u64 search_start;
 148        u64 extents_found;
 149        u64 extents_skipped;
 150        int stage;
 151        int create_reloc_root;
 152        unsigned int found_file_extent:1;
 153        unsigned int found_old_snapshot:1;
 154};
 155
 156/* stages of data relocation */
 157#define MOVE_DATA_EXTENTS       0
 158#define UPDATE_DATA_PTRS        1
 159
 160/*
  161 * merge a reloc tree into the corresponding fs tree in worker threads
 162 */
 163struct async_merge {
 164        struct btrfs_work work;
 165        struct reloc_control *rc;
 166        struct btrfs_root *root;
 167        struct completion *done;
 168        atomic_t *num_pending;
 169};
 170
 171static void mapping_tree_init(struct mapping_tree *tree)
 172{
 173        tree->rb_root.rb_node = NULL;
 174        spin_lock_init(&tree->lock);
 175}
 176
 177static void backref_cache_init(struct backref_cache *cache)
 178{
 179        int i;
 180        cache->rb_root.rb_node = NULL;
 181        for (i = 0; i < BTRFS_MAX_LEVEL; i++)
 182                INIT_LIST_HEAD(&cache->pending[i]);
 183        spin_lock_init(&cache->lock);
 184}
 185
 186static void backref_node_init(struct backref_node *node)
 187{
 188        memset(node, 0, sizeof(*node));
 189        INIT_LIST_HEAD(&node->upper);
 190        INIT_LIST_HEAD(&node->lower);
 191        RB_CLEAR_NODE(&node->rb_node);
 192}
 193
 194static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
 195                                   struct rb_node *node)
 196{
 197        struct rb_node **p = &root->rb_node;
 198        struct rb_node *parent = NULL;
 199        struct tree_entry *entry;
 200
 201        while (*p) {
 202                parent = *p;
 203                entry = rb_entry(parent, struct tree_entry, rb_node);
 204
 205                if (bytenr < entry->bytenr)
 206                        p = &(*p)->rb_left;
 207                else if (bytenr > entry->bytenr)
 208                        p = &(*p)->rb_right;
 209                else
 210                        return parent;
 211        }
 212
 213        rb_link_node(node, parent, p);
 214        rb_insert_color(node, root);
 215        return NULL;
 216}
 217
 218static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
 219{
 220        struct rb_node *n = root->rb_node;
 221        struct tree_entry *entry;
 222
 223        while (n) {
 224                entry = rb_entry(n, struct tree_entry, rb_node);
 225
 226                if (bytenr < entry->bytenr)
 227                        n = n->rb_left;
 228                else if (bytenr > entry->bytenr)
 229                        n = n->rb_right;
 230                else
 231                        return n;
 232        }
 233        return NULL;
 234}
 235
 236/*
  237 * walk up backref nodes until reaching the node that represents the tree root
 238 */
 239static struct backref_node *walk_up_backref(struct backref_node *node,
 240                                            struct backref_edge *edges[],
 241                                            int *index)
 242{
 243        struct backref_edge *edge;
 244        int idx = *index;
 245
 246        while (!list_empty(&node->upper)) {
 247                edge = list_entry(node->upper.next,
 248                                  struct backref_edge, list[LOWER]);
 249                edges[idx++] = edge;
 250                node = edge->node[UPPER];
 251        }
 252        *index = idx;
 253        return node;
 254}
 255
 256/*
  257 * walk down backref nodes to find the start of the next reference path
 258 */
 259static struct backref_node *walk_down_backref(struct backref_edge *edges[],
 260                                              int *index)
 261{
 262        struct backref_edge *edge;
 263        struct backref_node *lower;
 264        int idx = *index;
 265
 266        while (idx > 0) {
 267                edge = edges[idx - 1];
 268                lower = edge->node[LOWER];
 269                if (list_is_last(&edge->list[LOWER], &lower->upper)) {
 270                        idx--;
 271                        continue;
 272                }
 273                edge = list_entry(edge->list[LOWER].next,
 274                                  struct backref_edge, list[LOWER]);
 275                edges[idx - 1] = edge;
 276                *index = idx;
 277                return edge->node[UPPER];
 278        }
 279        *index = 0;
 280        return NULL;
 281}
 282
 283static void drop_node_buffer(struct backref_node *node)
 284{
 285        if (node->eb) {
 286                if (node->locked) {
 287                        btrfs_tree_unlock(node->eb);
 288                        node->locked = 0;
 289                }
 290                free_extent_buffer(node->eb);
 291                node->eb = NULL;
 292        }
 293}
 294
 295static void drop_backref_node(struct backref_cache *tree,
 296                              struct backref_node *node)
 297{
 298        BUG_ON(!node->lowest);
 299        BUG_ON(!list_empty(&node->upper));
 300
 301        drop_node_buffer(node);
 302        list_del(&node->lower);
 303
 304        rb_erase(&node->rb_node, &tree->rb_root);
 305        kfree(node);
 306}
 307
 308/*
 309 * remove a backref node from the backref cache
 310 */
 311static void remove_backref_node(struct backref_cache *cache,
 312                                struct backref_node *node)
 313{
 314        struct backref_node *upper;
 315        struct backref_edge *edge;
 316
 317        if (!node)
 318                return;
 319
 320        BUG_ON(!node->lowest);
 321        while (!list_empty(&node->upper)) {
 322                edge = list_entry(node->upper.next, struct backref_edge,
 323                                  list[LOWER]);
 324                upper = edge->node[UPPER];
 325                list_del(&edge->list[LOWER]);
 326                list_del(&edge->list[UPPER]);
 327                kfree(edge);
 328                /*
  329                 * add the node to the pending list if no other
  330                 * child block is cached.
 331                 */
 332                if (list_empty(&upper->lower)) {
 333                        list_add_tail(&upper->lower,
 334                                      &cache->pending[upper->level]);
 335                        upper->lowest = 1;
 336                }
 337        }
 338        drop_backref_node(cache, node);
 339}
 340
 341/*
 342 * find reloc tree by address of tree root
 343 */
 344static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
 345                                          u64 bytenr)
 346{
 347        struct rb_node *rb_node;
 348        struct mapping_node *node;
 349        struct btrfs_root *root = NULL;
 350
 351        spin_lock(&rc->reloc_root_tree.lock);
 352        rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
 353        if (rb_node) {
 354                node = rb_entry(rb_node, struct mapping_node, rb_node);
 355                root = (struct btrfs_root *)node->data;
 356        }
 357        spin_unlock(&rc->reloc_root_tree.lock);
 358        return root;
 359}
 360
 361static int is_cowonly_root(u64 root_objectid)
 362{
 363        if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
 364            root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
 365            root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
 366            root_objectid == BTRFS_DEV_TREE_OBJECTID ||
 367            root_objectid == BTRFS_TREE_LOG_OBJECTID ||
 368            root_objectid == BTRFS_CSUM_TREE_OBJECTID)
 369                return 1;
 370        return 0;
 371}
 372
 373static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
 374                                        u64 root_objectid)
 375{
 376        struct btrfs_key key;
 377
 378        key.objectid = root_objectid;
 379        key.type = BTRFS_ROOT_ITEM_KEY;
 380        if (is_cowonly_root(root_objectid))
 381                key.offset = 0;
 382        else
 383                key.offset = (u64)-1;
 384
 385        return btrfs_read_fs_root_no_name(fs_info, &key);
 386}
 387
 388#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 389static noinline_for_stack
 390struct btrfs_root *find_tree_root(struct reloc_control *rc,
 391                                  struct extent_buffer *leaf,
 392                                  struct btrfs_extent_ref_v0 *ref0)
 393{
 394        struct btrfs_root *root;
 395        u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
 396        u64 generation = btrfs_ref_generation_v0(leaf, ref0);
 397
 398        BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
 399
 400        root = read_fs_root(rc->extent_root->fs_info, root_objectid);
 401        BUG_ON(IS_ERR(root));
 402
 403        if (root->ref_cows &&
 404            generation != btrfs_root_generation(&root->root_item))
 405                return NULL;
 406
 407        return root;
 408}
 409#endif
 410
 411static noinline_for_stack
 412int find_inline_backref(struct extent_buffer *leaf, int slot,
 413                        unsigned long *ptr, unsigned long *end)
 414{
 415        struct btrfs_extent_item *ei;
 416        struct btrfs_tree_block_info *bi;
 417        u32 item_size;
 418
 419        item_size = btrfs_item_size_nr(leaf, slot);
 420#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 421        if (item_size < sizeof(*ei)) {
 422                WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
 423                return 1;
 424        }
 425#endif
 426        ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
 427        WARN_ON(!(btrfs_extent_flags(leaf, ei) &
 428                  BTRFS_EXTENT_FLAG_TREE_BLOCK));
 429
 430        if (item_size <= sizeof(*ei) + sizeof(*bi)) {
 431                WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
 432                return 1;
 433        }
 434
 435        bi = (struct btrfs_tree_block_info *)(ei + 1);
 436        *ptr = (unsigned long)(bi + 1);
 437        *end = (unsigned long)ei + item_size;
 438        return 0;
 439}
 440
 441/*
  442 * build backref tree for a given tree block. the root of the backref tree
  443 * corresponds to the tree block, leaves of the backref tree correspond to
  444 * roots of b-trees that reference the tree block.
  445 *
  446 * the basic idea of this function is to check backrefs of a given block
  447 * to find upper level blocks that reference the block, and then check
  448 * backrefs of these upper level blocks recursively. the recursion stops
  449 * when a tree root is reached or the backrefs for the block are cached.
 450 *
 451 * NOTE: if we find backrefs for a block are cached, we know backrefs
 452 * for all upper level blocks that directly/indirectly reference the
 453 * block are also cached.
 454 */
 455static struct backref_node *build_backref_tree(struct reloc_control *rc,
 456                                               struct backref_cache *cache,
 457                                               struct btrfs_key *node_key,
 458                                               int level, u64 bytenr)
 459{
 460        struct btrfs_path *path1;
 461        struct btrfs_path *path2;
 462        struct extent_buffer *eb;
 463        struct btrfs_root *root;
 464        struct backref_node *cur;
 465        struct backref_node *upper;
 466        struct backref_node *lower;
 467        struct backref_node *node = NULL;
 468        struct backref_node *exist = NULL;
 469        struct backref_edge *edge;
 470        struct rb_node *rb_node;
 471        struct btrfs_key key;
 472        unsigned long end;
 473        unsigned long ptr;
 474        LIST_HEAD(list);
 475        int ret;
 476        int err = 0;
 477
 478        path1 = btrfs_alloc_path();
 479        path2 = btrfs_alloc_path();
 480        if (!path1 || !path2) {
 481                err = -ENOMEM;
 482                goto out;
 483        }
 484
 485        node = kmalloc(sizeof(*node), GFP_NOFS);
 486        if (!node) {
 487                err = -ENOMEM;
 488                goto out;
 489        }
 490
 491        backref_node_init(node);
 492        node->bytenr = bytenr;
 493        node->owner = 0;
 494        node->level = level;
 495        node->lowest = 1;
 496        cur = node;
 497again:
 498        end = 0;
 499        ptr = 0;
 500        key.objectid = cur->bytenr;
 501        key.type = BTRFS_EXTENT_ITEM_KEY;
 502        key.offset = (u64)-1;
 503
 504        path1->search_commit_root = 1;
 505        path1->skip_locking = 1;
 506        ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
 507                                0, 0);
 508        if (ret < 0) {
 509                err = ret;
 510                goto out;
 511        }
 512        BUG_ON(!ret || !path1->slots[0]);
 513
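             /*
              * the search key used offset (u64)-1, so the path now points
              * just past the EXTENT_ITEM for cur->bytenr; step back onto it
              * before scanning the backrefs.
              */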
 514        path1->slots[0]--;
 515
 516        WARN_ON(cur->checked);
 517        if (!list_empty(&cur->upper)) {
 518                /*
  519                 * the backref was added previously when processing
  520                 * a backref of type BTRFS_TREE_BLOCK_REF_KEY
 521                 */
 522                BUG_ON(!list_is_singular(&cur->upper));
 523                edge = list_entry(cur->upper.next, struct backref_edge,
 524                                  list[LOWER]);
 525                BUG_ON(!list_empty(&edge->list[UPPER]));
 526                exist = edge->node[UPPER];
 527                /*
  528                 * add the upper level block to the pending list if we
  529                 * need to check its backrefs
 530                 */
 531                if (!exist->checked)
 532                        list_add_tail(&edge->list[UPPER], &list);
 533        } else {
 534                exist = NULL;
 535        }
 536
 537        while (1) {
 538                cond_resched();
 539                eb = path1->nodes[0];
 540
 541                if (ptr >= end) {
 542                        if (path1->slots[0] >= btrfs_header_nritems(eb)) {
 543                                ret = btrfs_next_leaf(rc->extent_root, path1);
 544                                if (ret < 0) {
 545                                        err = ret;
 546                                        goto out;
 547                                }
 548                                if (ret > 0)
 549                                        break;
 550                                eb = path1->nodes[0];
 551                        }
 552
 553                        btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
 554                        if (key.objectid != cur->bytenr) {
 555                                WARN_ON(exist);
 556                                break;
 557                        }
 558
 559                        if (key.type == BTRFS_EXTENT_ITEM_KEY) {
 560                                ret = find_inline_backref(eb, path1->slots[0],
 561                                                          &ptr, &end);
 562                                if (ret)
 563                                        goto next;
 564                        }
 565                }
 566
 567                if (ptr < end) {
 568                        /* update key for inline back ref */
 569                        struct btrfs_extent_inline_ref *iref;
 570                        iref = (struct btrfs_extent_inline_ref *)ptr;
 571                        key.type = btrfs_extent_inline_ref_type(eb, iref);
 572                        key.offset = btrfs_extent_inline_ref_offset(eb, iref);
 573                        WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
 574                                key.type != BTRFS_SHARED_BLOCK_REF_KEY);
 575                }
 576
 577                if (exist &&
 578                    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
 579                      exist->owner == key.offset) ||
 580                     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
 581                      exist->bytenr == key.offset))) {
 582                        exist = NULL;
 583                        goto next;
 584                }
 585
 586#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 587                if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
 588                    key.type == BTRFS_EXTENT_REF_V0_KEY) {
 589                        if (key.objectid == key.offset &&
 590                            key.type == BTRFS_EXTENT_REF_V0_KEY) {
 591                                struct btrfs_extent_ref_v0 *ref0;
 592                                ref0 = btrfs_item_ptr(eb, path1->slots[0],
 593                                                struct btrfs_extent_ref_v0);
 594                                root = find_tree_root(rc, eb, ref0);
 595                                if (root)
 596                                        cur->root = root;
 597                                else
 598                                        cur->old_root = 1;
 599                                break;
 600                        }
 601#else
 602                BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
 603                if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
 604#endif
 605                        if (key.objectid == key.offset) {
 606                                /*
 607                                 * only root blocks of reloc trees use
  608                                 * backrefs of this type.
 609                                 */
 610                                root = find_reloc_root(rc, cur->bytenr);
 611                                BUG_ON(!root);
 612                                cur->root = root;
 613                                break;
 614                        }
 615
 616                        edge = kzalloc(sizeof(*edge), GFP_NOFS);
 617                        if (!edge) {
 618                                err = -ENOMEM;
 619                                goto out;
 620                        }
 621                        rb_node = tree_search(&cache->rb_root, key.offset);
 622                        if (!rb_node) {
 623                                upper = kmalloc(sizeof(*upper), GFP_NOFS);
 624                                if (!upper) {
 625                                        kfree(edge);
 626                                        err = -ENOMEM;
 627                                        goto out;
 628                                }
 629                                backref_node_init(upper);
 630                                upper->bytenr = key.offset;
 631                                upper->owner = 0;
 632                                upper->level = cur->level + 1;
 633                                /*
  634                                 * backrefs for the upper level block aren't
  635                                 * cached, add the block to the pending list
 636                                 */
 637                                list_add_tail(&edge->list[UPPER], &list);
 638                        } else {
 639                                upper = rb_entry(rb_node, struct backref_node,
 640                                                 rb_node);
 641                                INIT_LIST_HEAD(&edge->list[UPPER]);
 642                        }
 643                        list_add(&edge->list[LOWER], &cur->upper);
 644                        edge->node[UPPER] = upper;
 645                        edge->node[LOWER] = cur;
 646
 647                        goto next;
 648                } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
 649                        goto next;
 650                }
 651
 652                /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
 653                root = read_fs_root(rc->extent_root->fs_info, key.offset);
 654                if (IS_ERR(root)) {
 655                        err = PTR_ERR(root);
 656                        goto out;
 657                }
 658
 659                if (btrfs_root_level(&root->root_item) == cur->level) {
 660                        /* tree root */
 661                        BUG_ON(btrfs_root_bytenr(&root->root_item) !=
 662                               cur->bytenr);
 663                        cur->root = root;
 664                        break;
 665                }
 666
 667                level = cur->level + 1;
 668
 669                /*
  670                 * search the tree to find upper level blocks that
  671                 * reference the block.
 672                 */
 673                path2->search_commit_root = 1;
 674                path2->skip_locking = 1;
 675                path2->lowest_level = level;
 676                ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
 677                path2->lowest_level = 0;
 678                if (ret < 0) {
 679                        err = ret;
 680                        goto out;
 681                }
 682                if (ret > 0 && path2->slots[level] > 0)
 683                        path2->slots[level]--;
 684
 685                eb = path2->nodes[level];
 686                WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
 687                        cur->bytenr);
 688
 689                lower = cur;
 690                for (; level < BTRFS_MAX_LEVEL; level++) {
 691                        if (!path2->nodes[level]) {
 692                                BUG_ON(btrfs_root_bytenr(&root->root_item) !=
 693                                       lower->bytenr);
 694                                lower->root = root;
 695                                break;
 696                        }
 697
 698                        edge = kzalloc(sizeof(*edge), GFP_NOFS);
 699                        if (!edge) {
 700                                err = -ENOMEM;
 701                                goto out;
 702                        }
 703
 704                        eb = path2->nodes[level];
 705                        rb_node = tree_search(&cache->rb_root, eb->start);
 706                        if (!rb_node) {
 707                                upper = kmalloc(sizeof(*upper), GFP_NOFS);
 708                                if (!upper) {
 709                                        kfree(edge);
 710                                        err = -ENOMEM;
 711                                        goto out;
 712                                }
 713                                backref_node_init(upper);
 714                                upper->bytenr = eb->start;
 715                                upper->owner = btrfs_header_owner(eb);
 716                                upper->level = lower->level + 1;
 717
 718                                /*
 719                                 * if we know the block isn't shared
  720                                 * we can avoid checking its backrefs.
 721                                 */
 722                                if (btrfs_block_can_be_shared(root, eb))
 723                                        upper->checked = 0;
 724                                else
 725                                        upper->checked = 1;
 726
 727                                /*
  728                                 * add the block to the pending list if we
  729                                 * need to check its backrefs. only blocks
  730                                 * at 'cur->level + 1' are added to the
  731                                 * tail of the pending list. this guarantees
  732                                 * we check backrefs from lower level
  733                                 * blocks to upper level blocks.
 734                                 */
 735                                if (!upper->checked &&
 736                                    level == cur->level + 1) {
 737                                        list_add_tail(&edge->list[UPPER],
 738                                                      &list);
 739                                } else
 740                                        INIT_LIST_HEAD(&edge->list[UPPER]);
 741                        } else {
 742                                upper = rb_entry(rb_node, struct backref_node,
 743                                                 rb_node);
 744                                BUG_ON(!upper->checked);
 745                                INIT_LIST_HEAD(&edge->list[UPPER]);
 746                        }
 747                        list_add_tail(&edge->list[LOWER], &lower->upper);
 748                        edge->node[UPPER] = upper;
 749                        edge->node[LOWER] = lower;
 750
 751                        if (rb_node)
 752                                break;
 753                        lower = upper;
 754                        upper = NULL;
 755                }
 756                btrfs_release_path(root, path2);
 757next:
 758                if (ptr < end) {
 759                        ptr += btrfs_extent_inline_ref_size(key.type);
 760                        if (ptr >= end) {
 761                                WARN_ON(ptr > end);
 762                                ptr = 0;
 763                                end = 0;
 764                        }
 765                }
 766                if (ptr >= end)
 767                        path1->slots[0]++;
 768        }
 769        btrfs_release_path(rc->extent_root, path1);
 770
 771        cur->checked = 1;
 772        WARN_ON(exist);
 773
 774        /* the pending list isn't empty, take the first block to process */
 775        if (!list_empty(&list)) {
 776                edge = list_entry(list.next, struct backref_edge, list[UPPER]);
 777                list_del_init(&edge->list[UPPER]);
 778                cur = edge->node[UPPER];
 779                goto again;
 780        }
 781
 782        /*
  783         * everything went well; connect the backref nodes and insert them
  784         * into the cache.
 785         */
 786        BUG_ON(!node->checked);
 787        rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
 788        BUG_ON(rb_node);
 789
 790        list_for_each_entry(edge, &node->upper, list[LOWER])
 791                list_add_tail(&edge->list[UPPER], &list);
 792
 793        while (!list_empty(&list)) {
 794                edge = list_entry(list.next, struct backref_edge, list[UPPER]);
 795                list_del_init(&edge->list[UPPER]);
 796                upper = edge->node[UPPER];
 797
 798                if (!RB_EMPTY_NODE(&upper->rb_node)) {
 799                        if (upper->lowest) {
 800                                list_del_init(&upper->lower);
 801                                upper->lowest = 0;
 802                        }
 803
 804                        list_add_tail(&edge->list[UPPER], &upper->lower);
 805                        continue;
 806                }
 807
 808                BUG_ON(!upper->checked);
 809                rb_node = tree_insert(&cache->rb_root, upper->bytenr,
 810                                      &upper->rb_node);
 811                BUG_ON(rb_node);
 812
 813                list_add_tail(&edge->list[UPPER], &upper->lower);
 814
 815                list_for_each_entry(edge, &upper->upper, list[LOWER])
 816                        list_add_tail(&edge->list[UPPER], &list);
 817        }
 818out:
 819        btrfs_free_path(path1);
 820        btrfs_free_path(path2);
 821        if (err) {
 822                INIT_LIST_HEAD(&list);
 823                upper = node;
 824                while (upper) {
 825                        if (RB_EMPTY_NODE(&upper->rb_node)) {
 826                                list_splice_tail(&upper->upper, &list);
 827                                kfree(upper);
 828                        }
 829
 830                        if (list_empty(&list))
 831                                break;
 832
 833                        edge = list_entry(list.next, struct backref_edge,
 834                                          list[LOWER]);
 835                        upper = edge->node[UPPER];
 836                        kfree(edge);
 837                }
 838                return ERR_PTR(err);
 839        }
 840        return node;
 841}
 842
 843/*
 844 * helper to add 'address of tree root -> reloc tree' mapping
 845 */
 846static int __add_reloc_root(struct btrfs_root *root)
 847{
 848        struct rb_node *rb_node;
 849        struct mapping_node *node;
 850        struct reloc_control *rc = root->fs_info->reloc_ctl;
 851
 852        node = kmalloc(sizeof(*node), GFP_NOFS);
 853        BUG_ON(!node);
 854
 855        node->bytenr = root->node->start;
 856        node->data = root;
 857
 858        spin_lock(&rc->reloc_root_tree.lock);
 859        rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
 860                              node->bytenr, &node->rb_node);
 861        spin_unlock(&rc->reloc_root_tree.lock);
 862        BUG_ON(rb_node);
 863
 864        list_add_tail(&root->root_list, &rc->reloc_roots);
 865        return 0;
 866}
 867
 868/*
 869 * helper to update/delete the 'address of tree root -> reloc tree'
 870 * mapping
 871 */
 872static int __update_reloc_root(struct btrfs_root *root, int del)
 873{
 874        struct rb_node *rb_node;
 875        struct mapping_node *node = NULL;
 876        struct reloc_control *rc = root->fs_info->reloc_ctl;
 877
 878        spin_lock(&rc->reloc_root_tree.lock);
 879        rb_node = tree_search(&rc->reloc_root_tree.rb_root,
 880                              root->commit_root->start);
 881        if (rb_node) {
 882                node = rb_entry(rb_node, struct mapping_node, rb_node);
 883                rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
 884        }
 885        spin_unlock(&rc->reloc_root_tree.lock);
 886
 887        BUG_ON((struct btrfs_root *)node->data != root);
 888
 889        if (!del) {
 890                spin_lock(&rc->reloc_root_tree.lock);
 891                node->bytenr = root->node->start;
 892                rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
 893                                      node->bytenr, &node->rb_node);
 894                spin_unlock(&rc->reloc_root_tree.lock);
 895                BUG_ON(rb_node);
 896        } else {
 897                list_del_init(&root->root_list);
 898                kfree(node);
 899        }
 900        return 0;
 901}
 902
 903/*
  904 * create a reloc tree for a given fs tree. the reloc tree is just a
  905 * snapshot of the fs tree with a special root objectid.
 906 */
 907int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
 908                          struct btrfs_root *root)
 909{
 910        struct btrfs_root *reloc_root;
 911        struct extent_buffer *eb;
 912        struct btrfs_root_item *root_item;
 913        struct btrfs_key root_key;
 914        int ret;
 915
 916        if (root->reloc_root) {
 917                reloc_root = root->reloc_root;
 918                reloc_root->last_trans = trans->transid;
 919                return 0;
 920        }
 921
 922        if (!root->fs_info->reloc_ctl ||
 923            !root->fs_info->reloc_ctl->create_reloc_root ||
 924            root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
 925                return 0;
 926
 927        root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
 928        BUG_ON(!root_item);
 929
 930        root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
 931        root_key.type = BTRFS_ROOT_ITEM_KEY;
 932        root_key.offset = root->root_key.objectid;
 933
 934        ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
 935                              BTRFS_TREE_RELOC_OBJECTID);
 936        BUG_ON(ret);
 937
 938        btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
 939        memcpy(root_item, &root->root_item, sizeof(*root_item));
 940        btrfs_set_root_refs(root_item, 1);
 941        btrfs_set_root_bytenr(root_item, eb->start);
 942        btrfs_set_root_level(root_item, btrfs_header_level(eb));
 943        btrfs_set_root_generation(root_item, trans->transid);
 944        memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key));
 945        root_item->drop_level = 0;
 946
 947        btrfs_tree_unlock(eb);
 948        free_extent_buffer(eb);
 949
 950        ret = btrfs_insert_root(trans, root->fs_info->tree_root,
 951                                &root_key, root_item);
 952        BUG_ON(ret);
 953        kfree(root_item);
 954
 955        reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
 956                                                 &root_key);
 957        BUG_ON(IS_ERR(reloc_root));
 958        reloc_root->last_trans = trans->transid;
 959
 960        __add_reloc_root(reloc_root);
 961        root->reloc_root = reloc_root;
 962        return 0;
 963}
 964
 965/*
 966 * update root item of reloc tree
 967 */
 968int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
 969                            struct btrfs_root *root)
 970{
 971        struct btrfs_root *reloc_root;
 972        struct btrfs_root_item *root_item;
 973        int del = 0;
 974        int ret;
 975
 976        if (!root->reloc_root)
 977                return 0;
 978
 979        reloc_root = root->reloc_root;
 980        root_item = &reloc_root->root_item;
 981
 982        if (btrfs_root_refs(root_item) == 0) {
 983                root->reloc_root = NULL;
 984                del = 1;
 985        }
 986
 987        __update_reloc_root(reloc_root, del);
 988
 989        if (reloc_root->commit_root != reloc_root->node) {
 990                btrfs_set_root_node(root_item, reloc_root->node);
 991                free_extent_buffer(reloc_root->commit_root);
 992                reloc_root->commit_root = btrfs_root_node(reloc_root);
 993        }
 994
 995        ret = btrfs_update_root(trans, root->fs_info->tree_root,
 996                                &reloc_root->root_key, root_item);
 997        BUG_ON(ret);
 998        return 0;
 999}
1000
1001/*
1002 * helper to find first cached inode with inode number >= objectid
1003 * in a subvolume
1004 */
1005static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
1006{
1007        struct rb_node *node;
1008        struct rb_node *prev;
1009        struct btrfs_inode *entry;
1010        struct inode *inode;
1011
1012        spin_lock(&root->inode_lock);
1013again:
1014        node = root->inode_tree.rb_node;
1015        prev = NULL;
1016        while (node) {
1017                prev = node;
1018                entry = rb_entry(node, struct btrfs_inode, rb_node);
1019
1020                if (objectid < entry->vfs_inode.i_ino)
1021                        node = node->rb_left;
1022                else if (objectid > entry->vfs_inode.i_ino)
1023                        node = node->rb_right;
1024                else
1025                        break;
1026        }
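             /*
              * no exact match; scan forward from the last visited node to
              * the first cached inode with an inode number >= objectid.
              */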
1027        if (!node) {
1028                while (prev) {
1029                        entry = rb_entry(prev, struct btrfs_inode, rb_node);
1030                        if (objectid <= entry->vfs_inode.i_ino) {
1031                                node = prev;
1032                                break;
1033                        }
1034                        prev = rb_next(prev);
1035                }
1036        }
1037        while (node) {
1038                entry = rb_entry(node, struct btrfs_inode, rb_node);
1039                inode = igrab(&entry->vfs_inode);
1040                if (inode) {
1041                        spin_unlock(&root->inode_lock);
1042                        return inode;
1043                }
1044
1045                objectid = entry->vfs_inode.i_ino + 1;
1046                if (cond_resched_lock(&root->inode_lock))
1047                        goto again;
1048
1049                node = rb_next(node);
1050        }
1051        spin_unlock(&root->inode_lock);
1052        return NULL;
1053}
1054
1055static int in_block_group(u64 bytenr,
1056                          struct btrfs_block_group_cache *block_group)
1057{
1058        if (bytenr >= block_group->key.objectid &&
1059            bytenr < block_group->key.objectid + block_group->key.offset)
1060                return 1;
1061        return 0;
1062}
1063
1064/*
1065 * get new location of data
1066 */
1067static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1068                            u64 bytenr, u64 num_bytes)
1069{
1070        struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1071        struct btrfs_path *path;
1072        struct btrfs_file_extent_item *fi;
1073        struct extent_buffer *leaf;
1074        int ret;
1075
1076        path = btrfs_alloc_path();
1077        if (!path)
1078                return -ENOMEM;
1079
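             /*
              * the data inode's file extents are indexed by the original
              * extent address minus index_cnt; the disk bytenr of the file
              * extent found at that offset is the data's new location.
              */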
1080        bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1081        ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
1082                                       bytenr, 0);
1083        if (ret < 0)
1084                goto out;
1085        if (ret > 0) {
1086                ret = -ENOENT;
1087                goto out;
1088        }
1089
1090        leaf = path->nodes[0];
1091        fi = btrfs_item_ptr(leaf, path->slots[0],
1092                            struct btrfs_file_extent_item);
1093
1094        BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1095               btrfs_file_extent_compression(leaf, fi) ||
1096               btrfs_file_extent_encryption(leaf, fi) ||
1097               btrfs_file_extent_other_encoding(leaf, fi));
1098
1099        if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1100                ret = 1;
1101                goto out;
1102        }
1103
1104        if (new_bytenr)
1105                *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1106        ret = 0;
1107out:
1108        btrfs_free_path(path);
1109        return ret;
1110}
1111
1112/*
1113 * update file extent items in the tree leaf to point to
1114 * the new locations.
1115 */
1116static int replace_file_extents(struct btrfs_trans_handle *trans,
1117                                struct reloc_control *rc,
1118                                struct btrfs_root *root,
1119                                struct extent_buffer *leaf,
1120                                struct list_head *inode_list)
1121{
1122        struct btrfs_key key;
1123        struct btrfs_file_extent_item *fi;
1124        struct inode *inode = NULL;
1125        struct inodevec *ivec = NULL;
1126        u64 parent;
1127        u64 bytenr;
1128        u64 new_bytenr;
1129        u64 num_bytes;
1130        u64 end;
1131        u32 nritems;
1132        u32 i;
1133        int ret;
1134        int first = 1;
1135        int dirty = 0;
1136
1137        if (rc->stage != UPDATE_DATA_PTRS)
1138                return 0;
1139
1140        /* reloc trees always use full backref */
1141        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1142                parent = leaf->start;
1143        else
1144                parent = 0;
1145
1146        nritems = btrfs_header_nritems(leaf);
1147        for (i = 0; i < nritems; i++) {
1148                cond_resched();
1149                btrfs_item_key_to_cpu(leaf, &key, i);
1150                if (key.type != BTRFS_EXTENT_DATA_KEY)
1151                        continue;
1152                fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1153                if (btrfs_file_extent_type(leaf, fi) ==
1154                    BTRFS_FILE_EXTENT_INLINE)
1155                        continue;
1156                bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1157                num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1158                if (bytenr == 0)
1159                        continue;
1160                if (!in_block_group(bytenr, rc->block_group))
1161                        continue;
1162
1163                /*
 1164                 * if we are modifying a block in the fs tree, wait for readpage
1165                 * to complete and drop the extent cache
1166                 */
1167                if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
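                             /*
                              * note: the igrabbed inodes are collected into
                              * inodevecs on inode_list; the references are
                              * assumed to be dropped later by the caller.
                              */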
1168                        if (!ivec || ivec->nr == INODEVEC_SIZE) {
1169                                ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1170                                BUG_ON(!ivec);
1171                                ivec->nr = 0;
1172                                list_add_tail(&ivec->list, inode_list);
1173                        }
1174                        if (first) {
1175                                inode = find_next_inode(root, key.objectid);
1176                                if (inode)
1177                                        ivec->inode[ivec->nr++] = inode;
1178                                first = 0;
1179                        } else if (inode && inode->i_ino < key.objectid) {
1180                                inode = find_next_inode(root, key.objectid);
1181                                if (inode)
1182                                        ivec->inode[ivec->nr++] = inode;
1183                        }
1184                        if (inode && inode->i_ino == key.objectid) {
1185                                end = key.offset +
1186                                      btrfs_file_extent_num_bytes(leaf, fi);
1187                                WARN_ON(!IS_ALIGNED(key.offset,
1188                                                    root->sectorsize));
1189                                WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1190                                end--;
1191                                ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1192                                                      key.offset, end,
1193                                                      GFP_NOFS);
1194                                if (!ret)
1195                                        continue;
1196
1197                                btrfs_drop_extent_cache(inode, key.offset, end,
1198                                                        1);
1199                                unlock_extent(&BTRFS_I(inode)->io_tree,
1200                                              key.offset, end, GFP_NOFS);
1201                        }
1202                }
1203
1204                ret = get_new_location(rc->data_inode, &new_bytenr,
1205                                       bytenr, num_bytes);
1206                if (ret > 0)
1207                        continue;
1208                BUG_ON(ret < 0);
1209
1210                btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1211                dirty = 1;
1212
1213                key.offset -= btrfs_file_extent_offset(leaf, fi);
1214                ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1215                                           num_bytes, parent,
1216                                           btrfs_header_owner(leaf),
1217                                           key.objectid, key.offset);
1218                BUG_ON(ret);
1219
1220                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1221                                        parent, btrfs_header_owner(leaf),
1222                                        key.objectid, key.offset);
1223                BUG_ON(ret);
1224        }
1225        if (dirty)
1226                btrfs_mark_buffer_dirty(leaf);
1227        return 0;
1228}
1229
1230static noinline_for_stack
1231int memcmp_node_keys(struct extent_buffer *eb, int slot,
1232                     struct btrfs_path *path, int level)
1233{
1234        struct btrfs_disk_key key1;
1235        struct btrfs_disk_key key2;
1236        btrfs_node_key(eb, &key1, slot);
1237        btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1238        return memcmp(&key1, &key2, sizeof(key1));
1239}
1240
1241/*
1242 * try to replace tree blocks in fs tree with the new blocks
 1243 * in the reloc tree. tree blocks that haven't been modified since the
 1244 * reloc tree was created can be replaced.
1245 *
1246 * if a block was replaced, level of the block + 1 is returned.
1247 * if no block got replaced, 0 is returned. if there are other
1248 * errors, a negative error number is returned.
1249 */
1250static int replace_path(struct btrfs_trans_handle *trans,
1251                        struct btrfs_root *dest, struct btrfs_root *src,
1252                        struct btrfs_path *path, struct btrfs_key *next_key,
1253                        struct extent_buffer **leaf,
1254                        int lowest_level, int max_level)
1255{
1256        struct extent_buffer *eb;
1257        struct extent_buffer *parent;
1258        struct btrfs_key key;
1259        u64 old_bytenr;
1260        u64 new_bytenr;
1261        u64 old_ptr_gen;
1262        u64 new_ptr_gen;
1263        u64 last_snapshot;
1264        u32 blocksize;
1265        int level;
1266        int ret;
1267        int slot;
1268
1269        BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1270        BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1271        BUG_ON(lowest_level > 1 && leaf);
1272
1273        last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1274
1275        slot = path->slots[lowest_level];
1276        btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1277
1278        eb = btrfs_lock_root_node(dest);
1279        btrfs_set_lock_blocking(eb);
1280        level = btrfs_header_level(eb);
1281
1282        if (level < lowest_level) {
1283                btrfs_tree_unlock(eb);
1284                free_extent_buffer(eb);
1285                return 0;
1286        }
1287
1288        ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1289        BUG_ON(ret);
1290        btrfs_set_lock_blocking(eb);
1291
1292        if (next_key) {
1293                next_key->objectid = (u64)-1;
1294                next_key->type = (u8)-1;
1295                next_key->offset = (u64)-1;
1296        }
1297
1298        parent = eb;
1299        while (1) {
1300                level = btrfs_header_level(parent);
1301                BUG_ON(level < lowest_level);
1302
1303                ret = btrfs_bin_search(parent, &key, level, &slot);
1304                if (ret && slot > 0)
1305                        slot--;
1306
1307                if (next_key && slot + 1 < btrfs_header_nritems(parent))
1308                        btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1309
1310                old_bytenr = btrfs_node_blockptr(parent, slot);
1311                blocksize = btrfs_level_size(dest, level - 1);
1312                old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1313
1314                if (level <= max_level) {
1315                        eb = path->nodes[level];
1316                        new_bytenr = btrfs_node_blockptr(eb,
1317                                                        path->slots[level]);
1318                        new_ptr_gen = btrfs_node_ptr_generation(eb,
1319                                                        path->slots[level]);
1320                } else {
1321                        new_bytenr = 0;
1322                        new_ptr_gen = 0;
1323                }
1324
1325                if (new_bytenr > 0 && new_bytenr == old_bytenr) {
1326                        WARN_ON(1);
1327                        ret = level;
1328                        break;
1329                }
1330
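                     /*
                      * the fs tree block has no counterpart at this level,
                      * was modified after the reloc tree was created (ptr
                      * generation newer than last_snapshot), or its key no
                      * longer matches. it can't be swapped here, so COW one
                      * level down in the fs tree and keep walking.
                      */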
1331                if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1332                    memcmp_node_keys(parent, slot, path, level)) {
1333                        if (level <= lowest_level && !leaf) {
1334                                ret = 0;
1335                                break;
1336                        }
1337
1338                        eb = read_tree_block(dest, old_bytenr, blocksize,
1339                                             old_ptr_gen);
1340                        btrfs_tree_lock(eb);
1341                        ret = btrfs_cow_block(trans, dest, eb, parent,
1342                                              slot, &eb);
1343                        BUG_ON(ret);
1344                        btrfs_set_lock_blocking(eb);
1345
1346                        if (level <= lowest_level) {
1347                                *leaf = eb;
1348                                ret = 0;
1349                                break;
1350                        }
1351
1352                        btrfs_tree_unlock(parent);
1353                        free_extent_buffer(parent);
1354
1355                        parent = eb;
1356                        continue;
1357                }
1358
1359                btrfs_node_key_to_cpu(path->nodes[level], &key,
1360                                      path->slots[level]);
1361                btrfs_release_path(src, path);
1362
1363                path->lowest_level = level;
1364                ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1365                path->lowest_level = 0;
1366                BUG_ON(ret);
1367
1368                /*
1369                 * swap blocks in fs tree and reloc tree.
1370                 */
1371                btrfs_set_node_blockptr(parent, slot, new_bytenr);
1372                btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1373                btrfs_mark_buffer_dirty(parent);
1374
1375                btrfs_set_node_blockptr(path->nodes[level],
1376                                        path->slots[level], old_bytenr);
1377                btrfs_set_node_ptr_generation(path->nodes[level],
1378                                              path->slots[level], old_ptr_gen);
1379                btrfs_mark_buffer_dirty(path->nodes[level]);
1380
1381                ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1382                                        path->nodes[level]->start,
1383                                        src->root_key.objectid, level - 1, 0);
1384                BUG_ON(ret);
1385                ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1386                                        0, dest->root_key.objectid, level - 1,
1387                                        0);
1388                BUG_ON(ret);
1389
1390                ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1391                                        path->nodes[level]->start,
1392                                        src->root_key.objectid, level - 1, 0);
1393                BUG_ON(ret);
1394
1395                ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1396                                        0, dest->root_key.objectid, level - 1,
1397                                        0);
1398                BUG_ON(ret);
1399
1400                btrfs_unlock_up_safe(path, 0);
1401
1402                ret = level;
1403                break;
1404        }
1405        btrfs_tree_unlock(parent);
1406        free_extent_buffer(parent);
1407        return ret;
1408}
1409
1410/*
 1411 * helper to find the next relocated block in the reloc tree
1412 */
1413static noinline_for_stack
1414int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1415                       int *level)
1416{
1417        struct extent_buffer *eb;
1418        int i;
1419        u64 last_snapshot;
1420        u32 nritems;
1421
1422        last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1423
1424        for (i = 0; i < *level; i++) {
1425                free_extent_buffer(path->nodes[i]);
1426                path->nodes[i] = NULL;
1427        }
1428
1429        for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1430                eb = path->nodes[i];
1431                nritems = btrfs_header_nritems(eb);
1432                while (path->slots[i] + 1 < nritems) {
1433                        path->slots[i]++;
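                             /*
                              * pointers with generation <= last_snapshot
                              * reference blocks that predate the reloc tree
                              * and were not relocated; skip them.
                              */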
1434                        if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1435                            last_snapshot)
1436                                continue;
1437
1438                        *level = i;
1439                        return 0;
1440                }
1441                free_extent_buffer(path->nodes[i]);
1442                path->nodes[i] = NULL;
1443        }
1444        return 1;
1445}
1446
1447/*
 1448 * walk down the reloc tree to find the relocated block at the lowest level
1449 */
1450static noinline_for_stack
1451int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1452                         int *level)
1453{
1454        struct extent_buffer *eb = NULL;
1455        int i;
1456        u64 bytenr;
1457        u64 ptr_gen = 0;
1458        u64 last_snapshot;
1459        u32 blocksize;
1460        u32 nritems;
1461
1462        last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1463
1464        for (i = *level; i > 0; i--) {
1465                eb = path->nodes[i];
1466                nritems = btrfs_header_nritems(eb);
1467                while (path->slots[i] < nritems) {
1468                        ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1469                        if (ptr_gen > last_snapshot)
1470                                break;
1471                        path->slots[i]++;
1472                }
1473                if (path->slots[i] >= nritems) {
1474                        if (i == *level)
1475                                break;
1476                        *level = i + 1;
1477                        return 0;
1478                }
1479                if (i == 1) {
1480                        *level = i;
1481                        return 0;
1482                }
1483
1484                bytenr = btrfs_node_blockptr(eb, path->slots[i]);
1485                blocksize = btrfs_level_size(root, i - 1);
1486                eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
1487                BUG_ON(btrfs_header_level(eb) != i - 1);
1488                path->nodes[i - 1] = eb;
1489                path->slots[i - 1] = 0;
1490        }
1491        return 1;
1492}
1493
1494/*
1495 * invalidate the extent cache for file extents whose keys are in the
1496 * range [min_key, max_key)
1497 */
1498static int invalidate_extent_cache(struct btrfs_root *root,
1499                                   struct btrfs_key *min_key,
1500                                   struct btrfs_key *max_key)
1501{
1502        struct inode *inode = NULL;
1503        u64 objectid;
1504        u64 start, end;
1505
1506        objectid = min_key->objectid;
1507        while (1) {
1508                cond_resched();
1509                iput(inode);
1510
1511                if (objectid > max_key->objectid)
1512                        break;
1513
1514                inode = find_next_inode(root, objectid);
1515                if (!inode)
1516                        break;
1517
1518                if (inode->i_ino > max_key->objectid) {
1519                        iput(inode);
1520                        break;
1521                }
1522
1523                objectid = inode->i_ino + 1;
1524                if (!S_ISREG(inode->i_mode))
1525                        continue;
1526
1527                if (unlikely(min_key->objectid == inode->i_ino)) {
1528                        if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1529                                continue;
1530                        if (min_key->type < BTRFS_EXTENT_DATA_KEY)
1531                                start = 0;
1532                        else {
1533                                start = min_key->offset;
1534                                WARN_ON(!IS_ALIGNED(start, root->sectorsize));
1535                        }
1536                } else {
1537                        start = 0;
1538                }
1539
1540                if (unlikely(max_key->objectid == inode->i_ino)) {
1541                        if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1542                                continue;
1543                        if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
1544                                end = (u64)-1;
1545                        } else {
1546                                if (max_key->offset == 0)
1547                                        continue;
1548                                end = max_key->offset;
1549                                WARN_ON(!IS_ALIGNED(end, root->sectorsize));
1550                                end--;
1551                        }
1552                } else {
1553                        end = (u64)-1;
1554                }
1555
1556                /* the call to lock_extent waits for readpage to complete */
1557                lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1558                btrfs_drop_extent_cache(inode, start, end, 1);
1559                unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
1560        }
1561        return 0;
1562}
1563
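    /*
     * helper to find the key of the next slot at or above @level in
     * the path.  returns 0 and fills in @key on success, 1 if every
     * level in the path is already at its last slot.
     */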
1564static int find_next_key(struct btrfs_path *path, int level,
1565                         struct btrfs_key *key)
1566
1567{
1568        while (level < BTRFS_MAX_LEVEL) {
1569                if (!path->nodes[level])
1570                        break;
1571                if (path->slots[level] + 1 <
1572                    btrfs_header_nritems(path->nodes[level])) {
1573                        btrfs_node_key_to_cpu(path->nodes[level], key,
1574                                              path->slots[level] + 1);
1575                        return 0;
1576                }
1577                level++;
1578        }
1579        return 1;
1580}
1581
1582/*
1583 * merge the relocated tree blocks in the reloc tree with the
1584 * corresponding fs tree.
1585 */
1586static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1587                                               struct btrfs_root *root)
1588{
1589        LIST_HEAD(inode_list);
1590        struct btrfs_key key;
1591        struct btrfs_key next_key;
1592        struct btrfs_trans_handle *trans;
1593        struct btrfs_root *reloc_root;
1594        struct btrfs_root_item *root_item;
1595        struct btrfs_path *path;
1596        struct extent_buffer *leaf = NULL;
1597        unsigned long nr;
1598        int level;
1599        int max_level;
1600        int replaced = 0;
1601        int ret;
1602        int err = 0;
1603
1604        path = btrfs_alloc_path();
1605        if (!path)
1606                return -ENOMEM;
1607
1608        reloc_root = root->reloc_root;
1609        root_item = &reloc_root->root_item;
1610
1611        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
1612                level = btrfs_root_level(root_item);
1613                extent_buffer_get(reloc_root->node);
1614                path->nodes[level] = reloc_root->node;
1615                path->slots[level] = 0;
1616        } else {
1617                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
1618
1619                level = root_item->drop_level;
1620                BUG_ON(level == 0);
1621                path->lowest_level = level;
1622                ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
1623                path->lowest_level = 0;
1624                if (ret < 0) {
1625                        btrfs_free_path(path);
1626                        return ret;
1627                }
1628
1629                btrfs_node_key_to_cpu(path->nodes[level], &next_key,
1630                                      path->slots[level]);
1631                WARN_ON(memcmp(&key, &next_key, sizeof(key)));
1632
1633                btrfs_unlock_up_safe(path, 0);
1634        }
1635
1636        if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
1637                trans = btrfs_start_transaction(root, 1);
1638
1639                leaf = path->nodes[0];
1640                btrfs_item_key_to_cpu(leaf, &key, 0);
1641                btrfs_release_path(reloc_root, path);
1642
1643                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1644                if (ret < 0) {
1645                        err = ret;
1646                        goto out;
1647                }
1648
1649                leaf = path->nodes[0];
1650                btrfs_unlock_up_safe(path, 1);
1651                ret = replace_file_extents(trans, rc, root, leaf,
1652                                           &inode_list);
1653                if (ret < 0)
1654                        err = ret;
1655                goto out;
1656        }
1657
1658        memset(&next_key, 0, sizeof(next_key));
1659
1660        while (1) {
1661                leaf = NULL;
1662                replaced = 0;
1663                trans = btrfs_start_transaction(root, 1);
1664                max_level = level;
1665
1666                ret = walk_down_reloc_tree(reloc_root, path, &level);
1667                if (ret < 0) {
1668                        err = ret;
1669                        goto out;
1670                }
1671                if (ret > 0)
1672                        break;
1673
1674                if (!find_next_key(path, level, &key) &&
1675                    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1676                        ret = 0;
1677                } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1678                        ret = replace_path(trans, root, reloc_root,
1679                                           path, &next_key, &leaf,
1680                                           level, max_level);
1681                } else {
1682                        ret = replace_path(trans, root, reloc_root,
1683                                           path, &next_key, NULL,
1684                                           level, max_level);
1685                }
1686                if (ret < 0) {
1687                        err = ret;
1688                        goto out;
1689                }
1690
1691                if (ret > 0) {
1692                        level = ret;
1693                        btrfs_node_key_to_cpu(path->nodes[level], &key,
1694                                              path->slots[level]);
1695                        replaced = 1;
1696                } else if (leaf) {
1697                        /*
1698                         * no block got replaced, try replacing file extents
1699                         */
1700                        btrfs_item_key_to_cpu(leaf, &key, 0);
1701                        ret = replace_file_extents(trans, rc, root, leaf,
1702                                                   &inode_list);
1703                        btrfs_tree_unlock(leaf);
1704                        free_extent_buffer(leaf);
1705                        BUG_ON(ret < 0);
1706                }
1707
1708                ret = walk_up_reloc_tree(reloc_root, path, &level);
1709                if (ret > 0)
1710                        break;
1711
1712                BUG_ON(level == 0);
1713                /*
1714                 * save the merging progress in the drop_progress.
1715                 * this is OK since root refs == 1 in this case.
1716                 */
1717                btrfs_node_key(path->nodes[level], &root_item->drop_progress,
1718                               path->slots[level]);
1719                root_item->drop_level = level;
1720
1721                nr = trans->blocks_used;
1722                btrfs_end_transaction(trans, root);
1723
1724                btrfs_btree_balance_dirty(root, nr);
1725
1726                if (replaced && rc->stage == UPDATE_DATA_PTRS)
1727                        invalidate_extent_cache(root, &key, &next_key);
1728        }
1729
1730        /*
1731         * handle the case where only one block in the fs tree needs to
1732         * be relocated and that block is the tree root.
1733         */
1734        leaf = btrfs_lock_root_node(root);
1735        ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
1736        btrfs_tree_unlock(leaf);
1737        free_extent_buffer(leaf);
1738        if (ret < 0)
1739                err = ret;
1740out:
1741        btrfs_free_path(path);
1742
1743        if (err == 0) {
1744                memset(&root_item->drop_progress, 0,
1745                       sizeof(root_item->drop_progress));
1746                root_item->drop_level = 0;
1747                btrfs_set_root_refs(root_item, 0);
1748        }
1749
1750        nr = trans->blocks_used;
1751        btrfs_end_transaction(trans, root);
1752
1753        btrfs_btree_balance_dirty(root, nr);
1754
1755        /*
1756         * put inodes while we aren't holding the tree locks
1757         */
1758        while (!list_empty(&inode_list)) {
1759                struct inodevec *ivec;
1760                ivec = list_entry(inode_list.next, struct inodevec, list);
1761                list_del(&ivec->list);
1762                while (ivec->nr > 0) {
1763                        ivec->nr--;
1764                        iput(ivec->inode[ivec->nr]);
1765                }
1766                kfree(ivec);
1767        }
1768
1769        if (replaced && rc->stage == UPDATE_DATA_PTRS)
1770                invalidate_extent_cache(root, &key, &next_key);
1771
1772        return err;
1773}
1774
1775/*
1776 * callback for the work threads.
1777 * this function merges the reloc tree with the corresponding fs tree,
1778 * and then drops the reloc tree.
1779 */
1780static void merge_func(struct btrfs_work *work)
1781{
1782        struct btrfs_trans_handle *trans;
1783        struct btrfs_root *root;
1784        struct btrfs_root *reloc_root;
1785        struct async_merge *async;
1786
1787        async = container_of(work, struct async_merge, work);
1788        reloc_root = async->root;
1789
1790        if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1791                root = read_fs_root(reloc_root->fs_info,
1792                                    reloc_root->root_key.offset);
1793                BUG_ON(IS_ERR(root));
1794                BUG_ON(root->reloc_root != reloc_root);
1795
1796                merge_reloc_root(async->rc, root);
1797
1798                trans = btrfs_start_transaction(root, 1);
1799                btrfs_update_reloc_root(trans, root);
1800                btrfs_end_transaction(trans, root);
1801        }
1802
1803        btrfs_drop_snapshot(reloc_root, 0);
1804
1805        if (atomic_dec_and_test(async->num_pending))
1806                complete(async->done);
1807
1808        kfree(async);
1809}
1810
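    /*
     * merge all reloc trees on the rc->reloc_roots list with their
     * corresponding fs trees.  the merging is done by the relocation
     * worker threads, this function waits until all of them finish.
     */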
1811static int merge_reloc_roots(struct reloc_control *rc)
1812{
1813        struct async_merge *async;
1814        struct btrfs_root *root;
1815        struct completion done;
1816        atomic_t num_pending;
1817
1818        init_completion(&done);
1819        atomic_set(&num_pending, 1);
1820
1821        while (!list_empty(&rc->reloc_roots)) {
1822                root = list_entry(rc->reloc_roots.next,
1823                                  struct btrfs_root, root_list);
1824                list_del_init(&root->root_list);
1825
1826                async = kmalloc(sizeof(*async), GFP_NOFS);
1827                BUG_ON(!async);
1828                async->work.func = merge_func;
1829                async->work.flags = 0;
1830                async->rc = rc;
1831                async->root = root;
1832                async->done = &done;
1833                async->num_pending = &num_pending;
1834                atomic_inc(&num_pending);
1835                btrfs_queue_worker(&rc->workers, &async->work);
1836        }
1837
1838        if (!atomic_dec_and_test(&num_pending))
1839                wait_for_completion(&done);
1840
1841        BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1842        return 0;
1843}
1844
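    /*
     * free all tree_block entries in the rb tree of blocks to process
     */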
1845static void free_block_list(struct rb_root *blocks)
1846{
1847        struct tree_block *block;
1848        struct rb_node *rb_node;
1849        while ((rb_node = rb_first(blocks))) {
1850                block = rb_entry(rb_node, struct tree_block, rb_node);
1851                rb_erase(rb_node, blocks);
1852                kfree(block);
1853        }
1854}
1855
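    /*
     * make sure the fs tree that owns @reloc_root has been recorded in
     * the current transaction
     */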
1856static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1857                                      struct btrfs_root *reloc_root)
1858{
1859        struct btrfs_root *root;
1860
1861        if (reloc_root->last_trans == trans->transid)
1862                return 0;
1863
1864        root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
1865        BUG_ON(IS_ERR(root));
1866        BUG_ON(root->reloc_root != reloc_root);
1867
1868        return btrfs_record_root_in_trans(trans, root);
1869}
1870
1871/*
1872 * select one tree from the trees that reference the block.
1873 * for blocks in reference counted trees, we prefer the reloc tree.
1874 * if no reloc tree is found and reloc_only is true, NULL is returned.
1875 */
1876static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1877                                            struct backref_node *node,
1878                                            struct backref_edge *edges[],
1879                                            int *nr, int reloc_only)
1880{
1881        struct backref_node *next;
1882        struct btrfs_root *root;
1883        int index;
1884        int loop = 0;
1885again:
1886        index = 0;
1887        next = node;
1888        while (1) {
1889                cond_resched();
1890                next = walk_up_backref(next, edges, &index);
1891                root = next->root;
1892                if (!root) {
1893                        BUG_ON(!node->old_root);
1894                        goto skip;
1895                }
1896
1897                /* no other choice for a non-reference counted tree */
1898                if (!root->ref_cows) {
1899                        BUG_ON(reloc_only);
1900                        break;
1901                }
1902
1903                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1904                        record_reloc_root_in_trans(trans, root);
1905                        break;
1906                }
1907
1908                if (loop) {
1909                        btrfs_record_root_in_trans(trans, root);
1910                        break;
1911                }
1912
1913                if (reloc_only || next != node) {
1914                        if (!root->reloc_root)
1915                                btrfs_record_root_in_trans(trans, root);
1916                        root = root->reloc_root;
1917                        /*
1918                         * if the reloc tree was created in the current
1919                         * transaction, there is no node in the backref tree
1920                         * that corresponds to the root of the reloc tree.
1921                         */
1922                        if (btrfs_root_last_snapshot(&root->root_item) ==
1923                            trans->transid - 1)
1924                                break;
1925                }
1926skip:
1927                root = NULL;
1928                next = walk_down_backref(edges, &index);
1929                if (!next || next->level <= node->level)
1930                        break;
1931        }
1932
1933        if (!root && !loop && !reloc_only) {
1934                loop = 1;
1935                goto again;
1936        }
1937
1938        if (root)
1939                *nr = index;
1940        else
1941                *nr = 0;
1942
1943        return root;
1944}
1945
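    /*
     * wrapper around __select_one_root that considers both fs trees
     * and reloc trees
     */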
1946static noinline_for_stack
1947struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1948                                   struct backref_node *node)
1949{
1950        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1951        int nr;
1952        return __select_one_root(trans, node, edges, &nr, 0);
1953}
1954
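    /*
     * wrapper around __select_one_root that only returns a reloc tree,
     * or NULL if none is found
     */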
1955static noinline_for_stack
1956struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1957                                     struct backref_node *node,
1958                                     struct backref_edge *edges[], int *nr)
1959{
1960        return __select_one_root(trans, node, edges, nr, 1);
1961}
1962
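    /*
     * transfer the extent buffers and locks held in the path to the
     * backref nodes along the chain of edges walked by the caller
     */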
1963static void grab_path_buffers(struct btrfs_path *path,
1964                              struct backref_node *node,
1965                              struct backref_edge *edges[], int nr)
1966{
1967        int i = 0;
1968        while (1) {
1969                drop_node_buffer(node);
1970                node->eb = path->nodes[node->level];
1971                BUG_ON(!node->eb);
1972                if (path->locks[node->level])
1973                        node->locked = 1;
1974                path->nodes[node->level] = NULL;
1975                path->locks[node->level] = 0;
1976
1977                if (i >= nr)
1978                        break;
1979
1980                edges[i]->blockptr = node->eb->start;
1981                node = edges[i]->node[UPPER];
1982                i++;
1983        }
1984}
1985
1986/*
1987 * relocate a tree block, and then update pointers in upper level
1988 * blocks that reference the block to point to the new location.
1989 *
1990 * if called by link_to_upper, the block has already been relocated.
1991 * in that case this function just updates pointers.
1992 */
1993static int do_relocation(struct btrfs_trans_handle *trans,
1994                         struct backref_node *node,
1995                         struct btrfs_key *key,
1996                         struct btrfs_path *path, int lowest)
1997{
1998        struct backref_node *upper;
1999        struct backref_edge *edge;
2000        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2001        struct btrfs_root *root;
2002        struct extent_buffer *eb;
2003        u32 blocksize;
2004        u64 bytenr;
2005        u64 generation;
2006        int nr;
2007        int slot;
2008        int ret;
2009        int err = 0;
2010
2011        BUG_ON(lowest && node->eb);
2012
2013        path->lowest_level = node->level + 1;
2014        list_for_each_entry(edge, &node->upper, list[LOWER]) {
2015                cond_resched();
2016                if (node->eb && node->eb->start == edge->blockptr)
2017                        continue;
2018
2019                upper = edge->node[UPPER];
2020                root = select_reloc_root(trans, upper, edges, &nr);
2021                if (!root)
2022                        continue;
2023
2024                if (upper->eb && !upper->locked)
2025                        drop_node_buffer(upper);
2026
2027                if (!upper->eb) {
2028                        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2029                        if (ret < 0) {
2030                                err = ret;
2031                                break;
2032                        }
2033                        BUG_ON(ret > 0);
2034
2035                        slot = path->slots[upper->level];
2036
2037                        btrfs_unlock_up_safe(path, upper->level + 1);
2038                        grab_path_buffers(path, upper, edges, nr);
2039
2040                        btrfs_release_path(NULL, path);
2041                } else {
2042                        ret = btrfs_bin_search(upper->eb, key, upper->level,
2043                                               &slot);
2044                        BUG_ON(ret);
2045                }
2046
2047                bytenr = btrfs_node_blockptr(upper->eb, slot);
2048                if (!lowest) {
2049                        if (node->eb->start == bytenr) {
2050                                btrfs_tree_unlock(upper->eb);
2051                                upper->locked = 0;
2052                                continue;
2053                        }
2054                } else {
2055                        BUG_ON(node->bytenr != bytenr);
2056                }
2057
2058                blocksize = btrfs_level_size(root, node->level);
2059                generation = btrfs_node_ptr_generation(upper->eb, slot);
2060                eb = read_tree_block(root, bytenr, blocksize, generation);
2061                btrfs_tree_lock(eb);
2062                btrfs_set_lock_blocking(eb);
2063
2064                if (!node->eb) {
2065                        ret = btrfs_cow_block(trans, root, eb, upper->eb,
2066                                              slot, &eb);
2067                        if (ret < 0) {
2068                                err = ret;
2069                                break;
2070                        }
2071                        btrfs_set_lock_blocking(eb);
2072                        node->eb = eb;
2073                        node->locked = 1;
2074                } else {
2075                        btrfs_set_node_blockptr(upper->eb, slot,
2076                                                node->eb->start);
2077                        btrfs_set_node_ptr_generation(upper->eb, slot,
2078                                                      trans->transid);
2079                        btrfs_mark_buffer_dirty(upper->eb);
2080
2081                        ret = btrfs_inc_extent_ref(trans, root,
2082                                                node->eb->start, blocksize,
2083                                                upper->eb->start,
2084                                                btrfs_header_owner(upper->eb),
2085                                                node->level, 0);
2086                        BUG_ON(ret);
2087
2088                        ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2089                        BUG_ON(ret);
2090                }
2091                if (!lowest) {
2092                        btrfs_tree_unlock(upper->eb);
2093                        upper->locked = 0;
2094                }
2095        }
2096        path->lowest_level = 0;
2097        return err;
2098}
2099
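    /*
     * update pointers in upper level blocks to point to the block
     * that has already been relocated
     */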
2100static int link_to_upper(struct btrfs_trans_handle *trans,
2101                         struct backref_node *node,
2102                         struct btrfs_path *path)
2103{
2104        struct btrfs_key key;
2105        if (!node->eb || list_empty(&node->upper))
2106                return 0;
2107
2108        btrfs_node_key_to_cpu(node->eb, &key, 0);
2109        return do_relocation(trans, node, &key, path, 0);
2110}
2111
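    /*
     * link all pending backref nodes to their upper level blocks and
     * remove them from the cache, starting at the lowest level
     */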
2112static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2113                                struct backref_cache *cache,
2114                                struct btrfs_path *path)
2115{
2116        struct backref_node *node;
2117        int level;
2118        int ret;
2119        int err = 0;
2120
2121        for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2122                while (!list_empty(&cache->pending[level])) {
2123                        node = list_entry(cache->pending[level].next,
2124                                          struct backref_node, lower);
2125                        BUG_ON(node->level != level);
2126
2127                        ret = link_to_upper(trans, node, path);
2128                        if (ret < 0)
2129                                err = ret;
2130                        /*
2131                         * this removes the node from the pending list and
2132                         * may add some other nodes to the level + 1
2133                         * pending list
2134                         */
2135                        remove_backref_node(cache, node);
2136                }
2137        }
2138        BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2139        return err;
2140}
2141
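    /*
     * mark a single block as processed.  leaves and blocks inside the
     * block group being relocated are also recorded in
     * rc->processed_blocks.
     */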
2142static void mark_block_processed(struct reloc_control *rc,
2143                                 struct backref_node *node)
2144{
2145        u32 blocksize;
2146        if (node->level == 0 ||
2147            in_block_group(node->bytenr, rc->block_group)) {
2148                blocksize = btrfs_level_size(rc->extent_root, node->level);
2149                set_extent_bits(&rc->processed_blocks, node->bytenr,
2150                                node->bytenr + blocksize - 1, EXTENT_DIRTY,
2151                                GFP_NOFS);
2152        }
2153        node->processed = 1;
2154}
2155
2156/*
2157 * mark a block and all blocks that directly/indirectly reference it
2158 * as processed.
2159 */
2160static void update_processed_blocks(struct reloc_control *rc,
2161                                    struct backref_node *node)
2162{
2163        struct backref_node *next = node;
2164        struct backref_edge *edge;
2165        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2166        int index = 0;
2167
2168        while (next) {
2169                cond_resched();
2170                while (1) {
2171                        if (next->processed)
2172                                break;
2173
2174                        mark_block_processed(rc, next);
2175
2176                        if (list_empty(&next->upper))
2177                                break;
2178
2179                        edge = list_entry(next->upper.next,
2180                                          struct backref_edge, list[LOWER]);
2181                        edges[index++] = edge;
2182                        next = edge->node[UPPER];
2183                }
2184                next = walk_down_backref(edges, &index);
2185        }
2186}
2187
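    /*
     * check if a tree block has already been marked as processed
     */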
2188static int tree_block_processed(u64 bytenr, u32 blocksize,
2189                                struct reloc_control *rc)
2190{
2191        if (test_range_bit(&rc->processed_blocks, bytenr,
2192                           bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2193                return 1;
2194        return 0;
2195}
2196
2197/*
2198 * check if any file extent pointers in the leaf point to data that
2199 * requires processing
2200 */
2201static int check_file_extents(struct reloc_control *rc,
2202                              u64 bytenr, u32 blocksize, u64 ptr_gen)
2203{
2204        struct btrfs_key found_key;
2205        struct btrfs_file_extent_item *fi;
2206        struct extent_buffer *leaf;
2207        u32 nritems;
2208        int i;
2209        int ret = 0;
2210
2211        leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2212
2213        nritems = btrfs_header_nritems(leaf);
2214        for (i = 0; i < nritems; i++) {
2215                cond_resched();
2216                btrfs_item_key_to_cpu(leaf, &found_key, i);
2217                if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2218                        continue;
2219                fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2220                if (btrfs_file_extent_type(leaf, fi) ==
2221                    BTRFS_FILE_EXTENT_INLINE)
2222                        continue;
2223                bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2224                if (bytenr == 0)
2225                        continue;
2226                if (in_block_group(bytenr, rc->block_group)) {
2227                        ret = 1;
2228                        break;
2229                }
2230        }
2231        free_extent_buffer(leaf);
2232        return ret;
2233}
2234
2235/*
2236 * scan child blocks of a given block to find blocks that require processing
2237 */
2238static int add_child_blocks(struct btrfs_trans_handle *trans,
2239                            struct reloc_control *rc,
2240                            struct backref_node *node,
2241                            struct rb_root *blocks)
2242{
2243        struct tree_block *block;
2244        struct rb_node *rb_node;
2245        u64 bytenr;
2246        u64 ptr_gen;
2247        u32 blocksize;
2248        u32 nritems;
2249        int i;
2250        int err = 0;
2251
2252        nritems = btrfs_header_nritems(node->eb);
2253        blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2254        for (i = 0; i < nritems; i++) {
2255                cond_resched();
2256                bytenr = btrfs_node_blockptr(node->eb, i);
2257                ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2258                if (ptr_gen == trans->transid)
2259                        continue;
2260                if (!in_block_group(bytenr, rc->block_group) &&
2261                    (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2262                        continue;
2263                if (tree_block_processed(bytenr, blocksize, rc))
2264                        continue;
2265
2266                readahead_tree_block(rc->extent_root,
2267                                     bytenr, blocksize, ptr_gen);
2268        }
2269
2270        for (i = 0; i < nritems; i++) {
2271                cond_resched();
2272                bytenr = btrfs_node_blockptr(node->eb, i);
2273                ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2274                if (ptr_gen == trans->transid)
2275                        continue;
2276                if (!in_block_group(bytenr, rc->block_group) &&
2277                    (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2278                        continue;
2279                if (tree_block_processed(bytenr, blocksize, rc))
2280                        continue;
2281                if (!in_block_group(bytenr, rc->block_group) &&
2282                    !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2283                        continue;
2284
2285                block = kmalloc(sizeof(*block), GFP_NOFS);
2286                if (!block) {
2287                        err = -ENOMEM;
2288                        break;
2289                }
2290                block->bytenr = bytenr;
2291                btrfs_node_key_to_cpu(node->eb, &block->key, i);
2292                block->level = node->level - 1;
2293                block->key_ready = 1;
2294                rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2295                BUG_ON(rb_node);
2296        }
2297        if (err)
2298                free_block_list(blocks);
2299        return err;
2300}
2301
2302/*
2303 * find adjacent blocks that require processing
2304 */
2305static noinline_for_stack
2306int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2307                        struct reloc_control *rc,
2308                        struct backref_cache *cache,
2309                        struct rb_root *blocks, int level,
2310                        struct backref_node **upper)
2311{
2312        struct backref_node *node;
2313        int ret = 0;
2314
2315        WARN_ON(!list_empty(&cache->pending[level]));
2316
2317        if (list_empty(&cache->pending[level + 1]))
2318                return 1;
2319
2320        node = list_entry(cache->pending[level + 1].next,
2321                          struct backref_node, lower);
2322        if (node->eb)
2323                ret = add_child_blocks(trans, rc, node, blocks);
2324
2325        *upper = node;
2326        return ret;
2327}
2328
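    /*
     * read a tree block to get its first key.  until the key is ready,
     * block->key.objectid holds the blocksize and block->key.offset
     * holds the generation of the block.
     */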
2329static int get_tree_block_key(struct reloc_control *rc,
2330                              struct tree_block *block)
2331{
2332        struct extent_buffer *eb;
2333
2334        BUG_ON(block->key_ready);
2335        eb = read_tree_block(rc->extent_root, block->bytenr,
2336                             block->key.objectid, block->key.offset);
2337        WARN_ON(btrfs_header_level(eb) != block->level);
2338        if (block->level == 0)
2339                btrfs_item_key_to_cpu(eb, &block->key, 0);
2340        else
2341                btrfs_node_key_to_cpu(eb, &block->key, 0);
2342        free_extent_buffer(eb);
2343        block->key_ready = 1;
2344        return 0;
2345}
2346
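    /*
     * issue readahead for a tree block whose key has not been read in
     * yet (see get_tree_block_key above)
     */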
2347static int reada_tree_block(struct reloc_control *rc,
2348                            struct tree_block *block)
2349{
2350        BUG_ON(block->key_ready);
2351        readahead_tree_block(rc->extent_root, block->bytenr,
2352                             block->key.objectid, block->key.offset);
2353        return 0;
2354}
2355
2356/*
2357 * helper function to relocate a tree block
2358 */
2359static int relocate_tree_block(struct btrfs_trans_handle *trans,
2360                                struct reloc_control *rc,
2361                                struct backref_node *node,
2362                                struct btrfs_key *key,
2363                                struct btrfs_path *path)
2364{
2365        struct btrfs_root *root;
2366        int ret;
2367
2368        root = select_one_root(trans, node);
2369        if (unlikely(!root)) {
2370                rc->found_old_snapshot = 1;
2371                update_processed_blocks(rc, node);
2372                return 0;
2373        }
2374
2375        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2376                ret = do_relocation(trans, node, key, path, 1);
2377                if (ret < 0)
2378                        goto out;
2379                if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2380                        ret = replace_file_extents(trans, rc, root,
2381                                                   node->eb, NULL);
2382                        if (ret < 0)
2383                                goto out;
2384                }
2385                drop_node_buffer(node);
2386        } else if (!root->ref_cows) {
2387                path->lowest_level = node->level;
2388                ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2389                btrfs_release_path(root, path);
2390                if (ret < 0)
2391                        goto out;
2392        } else if (root != node->root) {
2393                WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2394        }
2395
2396        update_processed_blocks(rc, node);
2397        ret = 0;
2398out:
2399        drop_node_buffer(node);
2400        return ret;
2401}
2402
2403/*
2404 * relocate a list of blocks
2405 */
2406static noinline_for_stack
2407int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2408                         struct reloc_control *rc, struct rb_root *blocks)
2409{
2410        struct backref_cache *cache;
2411        struct backref_node *node;
2412        struct btrfs_path *path;
2413        struct tree_block *block;
2414        struct rb_node *rb_node;
2415        int level = -1;
2416        int ret;
2417        int err = 0;
2418
2419        path = btrfs_alloc_path();
2420        if (!path)
2421                return -ENOMEM;
2422
2423        cache = kmalloc(sizeof(*cache), GFP_NOFS);
2424        if (!cache) {
2425                btrfs_free_path(path);
2426                return -ENOMEM;
2427        }
2428
2429        backref_cache_init(cache);
2430
2431        rb_node = rb_first(blocks);
2432        while (rb_node) {
2433                block = rb_entry(rb_node, struct tree_block, rb_node);
2434                if (level == -1)
2435                        level = block->level;
2436                else
2437                        BUG_ON(level != block->level);
2438                if (!block->key_ready)
2439                        reada_tree_block(rc, block);
2440                rb_node = rb_next(rb_node);
2441        }
2442
2443        rb_node = rb_first(blocks);
2444        while (rb_node) {
2445                block = rb_entry(rb_node, struct tree_block, rb_node);
2446                if (!block->key_ready)
2447                        get_tree_block_key(rc, block);
2448                rb_node = rb_next(rb_node);
2449        }
2450
2451        rb_node = rb_first(blocks);
2452        while (rb_node) {
2453                block = rb_entry(rb_node, struct tree_block, rb_node);
2454
2455                node = build_backref_tree(rc, cache, &block->key,
2456                                          block->level, block->bytenr);
2457                if (IS_ERR(node)) {
2458                        err = PTR_ERR(node);
2459                        goto out;
2460                }
2461
2462                ret = relocate_tree_block(trans, rc, node, &block->key,
2463                                          path);
2464                if (ret < 0) {
2465                        err = ret;
2466                        goto out;
2467                }
2468                remove_backref_node(cache, node);
2469                rb_node = rb_next(rb_node);
2470        }
2471
2472        if (level > 0)
2473                goto out;
2474
2475        free_block_list(blocks);
2476
2477        /*
2478         * now that backrefs of some upper level tree blocks have been
2479         * cached, try relocating the blocks referenced by them.
2480         */
2481        while (1) {
2482                struct backref_node *upper = NULL;
2483                if (trans->transaction->in_commit ||
2484                    trans->transaction->delayed_refs.flushing)
2485                        break;
2486
2487                ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
2488                                          &upper);
2489                if (ret < 0)
2490                        err = ret;
2491                if (ret != 0)
2492                        break;
2493
2494                rb_node = rb_first(blocks);
2495                while (rb_node) {
2496                        block = rb_entry(rb_node, struct tree_block, rb_node);
2497                        if (trans->transaction->in_commit ||
2498                            trans->transaction->delayed_refs.flushing)
2499                                goto out;
2500                        BUG_ON(!block->key_ready);
2501                        node = build_backref_tree(rc, cache, &block->key,
2502                                                  level, block->bytenr);
2503                        if (IS_ERR(node)) {
2504                                err = PTR_ERR(node);
2505                                goto out;
2506                        }
2507
2508                        ret = relocate_tree_block(trans, rc, node,
2509                                                  &block->key, path);
2510                        if (ret < 0) {
2511                                err = ret;
2512                                goto out;
2513                        }
2514                        remove_backref_node(cache, node);
2515                        rb_node = rb_next(rb_node);
2516                }
2517                free_block_list(blocks);
2518
2519                if (upper) {
2520                        ret = link_to_upper(trans, upper, path);
2521                        if (ret < 0) {
2522                                err = ret;
2523                                break;
2524                        }
2525                        remove_backref_node(cache, upper);
2526                }
2527        }
2528out:
2529        free_block_list(blocks);
2530
2531        ret = finish_pending_nodes(trans, cache, path);
2532        if (ret < 0)
2533                err = ret;
2534
2535        kfree(cache);
2536        btrfs_free_path(path);
2537        return err;
2538}
2539
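    /*
     * set up a pinned extent mapping for the file range [start, end]
     * that points at @block_start, dropping any cached extents that
     * overlap the range
     */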
2540static noinline_for_stack
2541int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
2542                         u64 block_start)
2543{
2544        struct btrfs_root *root = BTRFS_I(inode)->root;
2545        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2546        struct extent_map *em;
2547        int ret = 0;
2548
2549        em = alloc_extent_map(GFP_NOFS);
2550        if (!em)
2551                return -ENOMEM;
2552
2553        em->start = start;
2554        em->len = end + 1 - start;
2555        em->block_len = em->len;
2556        em->block_start = block_start;
2557        em->bdev = root->fs_info->fs_devices->latest_bdev;
2558        set_bit(EXTENT_FLAG_PINNED, &em->flags);
2559
2560        lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2561        while (1) {
2562                write_lock(&em_tree->lock);
2563                ret = add_extent_mapping(em_tree, em);
2564                write_unlock(&em_tree->lock);
2565                if (ret != -EEXIST) {
2566                        free_extent_map(em);
2567                        break;
2568                }
2569                btrfs_drop_extent_cache(inode, start, end, 0);
2570        }
2571        unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2572        return ret;
2573}
2574
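    /*
     * read in the pages that cover a cluster of file extents in the
     * data relocation inode and mark them dirty/delalloc so the data
     * is written out again by the normal writeback path
     */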
2575static int relocate_file_extent_cluster(struct inode *inode,
2576                                        struct file_extent_cluster *cluster)
2577{
2578        u64 page_start;
2579        u64 page_end;
2580        u64 offset = BTRFS_I(inode)->index_cnt;
2581        unsigned long index;
2582        unsigned long last_index;
2583        unsigned int dirty_page = 0;
2584        struct page *page;
2585        struct file_ra_state *ra;
2586        int nr = 0;
2587        int ret = 0;
2588
2589        if (!cluster->nr)
2590                return 0;
2591
2592        ra = kzalloc(sizeof(*ra), GFP_NOFS);
2593        if (!ra)
2594                return -ENOMEM;
2595
2596        index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2597        last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2598
2599        mutex_lock(&inode->i_mutex);
2600
2601        i_size_write(inode, cluster->end + 1 - offset);
2602        ret = setup_extent_mapping(inode, cluster->start - offset,
2603                                   cluster->end - offset, cluster->start);
2604        if (ret)
2605                goto out_unlock;
2606
2607        file_ra_state_init(ra, inode->i_mapping);
2608
2609        WARN_ON(cluster->start != cluster->boundary[0]);
2610        while (index <= last_index) {
2611                page = find_lock_page(inode->i_mapping, index);
2612                if (!page) {
2613                        page_cache_sync_readahead(inode->i_mapping,
2614                                                  ra, NULL, index,
2615                                                  last_index + 1 - index);
2616                        page = grab_cache_page(inode->i_mapping, index);
2617                        if (!page) {
2618                                ret = -ENOMEM;
2619                                goto out_unlock;
2620                        }
2621                }
2622
2623                if (PageReadahead(page)) {
2624                        page_cache_async_readahead(inode->i_mapping,
2625                                                   ra, NULL, page, index,
2626                                                   last_index + 1 - index);
2627                }
2628
2629                if (!PageUptodate(page)) {
2630                        btrfs_readpage(NULL, page);
2631                        lock_page(page);
2632                        if (!PageUptodate(page)) {
2633                                unlock_page(page);
2634                                page_cache_release(page);
2635                                ret = -EIO;
2636                                goto out_unlock;
2637                        }
2638                }
2639
2640                page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2641                page_end = page_start + PAGE_CACHE_SIZE - 1;
2642
2643                lock_extent(&BTRFS_I(inode)->io_tree,
2644                            page_start, page_end, GFP_NOFS);
2645
2646                set_page_extent_mapped(page);
2647
2648                if (nr < cluster->nr &&
2649                    page_start + offset == cluster->boundary[nr]) {
2650                        set_extent_bits(&BTRFS_I(inode)->io_tree,
2651                                        page_start, page_end,
2652                                        EXTENT_BOUNDARY, GFP_NOFS);
2653                        nr++;
2654                }
2655                btrfs_set_extent_delalloc(inode, page_start, page_end);
2656
2657                set_page_dirty(page);
2658                dirty_page++;
2659
2660                unlock_extent(&BTRFS_I(inode)->io_tree,
2661                              page_start, page_end, GFP_NOFS);
2662                unlock_page(page);
2663                page_cache_release(page);
2664
2665                index++;
2666                if (nr < cluster->nr &&
2667                    page_end + 1 + offset == cluster->boundary[nr]) {
2668                        balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2669                                                           dirty_page);
2670                        dirty_page = 0;
2671                }
2672        }
2673        if (dirty_page) {
2674                balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2675                                                   dirty_page);
2676        }
2677        WARN_ON(nr != cluster->nr);
2678out_unlock:
2679        mutex_unlock(&inode->i_mutex);
2680        kfree(ra);
2681        return ret;
2682}
2683
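    /*
     * add a data extent to the current file extent cluster.  the
     * cluster is relocated when a non-adjacent extent is seen or when
     * it grows to MAX_EXTENTS entries.
     */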
2684static noinline_for_stack
2685int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
2686                         struct file_extent_cluster *cluster)
2687{
2688        int ret;
2689
2690        if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
2691                ret = relocate_file_extent_cluster(inode, cluster);
2692                if (ret)
2693                        return ret;
2694                cluster->nr = 0;
2695        }
2696
2697        if (!cluster->nr)
2698                cluster->start = extent_key->objectid;
2699        else
2700                BUG_ON(cluster->nr >= MAX_EXTENTS);
2701        cluster->end = extent_key->objectid + extent_key->offset - 1;
2702        cluster->boundary[cluster->nr] = extent_key->objectid;
2703        cluster->nr++;
2704
2705        if (cluster->nr >= MAX_EXTENTS) {
2706                ret = relocate_file_extent_cluster(inode, cluster);
2707                if (ret)
2708                        return ret;
2709                cluster->nr = 0;
2710        }
2711        return 0;
2712}
2713
2714#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
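    /*
     * helper to look up the owner objectid recorded in a v0 extent
     * backref for the given extent
     */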
2715static int get_ref_objectid_v0(struct reloc_control *rc,
2716                               struct btrfs_path *path,
2717                               struct btrfs_key *extent_key,
2718                               u64 *ref_objectid, int *path_change)
2719{
2720        struct btrfs_key key;
2721        struct extent_buffer *leaf;
2722        struct btrfs_extent_ref_v0 *ref0;
2723        int ret;
2724        int slot;
2725
2726        leaf = path->nodes[0];
2727        slot = path->slots[0];
2728        while (1) {
2729                if (slot >= btrfs_header_nritems(leaf)) {
2730                        ret = btrfs_next_leaf(rc->extent_root, path);
2731                        if (ret < 0)
2732                                return ret;
2733                        BUG_ON(ret > 0);
2734                        leaf = path->nodes[0];
2735                        slot = path->slots[0];
2736                        if (path_change)
2737                                *path_change = 1;
2738                }
2739                btrfs_item_key_to_cpu(leaf, &key, slot);
2740                if (key.objectid != extent_key->objectid)
2741                        return -ENOENT;
2742
2743                if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
2744                        slot++;
2745                        continue;
2746                }
2747                ref0 = btrfs_item_ptr(leaf, slot,
2748                                struct btrfs_extent_ref_v0);
2749                *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
2750                break;
2751        }
2752        return 0;
2753}
2754#endif
2755
2756/*
2757 * helper to add a tree block to the list.
2758 * the major work is getting the generation and level of the block
2759 */
2760static int add_tree_block(struct reloc_control *rc,
2761                          struct btrfs_key *extent_key,
2762                          struct btrfs_path *path,
2763                          struct rb_root *blocks)
2764{
2765        struct extent_buffer *eb;
2766        struct btrfs_extent_item *ei;
2767        struct btrfs_tree_block_info *bi;
2768        struct tree_block *block;
2769        struct rb_node *rb_node;
2770        u32 item_size;
2771        int level = -1;
2772        int generation;
2773
2774        eb = path->nodes[0];
2775        item_size = btrfs_item_size_nr(eb, path->slots[0]);
2776
2777        if (item_size >= sizeof(*ei) + sizeof(*bi)) {
2778                ei = btrfs_item_ptr(eb, path->slots[0],
2779                                struct btrfs_extent_item);
2780                bi = (struct btrfs_tree_block_info *)(ei + 1);
2781                generation = btrfs_extent_generation(eb, ei);
2782                level = btrfs_tree_block_level(eb, bi);
2783        } else {
2784#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2785                u64 ref_owner;
2786                int ret;
2787
2788                BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2789                ret = get_ref_objectid_v0(rc, path, extent_key,
2790                                          &ref_owner, NULL);
2791                BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
2792                level = (int)ref_owner;
2793                /* FIXME: get real generation */
2794                generation = 0;
2795#else
2796                BUG();
2797#endif
2798        }
2799
2800        btrfs_release_path(rc->extent_root, path);
2801
2802        BUG_ON(level == -1);
2803
2804        block = kmalloc(sizeof(*block), GFP_NOFS);
2805        if (!block)
2806                return -ENOMEM;
2807
2808        block->bytenr = extent_key->objectid;
2809        block->key.objectid = extent_key->offset;
2810        block->key.offset = generation;
2811        block->level = level;
2812        block->key_ready = 0;
2813
2814        rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2815        BUG_ON(rb_node);
2816
2817        return 0;
2818}
2819
2820/*
2821 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
2822 */
2823static int __add_tree_block(struct reloc_control *rc,
2824                            u64 bytenr, u32 blocksize,
2825                            struct rb_root *blocks)
2826{
2827        struct btrfs_path *path;
2828        struct btrfs_key key;
2829        int ret;
2830
2831        if (tree_block_processed(bytenr, blocksize, rc))
2832                return 0;
2833
2834        if (tree_search(blocks, bytenr))
2835                return 0;
2836
2837        path = btrfs_alloc_path();
2838        if (!path)
2839                return -ENOMEM;
2840
2841        key.objectid = bytenr;
2842        key.type = BTRFS_EXTENT_ITEM_KEY;
2843        key.offset = blocksize;
2844
2845        path->search_commit_root = 1;
2846        path->skip_locking = 1;
2847        ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
2848        if (ret < 0)
2849                goto out;
2850        BUG_ON(ret);
2851
2852        btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2853        ret = add_tree_block(rc, &key, path, blocks);
2854out:
2855        btrfs_free_path(path);
2856        return ret;
2857}
2858
2859/*
2860 * helper to check if the block uses full backrefs for the pointers in it
2861 */
2862static int block_use_full_backref(struct reloc_control *rc,
2863                                  struct extent_buffer *eb)
2864{
2865        struct btrfs_path *path;
2866        struct btrfs_extent_item *ei;
2867        struct btrfs_key key;
2868        u64 flags;
2869        int ret;
2870
2871        if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
2872            btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2873                return 1;
2874
2875        path = btrfs_alloc_path();
2876        BUG_ON(!path);
2877
2878        key.objectid = eb->start;
2879        key.type = BTRFS_EXTENT_ITEM_KEY;
2880        key.offset = eb->len;
2881
2882        path->search_commit_root = 1;
2883        path->skip_locking = 1;
2884        ret = btrfs_search_slot(NULL, rc->extent_root,
2885                                &key, path, 0, 0);
2886        BUG_ON(ret);
2887
2888        ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2889                            struct btrfs_extent_item);
2890        flags = btrfs_extent_flags(path->nodes[0], ei);
2891        BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2892        if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2893                ret = 1;
2894        else
2895                ret = 0;
2896        btrfs_free_path(path);
2897        return ret;
2898}
2899
2900/*
2901 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
2902 * this function scans the fs tree to find blocks that reference the data extent
2903 */
2904static int find_data_references(struct reloc_control *rc,
2905                                struct btrfs_key *extent_key,
2906                                struct extent_buffer *leaf,
2907                                struct btrfs_extent_data_ref *ref,
2908                                struct rb_root *blocks)
2909{
2910        struct btrfs_path *path;
2911        struct tree_block *block;
2912        struct btrfs_root *root;
2913        struct btrfs_file_extent_item *fi;
2914        struct rb_node *rb_node;
2915        struct btrfs_key key;
2916        u64 ref_root;
2917        u64 ref_objectid;
2918        u64 ref_offset;
2919        u32 ref_count;
2920        u32 nritems;
2921        int err = 0;
2922        int added = 0;
2923        int counted;
2924        int ret;
2925
2926        path = btrfs_alloc_path();
2927        if (!path)
2928                return -ENOMEM;
2929
2930        ref_root = btrfs_extent_data_ref_root(leaf, ref);
2931        ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
2932        ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
2933        ref_count = btrfs_extent_data_ref_count(leaf, ref);
2934
2935        root = read_fs_root(rc->extent_root->fs_info, ref_root);
2936        if (IS_ERR(root)) {
2937                err = PTR_ERR(root);
2938                goto out;
2939        }
2940
2941        key.objectid = ref_objectid;
2942        key.offset = ref_offset;
2943        key.type = BTRFS_EXTENT_DATA_KEY;
2944
2945        path->search_commit_root = 1;
2946        path->skip_locking = 1;
2947        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2948        if (ret < 0) {
2949                err = ret;
2950                goto out;
2951        }
2952
2953        leaf = path->nodes[0];
2954        nritems = btrfs_header_nritems(leaf);
2955        /*
2956         * references from tree blocks that use full backrefs are
2957         * not included in ref_count
2958         */
2959        if (block_use_full_backref(rc, leaf))
2960                counted = 0;
2961        else
2962                counted = 1;
2963        rb_node = tree_search(blocks, leaf->start);
2964        if (rb_node) {
2965                if (counted)
2966                        added = 1;
2967                else
2968                        path->slots[0] = nritems;
2969        }
2970
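            /*
             * walk the file extent items until all ref_count references
             * have been seen.  'counted' is clear for leaves that use
             * full backrefs because their references are not part of
             * ref_count; 'added' means the current leaf is already
             * queued in 'blocks'.
             */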
2971        while (ref_count > 0) {
2972                while (path->slots[0] >= nritems) {
2973                        ret = btrfs_next_leaf(root, path);
2974                        if (ret < 0) {
2975                                err = ret;
2976                                goto out;
2977                        }
2978                        if (ret > 0) {
2979                                WARN_ON(1);
2980                                goto out;
2981                        }
2982
2983                        leaf = path->nodes[0];
2984                        nritems = btrfs_header_nritems(leaf);
2985                        added = 0;
2986
2987                        if (block_use_full_backref(rc, leaf))
2988                                counted = 0;
2989                        else
2990                                counted = 1;
2991                        rb_node = tree_search(blocks, leaf->start);
2992                        if (rb_node) {
2993                                if (counted)
2994                                        added = 1;
2995                                else
2996                                        path->slots[0] = nritems;
2997                        }
2998                }
2999
3000                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3001                if (key.objectid != ref_objectid ||
3002                    key.type != BTRFS_EXTENT_DATA_KEY) {
3003                        WARN_ON(1);
3004                        break;
3005                }
3006
3007                fi = btrfs_item_ptr(leaf, path->slots[0],
3008                                    struct btrfs_file_extent_item);
3009
3010                if (btrfs_file_extent_type(leaf, fi) ==
3011                    BTRFS_FILE_EXTENT_INLINE)
3012                        goto next;
3013
3014                if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
3015                    extent_key->objectid)
3016                        goto next;
3017
3018                key.offset -= btrfs_file_extent_offset(leaf, fi);
3019                if (key.offset != ref_offset)
3020                        goto next;
3021
3022                if (counted)
3023                        ref_count--;
3024                if (added)
3025                        goto next;
3026
3027                if (!tree_block_processed(leaf->start, leaf->len, rc)) {
3028                        block = kmalloc(sizeof(*block), GFP_NOFS);
3029                        if (!block) {
3030                                err = -ENOMEM;
3031                                break;
3032                        }
3033                        block->bytenr = leaf->start;
3034                        btrfs_item_key_to_cpu(leaf, &block->key, 0);
3035                        block->level = 0;
3036                        block->key_ready = 1;
3037                        rb_node = tree_insert(blocks, block->bytenr,
3038                                              &block->rb_node);
3039                        BUG_ON(rb_node);
3040                }
3041                if (counted)
3042                        added = 1;
3043                else
3044                        path->slots[0] = nritems;
3045next:
3046                path->slots[0]++;
3048        }
3049out:
3050        btrfs_free_path(path);
3051        return err;
3052}
3053
3054/*
3055 * helper to find all tree blocks that reference a given data extent
3056 */
3057static noinline_for_stack
3058int add_data_references(struct reloc_control *rc,
3059                        struct btrfs_key *extent_key,
3060                        struct btrfs_path *path,
3061                        struct rb_root *blocks)
3062{
3063        struct btrfs_key key;
3064        struct extent_buffer *eb;
3065        struct btrfs_extent_data_ref *dref;
3066        struct btrfs_extent_inline_ref *iref;
3067        unsigned long ptr;
3068        unsigned long end;
3069        u32 blocksize;
3070        int ret;
3071        int err = 0;
3072
3073        ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3074                               extent_key->offset);
3075        BUG_ON(ret < 0);
3076        if (ret > 0) {
3077                /* the relocated data is fragmented */
3078                rc->extents_skipped++;
3079                btrfs_release_path(rc->extent_root, path);
3080                return 0;
3081        }
3082
3083        blocksize = btrfs_level_size(rc->extent_root, 0);
3084
3085        eb = path->nodes[0];
3086        ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3087        end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
3088#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3089        if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
3090                ptr = end;
3091        else
3092#endif
3093                ptr += sizeof(struct btrfs_extent_item);
3094
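            /* walk the inline backrefs stored right after the extent item */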
3095        while (ptr < end) {
3096                iref = (struct btrfs_extent_inline_ref *)ptr;
3097                key.type = btrfs_extent_inline_ref_type(eb, iref);
3098                if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3099                        key.offset = btrfs_extent_inline_ref_offset(eb, iref);
3100                        ret = __add_tree_block(rc, key.offset, blocksize,
3101                                               blocks);
3102                } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3103                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
3104                        ret = find_data_references(rc, extent_key,
3105                                                   eb, dref, blocks);
3106                } else {
3107                        BUG();
3108                }
3109                ptr += btrfs_extent_inline_ref_size(key.type);
3110        }
3111        WARN_ON(ptr > end);
3112
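            /* then process any keyed backref items that follow the extent item */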
3113        while (1) {
3114                cond_resched();
3115                eb = path->nodes[0];
3116                if (path->slots[0] >= btrfs_header_nritems(eb)) {
3117                        ret = btrfs_next_leaf(rc->extent_root, path);
3118                        if (ret < 0) {
3119                                err = ret;
3120                                break;
3121                        }
3122                        if (ret > 0)
3123                                break;
3124                        eb = path->nodes[0];
3125                }
3126
3127                btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
3128                if (key.objectid != extent_key->objectid)
3129                        break;
3130
3131#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3132                if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
3133                    key.type == BTRFS_EXTENT_REF_V0_KEY) {
3134#else
3135                BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
3136                if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
3137#endif
3138                        ret = __add_tree_block(rc, key.offset, blocksize,
3139                                               blocks);
3140                } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
3141                        dref = btrfs_item_ptr(eb, path->slots[0],
3142                                              struct btrfs_extent_data_ref);
3143                        ret = find_data_references(rc, extent_key,
3144                                                   eb, dref, blocks);
3145                } else {
3146                        ret = 0;
3147                }
3148                if (ret) {
3149                        err = ret;
3150                        break;
3151                }
3152                path->slots[0]++;
3153        }
3154        btrfs_release_path(rc->extent_root, path);
3155        if (err)
3156                free_block_list(blocks);
3157        return err;
3158}
3159
3160/*
3161 * helper to find the next unprocessed extent
3162 */
3163static noinline_for_stack
3164int find_next_extent(struct btrfs_trans_handle *trans,
3165                     struct reloc_control *rc, struct btrfs_path *path)
3166{
3167        struct btrfs_key key;
3168        struct extent_buffer *leaf;
3169        u64 start, end, last;
3170        int ret;
3171
3172        last = rc->block_group->key.objectid + rc->block_group->key.offset;
3173        while (1) {
3174                cond_resched();
3175                if (rc->search_start >= last) {
3176                        ret = 1;
3177                        break;
3178                }
3179
3180                key.objectid = rc->search_start;
3181                key.type = BTRFS_EXTENT_ITEM_KEY;
3182                key.offset = 0;
3183
3184                path->search_commit_root = 1;
3185                path->skip_locking = 1;
3186                ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3187                                        0, 0);
3188                if (ret < 0)
3189                        break;
3190next:
3191                leaf = path->nodes[0];
3192                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3193                        ret = btrfs_next_leaf(rc->extent_root, path);
3194                        if (ret != 0)
3195                                break;
3196                        leaf = path->nodes[0];
3197                }
3198
3199                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3200                if (key.objectid >= last) {
3201                        ret = 1;
3202                        break;
3203                }
3204
3205                if (key.type != BTRFS_EXTENT_ITEM_KEY ||
3206                    key.objectid + key.offset <= rc->search_start) {
3207                        path->slots[0]++;
3208                        goto next;
3209                }
3210
3211                ret = find_first_extent_bit(&rc->processed_blocks,
3212                                            key.objectid, &start, &end,
3213                                            EXTENT_DIRTY);
3214
3215                if (ret == 0 && start <= key.objectid) {
3216                        btrfs_release_path(rc->extent_root, path);
3217                        rc->search_start = end + 1;
3218                } else {
3219                        rc->search_start = key.objectid + key.offset;
3220                        return 0;
3221                }
3222        }
3223        btrfs_release_path(rc->extent_root, path);
3224        return ret;
3225}
3226
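    /*
     * publish/clear fs_info->reloc_ctl under trans_mutex so running
     * transactions always see a consistent value
     */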
3227static void set_reloc_control(struct reloc_control *rc)
3228{
3229        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3230        mutex_lock(&fs_info->trans_mutex);
3231        fs_info->reloc_ctl = rc;
3232        mutex_unlock(&fs_info->trans_mutex);
3233}
3234
3235static void unset_reloc_control(struct reloc_control *rc)
3236{
3237        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3238        mutex_lock(&fs_info->trans_mutex);
3239        fs_info->reloc_ctl = NULL;
3240        mutex_unlock(&fs_info->trans_mutex);
3241}
3242
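    /*
     * sanity check extent flags: returns 1 for combinations that should
     * never appear (DATA and TREE_BLOCK set together, neither set, or
     * DATA combined with FULL_BACKREF)
     */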
3243static int check_extent_flags(u64 flags)
3244{
3245        if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3246            (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3247                return 1;
3248        if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3249            !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3250                return 1;
3251        if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3252            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3253                return 1;
3254        return 0;
3255}
3256
3257
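    /*
     * main relocation loop: walk the extent items inside the block group,
     * collect the tree blocks and data references for each extent and
     * relocate them a batch at a time
     */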
3258static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3259{
3260        struct rb_root blocks = RB_ROOT;
3261        struct btrfs_key key;
3262        struct file_extent_cluster *cluster;
3263        struct btrfs_trans_handle *trans = NULL;
3264        struct btrfs_path *path;
3265        struct btrfs_extent_item *ei;
3266        unsigned long nr;
3267        u64 flags;
3268        u32 item_size;
3269        int ret;
3270        int err = 0;
3271
3272        cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
3273        if (!cluster)
3274                return -ENOMEM;
3275
3276        path = btrfs_alloc_path();
3277        if (!path) {
                    kfree(cluster);
                    return -ENOMEM;
            }
3279
3280        rc->extents_found = 0;
3281        rc->extents_skipped = 0;
3282
3283        rc->search_start = rc->block_group->key.objectid;
3284        clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3285                          GFP_NOFS);
3286
3287        rc->create_reloc_root = 1;
3288        set_reloc_control(rc);
3289
3290        trans = btrfs_start_transaction(rc->extent_root, 1);
3291        btrfs_commit_transaction(trans, rc->extent_root);
3292
3293        while (1) {
3294                trans = btrfs_start_transaction(rc->extent_root, 1);
3295
3296                ret = find_next_extent(trans, rc, path);
3297                if (ret < 0)
3298                        err = ret;
3299                if (ret != 0)
3300                        break;
3301
3302                rc->extents_found++;
3303
3304                ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3305                                    struct btrfs_extent_item);
3306                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3307                item_size = btrfs_item_size_nr(path->nodes[0],
3308                                               path->slots[0]);
3309                if (item_size >= sizeof(*ei)) {
3310                        flags = btrfs_extent_flags(path->nodes[0], ei);
3311                        ret = check_extent_flags(flags);
3312                        BUG_ON(ret);
3313
3314                } else {
3315#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3316                        u64 ref_owner;
3317                        int path_change = 0;
3318
3319                        BUG_ON(item_size !=
3320                               sizeof(struct btrfs_extent_item_v0));
3321                        ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
3322                                                  &path_change);
3323                        if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
3324                                flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
3325                        else
3326                                flags = BTRFS_EXTENT_FLAG_DATA;
3327
3328                        if (path_change) {
3329                                btrfs_release_path(rc->extent_root, path);
3330
3331                                path->search_commit_root = 1;
3332                                path->skip_locking = 1;
3333                                ret = btrfs_search_slot(NULL, rc->extent_root,
3334                                                        &key, path, 0, 0);
3335                                if (ret < 0) {
3336                                        err = ret;
3337                                        break;
3338                                }
3339                                BUG_ON(ret > 0);
3340                        }
3341#else
3342                        BUG();
3343#endif
3344                }
3345
3346                if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3347                        ret = add_tree_block(rc, &key, path, &blocks);
3348                } else if (rc->stage == UPDATE_DATA_PTRS &&
3349                         (flags & BTRFS_EXTENT_FLAG_DATA)) {
3350                        ret = add_data_references(rc, &key, path, &blocks);
3351                } else {
3352                        btrfs_release_path(rc->extent_root, path);
3353                        ret = 0;
3354                }
3355                if (ret < 0) {
3356                        err = ret;
3357                        break;
3358                }
3359
3360                if (!RB_EMPTY_ROOT(&blocks)) {
3361                        ret = relocate_tree_blocks(trans, rc, &blocks);
3362                        if (ret < 0) {
3363                                err = ret;
3364                                break;
3365                        }
3366                }
3367
3368                nr = trans->blocks_used;
3369                btrfs_end_transaction(trans, rc->extent_root);
3370                trans = NULL;
3371                btrfs_btree_balance_dirty(rc->extent_root, nr);
3372
3373                if (rc->stage == MOVE_DATA_EXTENTS &&
3374                    (flags & BTRFS_EXTENT_FLAG_DATA)) {
3375                        rc->found_file_extent = 1;
3376                        ret = relocate_data_extent(rc->data_inode,
3377                                                   &key, cluster);
3378                        if (ret < 0) {
3379                                err = ret;
3380                                break;
3381                        }
3382                }
3383        }
3384        btrfs_free_path(path);
3385
3386        if (trans) {
3387                nr = trans->blocks_used;
3388                btrfs_end_transaction(trans, rc->extent_root);
3389                btrfs_btree_balance_dirty(rc->extent_root, nr);
3390        }
3391
3392        if (!err) {
3393                ret = relocate_file_extent_cluster(rc->data_inode, cluster);
3394                if (ret < 0)
3395                        err = ret;
3396        }
3397
3398        kfree(cluster);
3399
3400        rc->create_reloc_root = 0;
3401        smp_mb();
3402
3403        if (rc->extents_found > 0) {
3404                trans = btrfs_start_transaction(rc->extent_root, 1);
3405                btrfs_commit_transaction(trans, rc->extent_root);
3406        }
3407
3408        merge_reloc_roots(rc);
3409
3410        unset_reloc_control(rc);
3411
3412        /* get rid of pinned extents */
3413        trans = btrfs_start_transaction(rc->extent_root, 1);
3414        btrfs_commit_transaction(trans, rc->extent_root);
3415
3416        return err;
3417}
3418
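    /*
     * insert a bare inode item (regular file, mode 0600, link count 0,
     * nocompress) that create_reloc_inode() turns into the data
     * relocation inode
     */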
3419static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3420                                 struct btrfs_root *root, u64 objectid)
3421{
3422        struct btrfs_path *path;
3423        struct btrfs_inode_item *item;
3424        struct extent_buffer *leaf;
3425        int ret;
3426
3427        path = btrfs_alloc_path();
3428        if (!path)
3429                return -ENOMEM;
3430
3431        ret = btrfs_insert_empty_inode(trans, root, path, objectid);
3432        if (ret)
3433                goto out;
3434
3435        leaf = path->nodes[0];
3436        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3437        memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
3438        btrfs_set_inode_generation(leaf, item, 1);
3439        btrfs_set_inode_size(leaf, item, 0);
3440        btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3441        btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
3442        btrfs_mark_buffer_dirty(leaf);
3443        btrfs_release_path(root, path);
3444out:
3445        btrfs_free_path(path);
3446        return ret;
3447}
3448
3449/*
3450 * helper to create an inode for data relocation.
3451 * the inode lives in the data relocation tree and its link count is 0
3452 */
3453static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3454                                        struct btrfs_block_group_cache *group)
3455{
3456        struct inode *inode = NULL;
3457        struct btrfs_trans_handle *trans;
3458        struct btrfs_root *root;
3459        struct btrfs_key key;
3460        unsigned long nr;
3461        u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
3462        int err = 0;
3463
3464        root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
3465        if (IS_ERR(root))
3466                return ERR_CAST(root);
3467
3468        trans = btrfs_start_transaction(root, 1);
3469        BUG_ON(!trans);
3470
3471        err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3472        if (err)
3473                goto out;
3474
3475        err = __insert_orphan_inode(trans, root, objectid);
3476        BUG_ON(err);
3477
3478        key.objectid = objectid;
3479        key.type = BTRFS_INODE_ITEM_KEY;
3480        key.offset = 0;
3481        inode = btrfs_iget(root->fs_info->sb, &key, root);
3482        BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
3483        BTRFS_I(inode)->index_cnt = group->key.objectid;
3484
3485        err = btrfs_orphan_add(trans, inode);
3486out:
3487        nr = trans->blocks_used;
3488        btrfs_end_transaction(trans, root);
3489
3490        btrfs_btree_balance_dirty(root, nr);
3491        if (err) {
3492                if (inode)
3493                        iput(inode);
3494                inode = ERR_PTR(err);
3495        }
3496        return inode;
3497}
3498
3499/*
3500 * function to relocate all extents in a block group.
3501 */
3502int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3503{
3504        struct btrfs_fs_info *fs_info = extent_root->fs_info;
3505        struct reloc_control *rc;
3506        int ret;
3507        int err = 0;
3508
3509        rc = kzalloc(sizeof(*rc), GFP_NOFS);
3510        if (!rc)
3511                return -ENOMEM;
3512
3513        mapping_tree_init(&rc->reloc_root_tree);
3514        extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3515        INIT_LIST_HEAD(&rc->reloc_roots);
3516
3517        rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3518        BUG_ON(!rc->block_group);
3519
3520        btrfs_init_workers(&rc->workers, "relocate",
3521                           fs_info->thread_pool_size, NULL);
3522
3523        rc->extent_root = extent_root;
3524        btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
3525
3526        rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3527        if (IS_ERR(rc->data_inode)) {
3528                err = PTR_ERR(rc->data_inode);
3529                rc->data_inode = NULL;
3530                goto out;
3531        }
3532
3533        printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
3534               (unsigned long long)rc->block_group->key.objectid,
3535               (unsigned long long)rc->block_group->flags);
3536
3537        btrfs_start_delalloc_inodes(fs_info->tree_root);
3538        btrfs_wait_ordered_extents(fs_info->tree_root, 0);
3539
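            /*
             * relocation alternates between two stages: MOVE_DATA_EXTENTS
             * copies data into the reloc inode, UPDATE_DATA_PTRS then
             * rewrites the references.  if too many extents were skipped,
             * start over with a fresh reloc inode.
             */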
3540        while (1) {
3541                rc->extents_found = 0;
3542                rc->extents_skipped = 0;
3543
3544                mutex_lock(&fs_info->cleaner_mutex);
3545
3546                btrfs_clean_old_snapshots(fs_info->tree_root);
3547                ret = relocate_block_group(rc);
3548
3549                mutex_unlock(&fs_info->cleaner_mutex);
3550                if (ret < 0) {
3551                        err = ret;
3552                        break;
3553                }
3554
3555                if (rc->extents_found == 0)
3556                        break;
3557
3558                printk(KERN_INFO "btrfs: found %llu extents\n",
3559                        (unsigned long long)rc->extents_found);
3560
3561                if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
3562                        btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
3563                        invalidate_mapping_pages(rc->data_inode->i_mapping,
3564                                                 0, -1);
3565                        rc->stage = UPDATE_DATA_PTRS;
3566                } else if (rc->stage == UPDATE_DATA_PTRS &&
3567                           rc->extents_skipped >= rc->extents_found) {
3568                        iput(rc->data_inode);
3569                        rc->data_inode = create_reloc_inode(fs_info,
3570                                                            rc->block_group);
3571                        if (IS_ERR(rc->data_inode)) {
3572                                err = PTR_ERR(rc->data_inode);
3573                                rc->data_inode = NULL;
3574                                break;
3575                        }
3576                        rc->stage = MOVE_DATA_EXTENTS;
3577                        rc->found_file_extent = 0;
3578                }
3579        }
3580
3581        filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
3582                                     rc->block_group->key.objectid,
3583                                     rc->block_group->key.objectid +
3584                                     rc->block_group->key.offset - 1);
3585
3586        WARN_ON(rc->block_group->pinned > 0);
3587        WARN_ON(rc->block_group->reserved > 0);
3588        WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3589out:
3590        iput(rc->data_inode);
3591        btrfs_stop_workers(&rc->workers);
3592        btrfs_put_block_group(rc->block_group);
3593        kfree(rc);
3594        return err;
3595}
3596
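    /*
     * a reloc root whose fs root no longer exists is garbage: reset its
     * drop progress and set its refs to 0 so it can be deleted later
     */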
3597static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3598{
3599        struct btrfs_trans_handle *trans;
3600        int ret;
3601
3602        trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
3603
3604        memset(&root->root_item.drop_progress, 0,
3605                sizeof(root->root_item.drop_progress));
3606        root->root_item.drop_level = 0;
3607        btrfs_set_root_refs(&root->root_item, 0);
3608        ret = btrfs_update_root(trans, root->fs_info->tree_root,
3609                                &root->root_key, &root->root_item);
3610        BUG_ON(ret);
3611
3612        ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
3613        BUG_ON(ret);
3614        return 0;
3615}
3616
3617/*
3618 * recover relocation interrupted by system crash.
3619 *
3620 * this function resumes merging reloc trees with the corresponding fs trees.
3621 * this is important for preserving the sharing of tree blocks
3622 */
3623int btrfs_recover_relocation(struct btrfs_root *root)
3624{
3625        LIST_HEAD(reloc_roots);
3626        struct btrfs_key key;
3627        struct btrfs_root *fs_root;
3628        struct btrfs_root *reloc_root;
3629        struct btrfs_path *path;
3630        struct extent_buffer *leaf;
3631        struct reloc_control *rc = NULL;
3632        struct btrfs_trans_handle *trans;
3633        int ret;
3634        int err = 0;
3635
3636        path = btrfs_alloc_path();
3637        if (!path)
3638                return -ENOMEM;
3639
3640        key.objectid = BTRFS_TREE_RELOC_OBJECTID;
3641        key.type = BTRFS_ROOT_ITEM_KEY;
3642        key.offset = (u64)-1;
3643
3644        while (1) {
3645                ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
3646                                        path, 0, 0);
3647                if (ret < 0) {
3648                        err = ret;
3649                        goto out;
3650                }
3651                if (ret > 0) {
3652                        if (path->slots[0] == 0)
3653                                break;
3654                        path->slots[0]--;
3655                }
3656                leaf = path->nodes[0];
3657                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3658                btrfs_release_path(root->fs_info->tree_root, path);
3659
3660                if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
3661                    key.type != BTRFS_ROOT_ITEM_KEY)
3662                        break;
3663
3664                reloc_root = btrfs_read_fs_root_no_radix(root, &key);
3665                if (IS_ERR(reloc_root)) {
3666                        err = PTR_ERR(reloc_root);
3667                        goto out;
3668                }
3669
3670                list_add(&reloc_root->root_list, &reloc_roots);
3671
3672                if (btrfs_root_refs(&reloc_root->root_item) > 0) {
3673                        fs_root = read_fs_root(root->fs_info,
3674                                               reloc_root->root_key.offset);
3675                        if (IS_ERR(fs_root)) {
3676                                ret = PTR_ERR(fs_root);
3677                                if (ret != -ENOENT) {
3678                                        err = ret;
3679                                        goto out;
3680                                }
3681                                mark_garbage_root(reloc_root);
3682                        }
3683                }
3684
3685                if (key.offset == 0)
3686                        break;
3687
3688                key.offset--;
3689        }
3690        btrfs_release_path(root->fs_info->tree_root, path);
3691
3692        if (list_empty(&reloc_roots))
3693                goto out;
3694
3695        rc = kzalloc(sizeof(*rc), GFP_NOFS);
3696        if (!rc) {
3697                err = -ENOMEM;
3698                goto out;
3699        }
3700
3701        mapping_tree_init(&rc->reloc_root_tree);
3702        INIT_LIST_HEAD(&rc->reloc_roots);
3703        btrfs_init_workers(&rc->workers, "relocate",
3704                           root->fs_info->thread_pool_size, NULL);
3705        rc->extent_root = root->fs_info->extent_root;
3706
3707        set_reloc_control(rc);
3708
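            /*
             * reloc roots with zero refs are left on rc->reloc_roots for
             * merge_reloc_roots() to clean up; the others are re-attached
             * to their fs roots so merging can resume
             */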
3709        while (!list_empty(&reloc_roots)) {
3710                reloc_root = list_entry(reloc_roots.next,
3711                                        struct btrfs_root, root_list);
3712                list_del(&reloc_root->root_list);
3713
3714                if (btrfs_root_refs(&reloc_root->root_item) == 0) {
3715                        list_add_tail(&reloc_root->root_list,
3716                                      &rc->reloc_roots);
3717                        continue;
3718                }
3719
3720                fs_root = read_fs_root(root->fs_info,
3721                                       reloc_root->root_key.offset);
3722                BUG_ON(IS_ERR(fs_root));
3723
3724                __add_reloc_root(reloc_root);
3725                fs_root->reloc_root = reloc_root;
3726        }
3727
3728        trans = btrfs_start_transaction(rc->extent_root, 1);
3729        btrfs_commit_transaction(trans, rc->extent_root);
3730
3731        merge_reloc_roots(rc);
3732
3733        unset_reloc_control(rc);
3734
3735        trans = btrfs_start_transaction(rc->extent_root, 1);
3736        btrfs_commit_transaction(trans, rc->extent_root);
3737out:
3738        if (rc) {
3739                btrfs_stop_workers(&rc->workers);
3740                kfree(rc);
3741        }
3742        while (!list_empty(&reloc_roots)) {
3743                reloc_root = list_entry(reloc_roots.next,
3744                                        struct btrfs_root, root_list);
3745                list_del(&reloc_root->root_list);
3746                free_extent_buffer(reloc_root->node);
3747                free_extent_buffer(reloc_root->commit_root);
3748                kfree(reloc_root);
3749        }
3750        btrfs_free_path(path);
3751
3752        if (err == 0) {
3753                /* cleanup orphan inode in data relocation tree */
3754                fs_root = read_fs_root(root->fs_info,
3755                                       BTRFS_DATA_RELOC_TREE_OBJECTID);
3756                if (IS_ERR(fs_root))
3757                        err = PTR_ERR(fs_root);
3758        }
3759        return err;
3760}
3761
3762/*
3763 * helper to add ordered checksum for data relocation.
3764 *
3765 * cloning the existing checksums properly handles nodatasum extents.
3766 * it also saves the CPU time needed to re-calculate the checksums.
3767 */
3768int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3769{
3770        struct btrfs_ordered_sum *sums;
3771        struct btrfs_sector_sum *sector_sum;
3772        struct btrfs_ordered_extent *ordered;
3773        struct btrfs_root *root = BTRFS_I(inode)->root;
3774        size_t offset;
3775        int ret;
3776        u64 disk_bytenr;
3777        LIST_HEAD(list);
3778
3779        ordered = btrfs_lookup_ordered_extent(inode, file_pos);
3780        BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
3781
3782        disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
3783        ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
3784                                       disk_bytenr + len - 1, &list);
3785
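            /*
             * the csums were looked up at the pre-relocation disk bytenr
             * (file_pos + index_cnt maps back to the old location);
             * rebase each sector sum onto the new ordered extent
             */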
3786        while (!list_empty(&list)) {
3787                sums = list_entry(list.next, struct btrfs_ordered_sum, list);
3788                list_del_init(&sums->list);
3789
3790                sector_sum = sums->sums;
3791                sums->bytenr = ordered->start;
3792
3793                offset = 0;
3794                while (offset < sums->len) {
3795                        sector_sum->bytenr += ordered->start - disk_bytenr;
3796                        sector_sum++;
3797                        offset += root->sectorsize;
3798                }
3799
3800                btrfs_add_ordered_sum(inode, ordered, sums);
3801        }
3802        btrfs_put_ordered_extent(ordered);
3803        return 0;
3804}
3805