linux/fs/btrfs/extent-tree.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2007 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 021110-1307, USA.
  17 */
  18#include <linux/sched.h>
  19#include <linux/pagemap.h>
  20#include <linux/writeback.h>
  21#include <linux/blkdev.h>
  22#include <linux/sort.h>
  23#include <linux/rcupdate.h>
  24#include <linux/kthread.h>
  25#include <linux/slab.h>
  26#include "compat.h"
  27#include "hash.h"
  28#include "ctree.h"
  29#include "disk-io.h"
  30#include "print-tree.h"
  31#include "transaction.h"
  32#include "volumes.h"
  33#include "locking.h"
  34#include "free-space-cache.h"
  35
  36static int update_block_group(struct btrfs_trans_handle *trans,
  37                              struct btrfs_root *root,
  38                              u64 bytenr, u64 num_bytes, int alloc);
  39static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
  40                                 u64 num_bytes, int reserve, int sinfo);
  41static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
  42                                struct btrfs_root *root,
  43                                u64 bytenr, u64 num_bytes, u64 parent,
  44                                u64 root_objectid, u64 owner_objectid,
  45                                u64 owner_offset, int refs_to_drop,
  46                                struct btrfs_delayed_extent_op *extra_op);
  47static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
  48                                    struct extent_buffer *leaf,
  49                                    struct btrfs_extent_item *ei);
  50static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
  51                                      struct btrfs_root *root,
  52                                      u64 parent, u64 root_objectid,
  53                                      u64 flags, u64 owner, u64 offset,
  54                                      struct btrfs_key *ins, int ref_mod);
  55static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
  56                                     struct btrfs_root *root,
  57                                     u64 parent, u64 root_objectid,
  58                                     u64 flags, struct btrfs_disk_key *key,
  59                                     int level, struct btrfs_key *ins);
  60static int do_chunk_alloc(struct btrfs_trans_handle *trans,
  61                          struct btrfs_root *extent_root, u64 alloc_bytes,
  62                          u64 flags, int force);
  63static int find_next_key(struct btrfs_path *path, int level,
  64                         struct btrfs_key *key);
  65static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
  66                            int dump_block_groups);
  67
  68static noinline int
  69block_group_cache_done(struct btrfs_block_group_cache *cache)
  70{
  71        smp_mb();
  72        return cache->cached == BTRFS_CACHE_FINISHED;
  73}
  74
  75static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
  76{
  77        return (cache->flags & bits) == bits;
  78}
  79
  80void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
  81{
  82        atomic_inc(&cache->count);
  83}
  84
  85void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
  86{
  87        if (atomic_dec_and_test(&cache->count)) {
  88                WARN_ON(cache->pinned > 0);
  89                WARN_ON(cache->reserved > 0);
  90                WARN_ON(cache->reserved_pinned > 0);
  91                kfree(cache);
  92        }
  93}
  94
  95/*
  96 * this adds the block group to the fs_info rb tree for the block group
  97 * cache
  98 */
  99static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 100                                struct btrfs_block_group_cache *block_group)
 101{
 102        struct rb_node **p;
 103        struct rb_node *parent = NULL;
 104        struct btrfs_block_group_cache *cache;
 105
 106        spin_lock(&info->block_group_cache_lock);
 107        p = &info->block_group_cache_tree.rb_node;
 108
 109        while (*p) {
 110                parent = *p;
 111                cache = rb_entry(parent, struct btrfs_block_group_cache,
 112                                 cache_node);
 113                if (block_group->key.objectid < cache->key.objectid) {
 114                        p = &(*p)->rb_left;
 115                } else if (block_group->key.objectid > cache->key.objectid) {
 116                        p = &(*p)->rb_right;
 117                } else {
 118                        spin_unlock(&info->block_group_cache_lock);
 119                        return -EEXIST;
 120                }
 121        }
 122
 123        rb_link_node(&block_group->cache_node, parent, p);
 124        rb_insert_color(&block_group->cache_node,
 125                        &info->block_group_cache_tree);
 126        spin_unlock(&info->block_group_cache_lock);
 127
 128        return 0;
 129}
 130
 131/*
 132 * This will return the block group at or after bytenr if contains is 0, else
 133 * it will return the block group that contains the bytenr
 134 */
 135static struct btrfs_block_group_cache *
 136block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 137                              int contains)
 138{
 139        struct btrfs_block_group_cache *cache, *ret = NULL;
 140        struct rb_node *n;
 141        u64 end, start;
 142
 143        spin_lock(&info->block_group_cache_lock);
 144        n = info->block_group_cache_tree.rb_node;
 145
 146        while (n) {
 147                cache = rb_entry(n, struct btrfs_block_group_cache,
 148                                 cache_node);
 149                end = cache->key.objectid + cache->key.offset - 1;
 150                start = cache->key.objectid;
 151
 152                if (bytenr < start) {
 153                        if (!contains && (!ret || start < ret->key.objectid))
 154                                ret = cache;
 155                        n = n->rb_left;
 156                } else if (bytenr > start) {
 157                        if (contains && bytenr <= end) {
 158                                ret = cache;
 159                                break;
 160                        }
 161                        n = n->rb_right;
 162                } else {
 163                        ret = cache;
 164                        break;
 165                }
 166        }
 167        if (ret)
 168                btrfs_get_block_group(ret);
 169        spin_unlock(&info->block_group_cache_lock);
 170
 171        return ret;
 172}
 173
 174static int add_excluded_extent(struct btrfs_root *root,
 175                               u64 start, u64 num_bytes)
 176{
 177        u64 end = start + num_bytes - 1;
 178        set_extent_bits(&root->fs_info->freed_extents[0],
 179                        start, end, EXTENT_UPTODATE, GFP_NOFS);
 180        set_extent_bits(&root->fs_info->freed_extents[1],
 181                        start, end, EXTENT_UPTODATE, GFP_NOFS);
 182        return 0;
 183}
 184
 185static void free_excluded_extents(struct btrfs_root *root,
 186                                  struct btrfs_block_group_cache *cache)
 187{
 188        u64 start, end;
 189
 190        start = cache->key.objectid;
 191        end = start + cache->key.offset - 1;
 192
 193        clear_extent_bits(&root->fs_info->freed_extents[0],
 194                          start, end, EXTENT_UPTODATE, GFP_NOFS);
 195        clear_extent_bits(&root->fs_info->freed_extents[1],
 196                          start, end, EXTENT_UPTODATE, GFP_NOFS);
 197}
 198
 199static int exclude_super_stripes(struct btrfs_root *root,
 200                                 struct btrfs_block_group_cache *cache)
 201{
 202        u64 bytenr;
 203        u64 *logical;
 204        int stripe_len;
 205        int i, nr, ret;
 206
 207        if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
 208                stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
 209                cache->bytes_super += stripe_len;
 210                ret = add_excluded_extent(root, cache->key.objectid,
 211                                          stripe_len);
 212                BUG_ON(ret);
 213        }
 214
 215        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 216                bytenr = btrfs_sb_offset(i);
 217                ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
 218                                       cache->key.objectid, bytenr,
 219                                       0, &logical, &nr, &stripe_len);
 220                BUG_ON(ret);
 221
 222                while (nr--) {
 223                        cache->bytes_super += stripe_len;
 224                        ret = add_excluded_extent(root, logical[nr],
 225                                                  stripe_len);
 226                        BUG_ON(ret);
 227                }
 228
 229                kfree(logical);
 230        }
 231        return 0;
 232}
 233
 234static struct btrfs_caching_control *
 235get_caching_control(struct btrfs_block_group_cache *cache)
 236{
 237        struct btrfs_caching_control *ctl;
 238
 239        spin_lock(&cache->lock);
 240        if (cache->cached != BTRFS_CACHE_STARTED) {
 241                spin_unlock(&cache->lock);
 242                return NULL;
 243        }
 244
 245        /* We're loading it the fast way, so we don't have a caching_ctl. */
 246        if (!cache->caching_ctl) {
 247                spin_unlock(&cache->lock);
 248                return NULL;
 249        }
 250
 251        ctl = cache->caching_ctl;
 252        atomic_inc(&ctl->count);
 253        spin_unlock(&cache->lock);
 254        return ctl;
 255}
 256
 257static void put_caching_control(struct btrfs_caching_control *ctl)
 258{
 259        if (atomic_dec_and_test(&ctl->count))
 260                kfree(ctl);
 261}
 262
 263/*
 264 * this is only called by cache_block_group, since we could have freed extents
 265 * we need to check the pinned_extents for any extents that can't be used yet
 266 * since their free space will be released as soon as the transaction commits.
 267 */
 268static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 269                              struct btrfs_fs_info *info, u64 start, u64 end)
 270{
 271        u64 extent_start, extent_end, size, total_added = 0;
 272        int ret;
 273
 274        while (start < end) {
 275                ret = find_first_extent_bit(info->pinned_extents, start,
 276                                            &extent_start, &extent_end,
 277                                            EXTENT_DIRTY | EXTENT_UPTODATE);
 278                if (ret)
 279                        break;
 280
 281                if (extent_start <= start) {
 282                        start = extent_end + 1;
 283                } else if (extent_start > start && extent_start < end) {
 284                        size = extent_start - start;
 285                        total_added += size;
 286                        ret = btrfs_add_free_space(block_group, start,
 287                                                   size);
 288                        BUG_ON(ret);
 289                        start = extent_end + 1;
 290                } else {
 291                        break;
 292                }
 293        }
 294
 295        if (start < end) {
 296                size = end - start;
 297                total_added += size;
 298                ret = btrfs_add_free_space(block_group, start, size);
 299                BUG_ON(ret);
 300        }
 301
 302        return total_added;
 303}
 304
 305static int caching_kthread(void *data)
 306{
 307        struct btrfs_block_group_cache *block_group = data;
 308        struct btrfs_fs_info *fs_info = block_group->fs_info;
 309        struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
 310        struct btrfs_root *extent_root = fs_info->extent_root;
 311        struct btrfs_path *path;
 312        struct extent_buffer *leaf;
 313        struct btrfs_key key;
 314        u64 total_found = 0;
 315        u64 last = 0;
 316        u32 nritems;
 317        int ret = 0;
 318
 319        path = btrfs_alloc_path();
 320        if (!path)
 321                return -ENOMEM;
 322
 323        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 324
 325        /*
 326         * We don't want to deadlock with somebody trying to allocate a new
 327         * extent for the extent root while also trying to search the extent
 328         * root to add free space.  So we skip locking and search the commit
 329         * root, since its read-only
 330         */
 331        path->skip_locking = 1;
 332        path->search_commit_root = 1;
 333        path->reada = 2;
 334
 335        key.objectid = last;
 336        key.offset = 0;
 337        key.type = BTRFS_EXTENT_ITEM_KEY;
 338again:
 339        mutex_lock(&caching_ctl->mutex);
 340        /* need to make sure the commit_root doesn't disappear */
 341        down_read(&fs_info->extent_commit_sem);
 342
 343        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 344        if (ret < 0)
 345                goto err;
 346
 347        leaf = path->nodes[0];
 348        nritems = btrfs_header_nritems(leaf);
 349
 350        while (1) {
 351                smp_mb();
 352                if (fs_info->closing > 1) {
 353                        last = (u64)-1;
 354                        break;
 355                }
 356
 357                if (path->slots[0] < nritems) {
 358                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 359                } else {
 360                        ret = find_next_key(path, 0, &key);
 361                        if (ret)
 362                                break;
 363
 364                        caching_ctl->progress = last;
 365                        btrfs_release_path(extent_root, path);
 366                        up_read(&fs_info->extent_commit_sem);
 367                        mutex_unlock(&caching_ctl->mutex);
 368                        if (btrfs_transaction_in_commit(fs_info))
 369                                schedule_timeout(1);
 370                        else
 371                                cond_resched();
 372                        goto again;
 373                }
 374
 375                if (key.objectid < block_group->key.objectid) {
 376                        path->slots[0]++;
 377                        continue;
 378                }
 379
 380                if (key.objectid >= block_group->key.objectid +
 381                    block_group->key.offset)
 382                        break;
 383
 384                if (key.type == BTRFS_EXTENT_ITEM_KEY) {
 385                        total_found += add_new_free_space(block_group,
 386                                                          fs_info, last,
 387                                                          key.objectid);
 388                        last = key.objectid + key.offset;
 389
 390                        if (total_found > (1024 * 1024 * 2)) {
 391                                total_found = 0;
 392                                wake_up(&caching_ctl->wait);
 393                        }
 394                }
 395                path->slots[0]++;
 396        }
 397        ret = 0;
 398
 399        total_found += add_new_free_space(block_group, fs_info, last,
 400                                          block_group->key.objectid +
 401                                          block_group->key.offset);
 402        caching_ctl->progress = (u64)-1;
 403
 404        spin_lock(&block_group->lock);
 405        block_group->caching_ctl = NULL;
 406        block_group->cached = BTRFS_CACHE_FINISHED;
 407        spin_unlock(&block_group->lock);
 408
 409err:
 410        btrfs_free_path(path);
 411        up_read(&fs_info->extent_commit_sem);
 412
 413        free_excluded_extents(extent_root, block_group);
 414
 415        mutex_unlock(&caching_ctl->mutex);
 416        wake_up(&caching_ctl->wait);
 417
 418        put_caching_control(caching_ctl);
 419        atomic_dec(&block_group->space_info->caching_threads);
 420        btrfs_put_block_group(block_group);
 421
 422        return 0;
 423}
 424
 425static int cache_block_group(struct btrfs_block_group_cache *cache,
 426                             struct btrfs_trans_handle *trans,
 427                             struct btrfs_root *root,
 428                             int load_cache_only)
 429{
 430        struct btrfs_fs_info *fs_info = cache->fs_info;
 431        struct btrfs_caching_control *caching_ctl;
 432        struct task_struct *tsk;
 433        int ret = 0;
 434
 435        smp_mb();
 436        if (cache->cached != BTRFS_CACHE_NO)
 437                return 0;
 438
 439        /*
 440         * We can't do the read from on-disk cache during a commit since we need
 441         * to have the normal tree locking.  Also if we are currently trying to
 442         * allocate blocks for the tree root we can't do the fast caching since
 443         * we likely hold important locks.
 444         */
 445        if (!trans->transaction->in_commit &&
 446            (root && root != root->fs_info->tree_root)) {
 447                spin_lock(&cache->lock);
 448                if (cache->cached != BTRFS_CACHE_NO) {
 449                        spin_unlock(&cache->lock);
 450                        return 0;
 451                }
 452                cache->cached = BTRFS_CACHE_STARTED;
 453                spin_unlock(&cache->lock);
 454
 455                ret = load_free_space_cache(fs_info, cache);
 456
 457                spin_lock(&cache->lock);
 458                if (ret == 1) {
 459                        cache->cached = BTRFS_CACHE_FINISHED;
 460                        cache->last_byte_to_unpin = (u64)-1;
 461                } else {
 462                        cache->cached = BTRFS_CACHE_NO;
 463                }
 464                spin_unlock(&cache->lock);
 465                if (ret == 1) {
 466                        free_excluded_extents(fs_info->extent_root, cache);
 467                        return 0;
 468                }
 469        }
 470
 471        if (load_cache_only)
 472                return 0;
 473
 474        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
 475        BUG_ON(!caching_ctl);
 476
 477        INIT_LIST_HEAD(&caching_ctl->list);
 478        mutex_init(&caching_ctl->mutex);
 479        init_waitqueue_head(&caching_ctl->wait);
 480        caching_ctl->block_group = cache;
 481        caching_ctl->progress = cache->key.objectid;
 482        /* one for caching kthread, one for caching block group list */
 483        atomic_set(&caching_ctl->count, 2);
 484
 485        spin_lock(&cache->lock);
 486        if (cache->cached != BTRFS_CACHE_NO) {
 487                spin_unlock(&cache->lock);
 488                kfree(caching_ctl);
 489                return 0;
 490        }
 491        cache->caching_ctl = caching_ctl;
 492        cache->cached = BTRFS_CACHE_STARTED;
 493        spin_unlock(&cache->lock);
 494
 495        down_write(&fs_info->extent_commit_sem);
 496        list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 497        up_write(&fs_info->extent_commit_sem);
 498
 499        atomic_inc(&cache->space_info->caching_threads);
 500        btrfs_get_block_group(cache);
 501
 502        tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
 503                          cache->key.objectid);
 504        if (IS_ERR(tsk)) {
 505                ret = PTR_ERR(tsk);
 506                printk(KERN_ERR "error running thread %d\n", ret);
 507                BUG();
 508        }
 509
 510        return ret;
 511}
 512
 513/*
 514 * return the block group that starts at or after bytenr
 515 */
 516static struct btrfs_block_group_cache *
 517btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
 518{
 519        struct btrfs_block_group_cache *cache;
 520
 521        cache = block_group_cache_tree_search(info, bytenr, 0);
 522
 523        return cache;
 524}
 525
 526/*
 527 * return the block group that contains the given bytenr
 528 */
 529struct btrfs_block_group_cache *btrfs_lookup_block_group(
 530                                                 struct btrfs_fs_info *info,
 531                                                 u64 bytenr)
 532{
 533        struct btrfs_block_group_cache *cache;
 534
 535        cache = block_group_cache_tree_search(info, bytenr, 1);
 536
 537        return cache;
 538}
 539
 540static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 541                                                  u64 flags)
 542{
 543        struct list_head *head = &info->space_info;
 544        struct btrfs_space_info *found;
 545
 546        flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
 547                 BTRFS_BLOCK_GROUP_METADATA;
 548
 549        rcu_read_lock();
 550        list_for_each_entry_rcu(found, head, list) {
 551                if (found->flags & flags) {
 552                        rcu_read_unlock();
 553                        return found;
 554                }
 555        }
 556        rcu_read_unlock();
 557        return NULL;
 558}
 559
 560/*
 561 * after adding space to the filesystem, we need to clear the full flags
 562 * on all the space infos.
 563 */
 564void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
 565{
 566        struct list_head *head = &info->space_info;
 567        struct btrfs_space_info *found;
 568
 569        rcu_read_lock();
 570        list_for_each_entry_rcu(found, head, list)
 571                found->full = 0;
 572        rcu_read_unlock();
 573}
 574
 575static u64 div_factor(u64 num, int factor)
 576{
 577        if (factor == 10)
 578                return num;
 579        num *= factor;
 580        do_div(num, 10);
 581        return num;
 582}
 583
 584static u64 div_factor_fine(u64 num, int factor)
 585{
 586        if (factor == 100)
 587                return num;
 588        num *= factor;
 589        do_div(num, 100);
 590        return num;
 591}
 592
 593u64 btrfs_find_block_group(struct btrfs_root *root,
 594                           u64 search_start, u64 search_hint, int owner)
 595{
 596        struct btrfs_block_group_cache *cache;
 597        u64 used;
 598        u64 last = max(search_hint, search_start);
 599        u64 group_start = 0;
 600        int full_search = 0;
 601        int factor = 9;
 602        int wrapped = 0;
 603again:
 604        while (1) {
 605                cache = btrfs_lookup_first_block_group(root->fs_info, last);
 606                if (!cache)
 607                        break;
 608
 609                spin_lock(&cache->lock);
 610                last = cache->key.objectid + cache->key.offset;
 611                used = btrfs_block_group_used(&cache->item);
 612
 613                if ((full_search || !cache->ro) &&
 614                    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
 615                        if (used + cache->pinned + cache->reserved <
 616                            div_factor(cache->key.offset, factor)) {
 617                                group_start = cache->key.objectid;
 618                                spin_unlock(&cache->lock);
 619                                btrfs_put_block_group(cache);
 620                                goto found;
 621                        }
 622                }
 623                spin_unlock(&cache->lock);
 624                btrfs_put_block_group(cache);
 625                cond_resched();
 626        }
 627        if (!wrapped) {
 628                last = search_start;
 629                wrapped = 1;
 630                goto again;
 631        }
 632        if (!full_search && factor < 10) {
 633                last = search_start;
 634                full_search = 1;
 635                factor = 10;
 636                goto again;
 637        }
 638found:
 639        return group_start;
 640}
 641
 642/* simple helper to search for an existing extent at a given offset */
 643int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 644{
 645        int ret;
 646        struct btrfs_key key;
 647        struct btrfs_path *path;
 648
 649        path = btrfs_alloc_path();
 650        BUG_ON(!path);
 651        key.objectid = start;
 652        key.offset = len;
 653        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 654        ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 655                                0, 0);
 656        btrfs_free_path(path);
 657        return ret;
 658}
 659
 660/*
 661 * helper function to lookup reference count and flags of extent.
 662 *
 663 * the head node for delayed ref is used to store the sum of all the
 664 * reference count modifications queued up in the rbtree. the head
 665 * node may also store the extent flags to set. This way you can check
 666 * to see what the reference count and extent flags would be if all of
 667 * the delayed refs are not processed.
 668 */
 669int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 670                             struct btrfs_root *root, u64 bytenr,
 671                             u64 num_bytes, u64 *refs, u64 *flags)
 672{
 673        struct btrfs_delayed_ref_head *head;
 674        struct btrfs_delayed_ref_root *delayed_refs;
 675        struct btrfs_path *path;
 676        struct btrfs_extent_item *ei;
 677        struct extent_buffer *leaf;
 678        struct btrfs_key key;
 679        u32 item_size;
 680        u64 num_refs;
 681        u64 extent_flags;
 682        int ret;
 683
 684        path = btrfs_alloc_path();
 685        if (!path)
 686                return -ENOMEM;
 687
 688        key.objectid = bytenr;
 689        key.type = BTRFS_EXTENT_ITEM_KEY;
 690        key.offset = num_bytes;
 691        if (!trans) {
 692                path->skip_locking = 1;
 693                path->search_commit_root = 1;
 694        }
 695again:
 696        ret = btrfs_search_slot(trans, root->fs_info->extent_root,
 697                                &key, path, 0, 0);
 698        if (ret < 0)
 699                goto out_free;
 700
 701        if (ret == 0) {
 702                leaf = path->nodes[0];
 703                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 704                if (item_size >= sizeof(*ei)) {
 705                        ei = btrfs_item_ptr(leaf, path->slots[0],
 706                                            struct btrfs_extent_item);
 707                        num_refs = btrfs_extent_refs(leaf, ei);
 708                        extent_flags = btrfs_extent_flags(leaf, ei);
 709                } else {
 710#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 711                        struct btrfs_extent_item_v0 *ei0;
 712                        BUG_ON(item_size != sizeof(*ei0));
 713                        ei0 = btrfs_item_ptr(leaf, path->slots[0],
 714                                             struct btrfs_extent_item_v0);
 715                        num_refs = btrfs_extent_refs_v0(leaf, ei0);
 716                        /* FIXME: this isn't correct for data */
 717                        extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
 718#else
 719                        BUG();
 720#endif
 721                }
 722                BUG_ON(num_refs == 0);
 723        } else {
 724                num_refs = 0;
 725                extent_flags = 0;
 726                ret = 0;
 727        }
 728
 729        if (!trans)
 730                goto out;
 731
 732        delayed_refs = &trans->transaction->delayed_refs;
 733        spin_lock(&delayed_refs->lock);
 734        head = btrfs_find_delayed_ref_head(trans, bytenr);
 735        if (head) {
 736                if (!mutex_trylock(&head->mutex)) {
 737                        atomic_inc(&head->node.refs);
 738                        spin_unlock(&delayed_refs->lock);
 739
 740                        btrfs_release_path(root->fs_info->extent_root, path);
 741
 742                        mutex_lock(&head->mutex);
 743                        mutex_unlock(&head->mutex);
 744                        btrfs_put_delayed_ref(&head->node);
 745                        goto again;
 746                }
 747                if (head->extent_op && head->extent_op->update_flags)
 748                        extent_flags |= head->extent_op->flags_to_set;
 749                else
 750                        BUG_ON(num_refs == 0);
 751
 752                num_refs += head->node.ref_mod;
 753                mutex_unlock(&head->mutex);
 754        }
 755        spin_unlock(&delayed_refs->lock);
 756out:
 757        WARN_ON(num_refs == 0);
 758        if (refs)
 759                *refs = num_refs;
 760        if (flags)
 761                *flags = extent_flags;
 762out_free:
 763        btrfs_free_path(path);
 764        return ret;
 765}
 766
 767/*
 768 * Back reference rules.  Back refs have three main goals:
 769 *
 770 * 1) differentiate between all holders of references to an extent so that
 771 *    when a reference is dropped we can make sure it was a valid reference
 772 *    before freeing the extent.
 773 *
 774 * 2) Provide enough information to quickly find the holders of an extent
 775 *    if we notice a given block is corrupted or bad.
 776 *
 777 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 778 *    maintenance.  This is actually the same as #2, but with a slightly
 779 *    different use case.
 780 *
 781 * There are two kinds of back refs. The implicit back refs is optimized
 782 * for pointers in non-shared tree blocks. For a given pointer in a block,
 783 * back refs of this kind provide information about the block's owner tree
 784 * and the pointer's key. These information allow us to find the block by
 785 * b-tree searching. The full back refs is for pointers in tree blocks not
 786 * referenced by their owner trees. The location of tree block is recorded
 787 * in the back refs. Actually the full back refs is generic, and can be
 788 * used in all cases the implicit back refs is used. The major shortcoming
 789 * of the full back refs is its overhead. Every time a tree block gets
 790 * COWed, we have to update back refs entry for all pointers in it.
 791 *
 792 * For a newly allocated tree block, we use implicit back refs for
 793 * pointers in it. This means most tree related operations only involve
 794 * implicit back refs. For a tree block created in old transaction, the
 795 * only way to drop a reference to it is COW it. So we can detect the
 796 * event that tree block loses its owner tree's reference and do the
 797 * back refs conversion.
 798 *
 799 * When a tree block is COW'd through a tree, there are four cases:
 800 *
 801 * The reference count of the block is one and the tree is the block's
 802 * owner tree. Nothing to do in this case.
 803 *
 804 * The reference count of the block is one and the tree is not the
 805 * block's owner tree. In this case, full back refs is used for pointers
 806 * in the block. Remove these full back refs, add implicit back refs for
 807 * every pointers in the new block.
 808 *
 809 * The reference count of the block is greater than one and the tree is
 810 * the block's owner tree. In this case, implicit back refs is used for
 811 * pointers in the block. Add full back refs for every pointers in the
 812 * block, increase lower level extents' reference counts. The original
 813 * implicit back refs are entailed to the new block.
 814 *
 815 * The reference count of the block is greater than one and the tree is
 816 * not the block's owner tree. Add implicit back refs for every pointer in
 817 * the new block, increase lower level extents' reference count.
 818 *
 819 * Back Reference Key composing:
 820 *
 821 * The key objectid corresponds to the first byte in the extent,
 822 * The key type is used to differentiate between types of back refs.
 823 * There are different meanings of the key offset for different types
 824 * of back refs.
 825 *
 826 * File extents can be referenced by:
 827 *
 828 * - multiple snapshots, subvolumes, or different generations in one subvol
 829 * - different files inside a single subvolume
 830 * - different offsets inside a file (bookend extents in file.c)
 831 *
 832 * The extent ref structure for the implicit back refs has fields for:
 833 *
 834 * - Objectid of the subvolume root
 835 * - objectid of the file holding the reference
 836 * - original offset in the file
 837 * - how many bookend extents
 838 *
 839 * The key offset for the implicit back refs is hash of the first
 840 * three fields.
 841 *
 842 * The extent ref structure for the full back refs has field for:
 843 *
 844 * - number of pointers in the tree leaf
 845 *
 846 * The key offset for the implicit back refs is the first byte of
 847 * the tree leaf
 848 *
 849 * When a file extent is allocated, The implicit back refs is used.
 850 * the fields are filled in:
 851 *
 852 *     (root_key.objectid, inode objectid, offset in file, 1)
 853 *
 854 * When a file extent is removed file truncation, we find the
 855 * corresponding implicit back refs and check the following fields:
 856 *
 857 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 858 *
 859 * Btree extents can be referenced by:
 860 *
 861 * - Different subvolumes
 862 *
 863 * Both the implicit back refs and the full back refs for tree blocks
 864 * only consist of key. The key offset for the implicit back refs is
 865 * objectid of block's owner tree. The key offset for the full back refs
 866 * is the first byte of parent block.
 867 *
 868 * When implicit back refs is used, information about the lowest key and
 869 * level of the tree block are required. These information are stored in
 870 * tree block info structure.
 871 */
 872
 873#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 874static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
 875                                  struct btrfs_root *root,
 876                                  struct btrfs_path *path,
 877                                  u64 owner, u32 extra_size)
 878{
 879        struct btrfs_extent_item *item;
 880        struct btrfs_extent_item_v0 *ei0;
 881        struct btrfs_extent_ref_v0 *ref0;
 882        struct btrfs_tree_block_info *bi;
 883        struct extent_buffer *leaf;
 884        struct btrfs_key key;
 885        struct btrfs_key found_key;
 886        u32 new_size = sizeof(*item);
 887        u64 refs;
 888        int ret;
 889
 890        leaf = path->nodes[0];
 891        BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
 892
 893        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 894        ei0 = btrfs_item_ptr(leaf, path->slots[0],
 895                             struct btrfs_extent_item_v0);
 896        refs = btrfs_extent_refs_v0(leaf, ei0);
 897
 898        if (owner == (u64)-1) {
 899                while (1) {
 900                        if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 901                                ret = btrfs_next_leaf(root, path);
 902                                if (ret < 0)
 903                                        return ret;
 904                                BUG_ON(ret > 0);
 905                                leaf = path->nodes[0];
 906                        }
 907                        btrfs_item_key_to_cpu(leaf, &found_key,
 908                                              path->slots[0]);
 909                        BUG_ON(key.objectid != found_key.objectid);
 910                        if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
 911                                path->slots[0]++;
 912                                continue;
 913                        }
 914                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
 915                                              struct btrfs_extent_ref_v0);
 916                        owner = btrfs_ref_objectid_v0(leaf, ref0);
 917                        break;
 918                }
 919        }
 920        btrfs_release_path(root, path);
 921
 922        if (owner < BTRFS_FIRST_FREE_OBJECTID)
 923                new_size += sizeof(*bi);
 924
 925        new_size -= sizeof(*ei0);
 926        ret = btrfs_search_slot(trans, root, &key, path,
 927                                new_size + extra_size, 1);
 928        if (ret < 0)
 929                return ret;
 930        BUG_ON(ret);
 931
 932        ret = btrfs_extend_item(trans, root, path, new_size);
 933        BUG_ON(ret);
 934
 935        leaf = path->nodes[0];
 936        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 937        btrfs_set_extent_refs(leaf, item, refs);
 938        /* FIXME: get real generation */
 939        btrfs_set_extent_generation(leaf, item, 0);
 940        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
 941                btrfs_set_extent_flags(leaf, item,
 942                                       BTRFS_EXTENT_FLAG_TREE_BLOCK |
 943                                       BTRFS_BLOCK_FLAG_FULL_BACKREF);
 944                bi = (struct btrfs_tree_block_info *)(item + 1);
 945                /* FIXME: get first key of the block */
 946                memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
 947                btrfs_set_tree_block_level(leaf, bi, (int)owner);
 948        } else {
 949                btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
 950        }
 951        btrfs_mark_buffer_dirty(leaf);
 952        return 0;
 953}
 954#endif
 955
 956static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
 957{
 958        u32 high_crc = ~(u32)0;
 959        u32 low_crc = ~(u32)0;
 960        __le64 lenum;
 961
 962        lenum = cpu_to_le64(root_objectid);
 963        high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
 964        lenum = cpu_to_le64(owner);
 965        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
 966        lenum = cpu_to_le64(offset);
 967        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
 968
 969        return ((u64)high_crc << 31) ^ (u64)low_crc;
 970}
 971
 972static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
 973                                     struct btrfs_extent_data_ref *ref)
 974{
 975        return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
 976                                    btrfs_extent_data_ref_objectid(leaf, ref),
 977                                    btrfs_extent_data_ref_offset(leaf, ref));
 978}
 979
 980static int match_extent_data_ref(struct extent_buffer *leaf,
 981                                 struct btrfs_extent_data_ref *ref,
 982                                 u64 root_objectid, u64 owner, u64 offset)
 983{
 984        if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
 985            btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
 986            btrfs_extent_data_ref_offset(leaf, ref) != offset)
 987                return 0;
 988        return 1;
 989}
 990
 991static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
 992                                           struct btrfs_root *root,
 993                                           struct btrfs_path *path,
 994                                           u64 bytenr, u64 parent,
 995                                           u64 root_objectid,
 996                                           u64 owner, u64 offset)
 997{
 998        struct btrfs_key key;
 999        struct btrfs_extent_data_ref *ref;
1000        struct extent_buffer *leaf;
1001        u32 nritems;
1002        int ret;
1003        int recow;
1004        int err = -ENOENT;
1005
1006        key.objectid = bytenr;
1007        if (parent) {
1008                key.type = BTRFS_SHARED_DATA_REF_KEY;
1009                key.offset = parent;
1010        } else {
1011                key.type = BTRFS_EXTENT_DATA_REF_KEY;
1012                key.offset = hash_extent_data_ref(root_objectid,
1013                                                  owner, offset);
1014        }
1015again:
1016        recow = 0;
1017        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1018        if (ret < 0) {
1019                err = ret;
1020                goto fail;
1021        }
1022
1023        if (parent) {
1024                if (!ret)
1025                        return 0;
1026#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1027                key.type = BTRFS_EXTENT_REF_V0_KEY;
1028                btrfs_release_path(root, path);
1029                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1030                if (ret < 0) {
1031                        err = ret;
1032                        goto fail;
1033                }
1034                if (!ret)
1035                        return 0;
1036#endif
1037                goto fail;
1038        }
1039
1040        leaf = path->nodes[0];
1041        nritems = btrfs_header_nritems(leaf);
1042        while (1) {
1043                if (path->slots[0] >= nritems) {
1044                        ret = btrfs_next_leaf(root, path);
1045                        if (ret < 0)
1046                                err = ret;
1047                        if (ret)
1048                                goto fail;
1049
1050                        leaf = path->nodes[0];
1051                        nritems = btrfs_header_nritems(leaf);
1052                        recow = 1;
1053                }
1054
1055                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1056                if (key.objectid != bytenr ||
1057                    key.type != BTRFS_EXTENT_DATA_REF_KEY)
1058                        goto fail;
1059
1060                ref = btrfs_item_ptr(leaf, path->slots[0],
1061                                     struct btrfs_extent_data_ref);
1062
1063                if (match_extent_data_ref(leaf, ref, root_objectid,
1064                                          owner, offset)) {
1065                        if (recow) {
1066                                btrfs_release_path(root, path);
1067                                goto again;
1068                        }
1069                        err = 0;
1070                        break;
1071                }
1072                path->slots[0]++;
1073        }
1074fail:
1075        return err;
1076}
1077
1078static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1079                                           struct btrfs_root *root,
1080                                           struct btrfs_path *path,
1081                                           u64 bytenr, u64 parent,
1082                                           u64 root_objectid, u64 owner,
1083                                           u64 offset, int refs_to_add)
1084{
1085        struct btrfs_key key;
1086        struct extent_buffer *leaf;
1087        u32 size;
1088        u32 num_refs;
1089        int ret;
1090
1091        key.objectid = bytenr;
1092        if (parent) {
1093                key.type = BTRFS_SHARED_DATA_REF_KEY;
1094                key.offset = parent;
1095                size = sizeof(struct btrfs_shared_data_ref);
1096        } else {
1097                key.type = BTRFS_EXTENT_DATA_REF_KEY;
1098                key.offset = hash_extent_data_ref(root_objectid,
1099                                                  owner, offset);
1100                size = sizeof(struct btrfs_extent_data_ref);
1101        }
1102
1103        ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1104        if (ret && ret != -EEXIST)
1105                goto fail;
1106
1107        leaf = path->nodes[0];
1108        if (parent) {
1109                struct btrfs_shared_data_ref *ref;
1110                ref = btrfs_item_ptr(leaf, path->slots[0],
1111                                     struct btrfs_shared_data_ref);
1112                if (ret == 0) {
1113                        btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1114                } else {
1115                        num_refs = btrfs_shared_data_ref_count(leaf, ref);
1116                        num_refs += refs_to_add;
1117                        btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1118                }
1119        } else {
1120                struct btrfs_extent_data_ref *ref;
1121                while (ret == -EEXIST) {
1122                        ref = btrfs_item_ptr(leaf, path->slots[0],
1123                                             struct btrfs_extent_data_ref);
1124                        if (match_extent_data_ref(leaf, ref, root_objectid,
1125                                                  owner, offset))
1126                                break;
1127                        btrfs_release_path(root, path);
1128                        key.offset++;
1129                        ret = btrfs_insert_empty_item(trans, root, path, &key,
1130                                                      size);
1131                        if (ret && ret != -EEXIST)
1132                                goto fail;
1133
1134                        leaf = path->nodes[0];
1135                }
1136                ref = btrfs_item_ptr(leaf, path->slots[0],
1137                                     struct btrfs_extent_data_ref);
1138                if (ret == 0) {
1139                        btrfs_set_extent_data_ref_root(leaf, ref,
1140                                                       root_objectid);
1141                        btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1142                        btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1143                        btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1144                } else {
1145                        num_refs = btrfs_extent_data_ref_count(leaf, ref);
1146                        num_refs += refs_to_add;
1147                        btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1148                }
1149        }
1150        btrfs_mark_buffer_dirty(leaf);
1151        ret = 0;
1152fail:
1153        btrfs_release_path(root, path);
1154        return ret;
1155}
1156
1157static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1158                                           struct btrfs_root *root,
1159                                           struct btrfs_path *path,
1160                                           int refs_to_drop)
1161{
1162        struct btrfs_key key;
1163        struct btrfs_extent_data_ref *ref1 = NULL;
1164        struct btrfs_shared_data_ref *ref2 = NULL;
1165        struct extent_buffer *leaf;
1166        u32 num_refs = 0;
1167        int ret = 0;
1168
1169        leaf = path->nodes[0];
1170        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1171
1172        if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1173                ref1 = btrfs_item_ptr(leaf, path->slots[0],
1174                                      struct btrfs_extent_data_ref);
1175                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1176        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1177                ref2 = btrfs_item_ptr(leaf, path->slots[0],
1178                                      struct btrfs_shared_data_ref);
1179                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1180#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1181        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1182                struct btrfs_extent_ref_v0 *ref0;
1183                ref0 = btrfs_item_ptr(leaf, path->slots[0],
1184                                      struct btrfs_extent_ref_v0);
1185                num_refs = btrfs_ref_count_v0(leaf, ref0);
1186#endif
1187        } else {
1188                BUG();
1189        }
1190
1191        BUG_ON(num_refs < refs_to_drop);
1192        num_refs -= refs_to_drop;
1193
1194        if (num_refs == 0) {
1195                ret = btrfs_del_item(trans, root, path);
1196        } else {
1197                if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1198                        btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1199                else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1200                        btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1201#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1202                else {
1203                        struct btrfs_extent_ref_v0 *ref0;
1204                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
1205                                        struct btrfs_extent_ref_v0);
1206                        btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1207                }
1208#endif
1209                btrfs_mark_buffer_dirty(leaf);
1210        }
1211        return ret;
1212}
1213
1214static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1215                                          struct btrfs_path *path,
1216                                          struct btrfs_extent_inline_ref *iref)
1217{
1218        struct btrfs_key key;
1219        struct extent_buffer *leaf;
1220        struct btrfs_extent_data_ref *ref1;
1221        struct btrfs_shared_data_ref *ref2;
1222        u32 num_refs = 0;
1223
1224        leaf = path->nodes[0];
1225        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1226        if (iref) {
1227                if (btrfs_extent_inline_ref_type(leaf, iref) ==
1228                    BTRFS_EXTENT_DATA_REF_KEY) {
1229                        ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1230                        num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1231                } else {
1232                        ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1233                        num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1234                }
1235        } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1236                ref1 = btrfs_item_ptr(leaf, path->slots[0],
1237                                      struct btrfs_extent_data_ref);
1238                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1239        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1240                ref2 = btrfs_item_ptr(leaf, path->slots[0],
1241                                      struct btrfs_shared_data_ref);
1242                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1243#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1244        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1245                struct btrfs_extent_ref_v0 *ref0;
1246                ref0 = btrfs_item_ptr(leaf, path->slots[0],
1247                                      struct btrfs_extent_ref_v0);
1248                num_refs = btrfs_ref_count_v0(leaf, ref0);
1249#endif
1250        } else {
1251                WARN_ON(1);
1252        }
1253        return num_refs;
1254}
1255
1256static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1257                                          struct btrfs_root *root,
1258                                          struct btrfs_path *path,
1259                                          u64 bytenr, u64 parent,
1260                                          u64 root_objectid)
1261{
1262        struct btrfs_key key;
1263        int ret;
1264
1265        key.objectid = bytenr;
1266        if (parent) {
1267                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1268                key.offset = parent;
1269        } else {
1270                key.type = BTRFS_TREE_BLOCK_REF_KEY;
1271                key.offset = root_objectid;
1272        }
1273
1274        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1275        if (ret > 0)
1276                ret = -ENOENT;
1277#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1278        if (ret == -ENOENT && parent) {
1279                btrfs_release_path(root, path);
1280                key.type = BTRFS_EXTENT_REF_V0_KEY;
1281                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1282                if (ret > 0)
1283                        ret = -ENOENT;
1284        }
1285#endif
1286        return ret;
1287}
1288
1289static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1290                                          struct btrfs_root *root,
1291                                          struct btrfs_path *path,
1292                                          u64 bytenr, u64 parent,
1293                                          u64 root_objectid)
1294{
1295        struct btrfs_key key;
1296        int ret;
1297
1298        key.objectid = bytenr;
1299        if (parent) {
1300                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1301                key.offset = parent;
1302        } else {
1303                key.type = BTRFS_TREE_BLOCK_REF_KEY;
1304                key.offset = root_objectid;
1305        }
1306
1307        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1308        btrfs_release_path(root, path);
1309        return ret;
1310}
1311
1312static inline int extent_ref_type(u64 parent, u64 owner)
1313{
1314        int type;
1315        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1316                if (parent > 0)
1317                        type = BTRFS_SHARED_BLOCK_REF_KEY;
1318                else
1319                        type = BTRFS_TREE_BLOCK_REF_KEY;
1320        } else {
1321                if (parent > 0)
1322                        type = BTRFS_SHARED_DATA_REF_KEY;
1323                else
1324                        type = BTRFS_EXTENT_DATA_REF_KEY;
1325        }
1326        return type;
1327}
1328
1329static int find_next_key(struct btrfs_path *path, int level,
1330                         struct btrfs_key *key)
1331
1332{
1333        for (; level < BTRFS_MAX_LEVEL; level++) {
1334                if (!path->nodes[level])
1335                        break;
1336                if (path->slots[level] + 1 >=
1337                    btrfs_header_nritems(path->nodes[level]))
1338                        continue;
1339                if (level == 0)
1340                        btrfs_item_key_to_cpu(path->nodes[level], key,
1341                                              path->slots[level] + 1);
1342                else
1343                        btrfs_node_key_to_cpu(path->nodes[level], key,
1344                                              path->slots[level] + 1);
1345                return 0;
1346        }
1347        return 1;
1348}
1349
1350/*
1351 * look for inline back ref. if back ref is found, *ref_ret is set
1352 * to the address of inline back ref, and 0 is returned.
1353 *
1354 * if back ref isn't found, *ref_ret is set to the address where it
1355 * should be inserted, and -ENOENT is returned.
1356 *
1357 * if insert is true and there are too many inline back refs, the path
1358 * points to the extent item, and -EAGAIN is returned.
1359 *
1360 * NOTE: inline back refs are ordered in the same way that back ref
1361 *       items in the tree are ordered.
1362 */
1363static noinline_for_stack
1364int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1365                                 struct btrfs_root *root,
1366                                 struct btrfs_path *path,
1367                                 struct btrfs_extent_inline_ref **ref_ret,
1368                                 u64 bytenr, u64 num_bytes,
1369                                 u64 parent, u64 root_objectid,
1370                                 u64 owner, u64 offset, int insert)
1371{
1372        struct btrfs_key key;
1373        struct extent_buffer *leaf;
1374        struct btrfs_extent_item *ei;
1375        struct btrfs_extent_inline_ref *iref;
1376        u64 flags;
1377        u64 item_size;
1378        unsigned long ptr;
1379        unsigned long end;
1380        int extra_size;
1381        int type;
1382        int want;
1383        int ret;
1384        int err = 0;
1385
1386        key.objectid = bytenr;
1387        key.type = BTRFS_EXTENT_ITEM_KEY;
1388        key.offset = num_bytes;
1389
1390        want = extent_ref_type(parent, owner);
1391        if (insert) {
1392                extra_size = btrfs_extent_inline_ref_size(want);
1393                path->keep_locks = 1;
1394        } else
1395                extra_size = -1;
1396        ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1397        if (ret < 0) {
1398                err = ret;
1399                goto out;
1400        }
1401        BUG_ON(ret);
1402
1403        leaf = path->nodes[0];
1404        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1405#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1406        if (item_size < sizeof(*ei)) {
1407                if (!insert) {
1408                        err = -ENOENT;
1409                        goto out;
1410                }
1411                ret = convert_extent_item_v0(trans, root, path, owner,
1412                                             extra_size);
1413                if (ret < 0) {
1414                        err = ret;
1415                        goto out;
1416                }
1417                leaf = path->nodes[0];
1418                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1419        }
1420#endif
1421        BUG_ON(item_size < sizeof(*ei));
1422
1423        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1424        flags = btrfs_extent_flags(leaf, ei);
1425
1426        ptr = (unsigned long)(ei + 1);
1427        end = (unsigned long)ei + item_size;
1428
1429        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1430                ptr += sizeof(struct btrfs_tree_block_info);
1431                BUG_ON(ptr > end);
1432        } else {
1433                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1434        }
1435
1436        err = -ENOENT;
1437        while (1) {
1438                if (ptr >= end) {
1439                        WARN_ON(ptr > end);
1440                        break;
1441                }
1442                iref = (struct btrfs_extent_inline_ref *)ptr;
1443                type = btrfs_extent_inline_ref_type(leaf, iref);
1444                if (want < type)
1445                        break;
1446                if (want > type) {
1447                        ptr += btrfs_extent_inline_ref_size(type);
1448                        continue;
1449                }
1450
1451                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1452                        struct btrfs_extent_data_ref *dref;
1453                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1454                        if (match_extent_data_ref(leaf, dref, root_objectid,
1455                                                  owner, offset)) {
1456                                err = 0;
1457                                break;
1458                        }
1459                        if (hash_extent_data_ref_item(leaf, dref) <
1460                            hash_extent_data_ref(root_objectid, owner, offset))
1461                                break;
1462                } else {
1463                        u64 ref_offset;
1464                        ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1465                        if (parent > 0) {
1466                                if (parent == ref_offset) {
1467                                        err = 0;
1468                                        break;
1469                                }
1470                                if (ref_offset < parent)
1471                                        break;
1472                        } else {
1473                                if (root_objectid == ref_offset) {
1474                                        err = 0;
1475                                        break;
1476                                }
1477                                if (ref_offset < root_objectid)
1478                                        break;
1479                        }
1480                }
1481                ptr += btrfs_extent_inline_ref_size(type);
1482        }
1483        if (err == -ENOENT && insert) {
1484                if (item_size + extra_size >=
1485                    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1486                        err = -EAGAIN;
1487                        goto out;
1488                }
1489                /*
1490                 * To add new inline back ref, we have to make sure
1491                 * there is no corresponding back ref item.
1492                 * For simplicity, we just do not add new inline back
1493                 * ref if there is any kind of item for this block
1494                 */
1495                if (find_next_key(path, 0, &key) == 0 &&
1496                    key.objectid == bytenr &&
1497                    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1498                        err = -EAGAIN;
1499                        goto out;
1500                }
1501        }
1502        *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1503out:
1504        if (insert) {
1505                path->keep_locks = 0;
1506                btrfs_unlock_up_safe(path, 1);
1507        }
1508        return err;
1509}
1510
1511/*
1512 * helper to add new inline back ref
1513 */
1514static noinline_for_stack
1515int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1516                                struct btrfs_root *root,
1517                                struct btrfs_path *path,
1518                                struct btrfs_extent_inline_ref *iref,
1519                                u64 parent, u64 root_objectid,
1520                                u64 owner, u64 offset, int refs_to_add,
1521                                struct btrfs_delayed_extent_op *extent_op)
1522{
1523        struct extent_buffer *leaf;
1524        struct btrfs_extent_item *ei;
1525        unsigned long ptr;
1526        unsigned long end;
1527        unsigned long item_offset;
1528        u64 refs;
1529        int size;
1530        int type;
1531        int ret;
1532
1533        leaf = path->nodes[0];
1534        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1535        item_offset = (unsigned long)iref - (unsigned long)ei;
1536
1537        type = extent_ref_type(parent, owner);
1538        size = btrfs_extent_inline_ref_size(type);
1539
1540        ret = btrfs_extend_item(trans, root, path, size);
1541        BUG_ON(ret);
1542
1543        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1544        refs = btrfs_extent_refs(leaf, ei);
1545        refs += refs_to_add;
1546        btrfs_set_extent_refs(leaf, ei, refs);
1547        if (extent_op)
1548                __run_delayed_extent_op(extent_op, leaf, ei);
1549
1550        ptr = (unsigned long)ei + item_offset;
1551        end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1552        if (ptr < end - size)
1553                memmove_extent_buffer(leaf, ptr + size, ptr,
1554                                      end - size - ptr);
1555
1556        iref = (struct btrfs_extent_inline_ref *)ptr;
1557        btrfs_set_extent_inline_ref_type(leaf, iref, type);
1558        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1559                struct btrfs_extent_data_ref *dref;
1560                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1561                btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1562                btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1563                btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1564                btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1565        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1566                struct btrfs_shared_data_ref *sref;
1567                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1568                btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1569                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1570        } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1571                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1572        } else {
1573                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1574        }
1575        btrfs_mark_buffer_dirty(leaf);
1576        return 0;
1577}
1578
1579static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1580                                 struct btrfs_root *root,
1581                                 struct btrfs_path *path,
1582                                 struct btrfs_extent_inline_ref **ref_ret,
1583                                 u64 bytenr, u64 num_bytes, u64 parent,
1584                                 u64 root_objectid, u64 owner, u64 offset)
1585{
1586        int ret;
1587
1588        ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1589                                           bytenr, num_bytes, parent,
1590                                           root_objectid, owner, offset, 0);
1591        if (ret != -ENOENT)
1592                return ret;
1593
1594        btrfs_release_path(root, path);
1595        *ref_ret = NULL;
1596
1597        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1598                ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1599                                            root_objectid);
1600        } else {
1601                ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1602                                             root_objectid, owner, offset);
1603        }
1604        return ret;
1605}
1606
1607/*
1608 * helper to update/remove inline back ref
1609 */
1610static noinline_for_stack
1611int update_inline_extent_backref(struct btrfs_trans_handle *trans,
1612                                 struct btrfs_root *root,
1613                                 struct btrfs_path *path,
1614                                 struct btrfs_extent_inline_ref *iref,
1615                                 int refs_to_mod,
1616                                 struct btrfs_delayed_extent_op *extent_op)
1617{
1618        struct extent_buffer *leaf;
1619        struct btrfs_extent_item *ei;
1620        struct btrfs_extent_data_ref *dref = NULL;
1621        struct btrfs_shared_data_ref *sref = NULL;
1622        unsigned long ptr;
1623        unsigned long end;
1624        u32 item_size;
1625        int size;
1626        int type;
1627        int ret;
1628        u64 refs;
1629
1630        leaf = path->nodes[0];
1631        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1632        refs = btrfs_extent_refs(leaf, ei);
1633        WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1634        refs += refs_to_mod;
1635        btrfs_set_extent_refs(leaf, ei, refs);
1636        if (extent_op)
1637                __run_delayed_extent_op(extent_op, leaf, ei);
1638
1639        type = btrfs_extent_inline_ref_type(leaf, iref);
1640
1641        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1642                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1643                refs = btrfs_extent_data_ref_count(leaf, dref);
1644        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1645                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1646                refs = btrfs_shared_data_ref_count(leaf, sref);
1647        } else {
1648                refs = 1;
1649                BUG_ON(refs_to_mod != -1);
1650        }
1651
1652        BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1653        refs += refs_to_mod;
1654
1655        if (refs > 0) {
1656                if (type == BTRFS_EXTENT_DATA_REF_KEY)
1657                        btrfs_set_extent_data_ref_count(leaf, dref, refs);
1658                else
1659                        btrfs_set_shared_data_ref_count(leaf, sref, refs);
1660        } else {
1661                size =  btrfs_extent_inline_ref_size(type);
1662                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1663                ptr = (unsigned long)iref;
1664                end = (unsigned long)ei + item_size;
1665                if (ptr + size < end)
1666                        memmove_extent_buffer(leaf, ptr, ptr + size,
1667                                              end - ptr - size);
1668                item_size -= size;
1669                ret = btrfs_truncate_item(trans, root, path, item_size, 1);
1670                BUG_ON(ret);
1671        }
1672        btrfs_mark_buffer_dirty(leaf);
1673        return 0;
1674}
1675
1676static noinline_for_stack
1677int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1678                                 struct btrfs_root *root,
1679                                 struct btrfs_path *path,
1680                                 u64 bytenr, u64 num_bytes, u64 parent,
1681                                 u64 root_objectid, u64 owner,
1682                                 u64 offset, int refs_to_add,
1683                                 struct btrfs_delayed_extent_op *extent_op)
1684{
1685        struct btrfs_extent_inline_ref *iref;
1686        int ret;
1687
1688        ret = lookup_inline_extent_backref(trans, root, path, &iref,
1689                                           bytenr, num_bytes, parent,
1690                                           root_objectid, owner, offset, 1);
1691        if (ret == 0) {
1692                BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1693                ret = update_inline_extent_backref(trans, root, path, iref,
1694                                                   refs_to_add, extent_op);
1695        } else if (ret == -ENOENT) {
1696                ret = setup_inline_extent_backref(trans, root, path, iref,
1697                                                  parent, root_objectid,
1698                                                  owner, offset, refs_to_add,
1699                                                  extent_op);
1700        }
1701        return ret;
1702}
1703
1704static int insert_extent_backref(struct btrfs_trans_handle *trans,
1705                                 struct btrfs_root *root,
1706                                 struct btrfs_path *path,
1707                                 u64 bytenr, u64 parent, u64 root_objectid,
1708                                 u64 owner, u64 offset, int refs_to_add)
1709{
1710        int ret;
1711        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1712                BUG_ON(refs_to_add != 1);
1713                ret = insert_tree_block_ref(trans, root, path, bytenr,
1714                                            parent, root_objectid);
1715        } else {
1716                ret = insert_extent_data_ref(trans, root, path, bytenr,
1717                                             parent, root_objectid,
1718                                             owner, offset, refs_to_add);
1719        }
1720        return ret;
1721}
1722
1723static int remove_extent_backref(struct btrfs_trans_handle *trans,
1724                                 struct btrfs_root *root,
1725                                 struct btrfs_path *path,
1726                                 struct btrfs_extent_inline_ref *iref,
1727                                 int refs_to_drop, int is_data)
1728{
1729        int ret;
1730
1731        BUG_ON(!is_data && refs_to_drop != 1);
1732        if (iref) {
1733                ret = update_inline_extent_backref(trans, root, path, iref,
1734                                                   -refs_to_drop, NULL);
1735        } else if (is_data) {
1736                ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1737        } else {
1738                ret = btrfs_del_item(trans, root, path);
1739        }
1740        return ret;
1741}
1742
1743static void btrfs_issue_discard(struct block_device *bdev,
1744                                u64 start, u64 len)
1745{
1746        blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
1747}
1748
1749static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1750                                u64 num_bytes)
1751{
1752        int ret;
1753        u64 map_length = num_bytes;
1754        struct btrfs_multi_bio *multi = NULL;
1755
1756        if (!btrfs_test_opt(root, DISCARD))
1757                return 0;
1758
1759        /* Tell the block device(s) that the sectors can be discarded */
1760        ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
1761                              bytenr, &map_length, &multi, 0);
1762        if (!ret) {
1763                struct btrfs_bio_stripe *stripe = multi->stripes;
1764                int i;
1765
1766                if (map_length > num_bytes)
1767                        map_length = num_bytes;
1768
1769                for (i = 0; i < multi->num_stripes; i++, stripe++) {
1770                        btrfs_issue_discard(stripe->dev->bdev,
1771                                            stripe->physical,
1772                                            map_length);
1773                }
1774                kfree(multi);
1775        }
1776
1777        return ret;
1778}
1779
1780int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1781                         struct btrfs_root *root,
1782                         u64 bytenr, u64 num_bytes, u64 parent,
1783                         u64 root_objectid, u64 owner, u64 offset)
1784{
1785        int ret;
1786        BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1787               root_objectid == BTRFS_TREE_LOG_OBJECTID);
1788
1789        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1790                ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
1791                                        parent, root_objectid, (int)owner,
1792                                        BTRFS_ADD_DELAYED_REF, NULL);
1793        } else {
1794                ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
1795                                        parent, root_objectid, owner, offset,
1796                                        BTRFS_ADD_DELAYED_REF, NULL);
1797        }
1798        return ret;
1799}
1800
1801static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1802                                  struct btrfs_root *root,
1803                                  u64 bytenr, u64 num_bytes,
1804                                  u64 parent, u64 root_objectid,
1805                                  u64 owner, u64 offset, int refs_to_add,
1806                                  struct btrfs_delayed_extent_op *extent_op)
1807{
1808        struct btrfs_path *path;
1809        struct extent_buffer *leaf;
1810        struct btrfs_extent_item *item;
1811        u64 refs;
1812        int ret;
1813        int err = 0;
1814
1815        path = btrfs_alloc_path();
1816        if (!path)
1817                return -ENOMEM;
1818
1819        path->reada = 1;
1820        path->leave_spinning = 1;
1821        /* this will setup the path even if it fails to insert the back ref */
1822        ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1823                                           path, bytenr, num_bytes, parent,
1824                                           root_objectid, owner, offset,
1825                                           refs_to_add, extent_op);
1826        if (ret == 0)
1827                goto out;
1828
1829        if (ret != -EAGAIN) {
1830                err = ret;
1831                goto out;
1832        }
1833
1834        leaf = path->nodes[0];
1835        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1836        refs = btrfs_extent_refs(leaf, item);
1837        btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1838        if (extent_op)
1839                __run_delayed_extent_op(extent_op, leaf, item);
1840
1841        btrfs_mark_buffer_dirty(leaf);
1842        btrfs_release_path(root->fs_info->extent_root, path);
1843
1844        path->reada = 1;
1845        path->leave_spinning = 1;
1846
1847        /* now insert the actual backref */
1848        ret = insert_extent_backref(trans, root->fs_info->extent_root,
1849                                    path, bytenr, parent, root_objectid,
1850                                    owner, offset, refs_to_add);
1851        BUG_ON(ret);
1852out:
1853        btrfs_free_path(path);
1854        return err;
1855}
1856
1857static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1858                                struct btrfs_root *root,
1859                                struct btrfs_delayed_ref_node *node,
1860                                struct btrfs_delayed_extent_op *extent_op,
1861                                int insert_reserved)
1862{
1863        int ret = 0;
1864        struct btrfs_delayed_data_ref *ref;
1865        struct btrfs_key ins;
1866        u64 parent = 0;
1867        u64 ref_root = 0;
1868        u64 flags = 0;
1869
1870        ins.objectid = node->bytenr;
1871        ins.offset = node->num_bytes;
1872        ins.type = BTRFS_EXTENT_ITEM_KEY;
1873
1874        ref = btrfs_delayed_node_to_data_ref(node);
1875        if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1876                parent = ref->parent;
1877        else
1878                ref_root = ref->root;
1879
1880        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1881                if (extent_op) {
1882                        BUG_ON(extent_op->update_key);
1883                        flags |= extent_op->flags_to_set;
1884                }
1885                ret = alloc_reserved_file_extent(trans, root,
1886                                                 parent, ref_root, flags,
1887                                                 ref->objectid, ref->offset,
1888                                                 &ins, node->ref_mod);
1889        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1890                ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1891                                             node->num_bytes, parent,
1892                                             ref_root, ref->objectid,
1893                                             ref->offset, node->ref_mod,
1894                                             extent_op);
1895        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1896                ret = __btrfs_free_extent(trans, root, node->bytenr,
1897                                          node->num_bytes, parent,
1898                                          ref_root, ref->objectid,
1899                                          ref->offset, node->ref_mod,
1900                                          extent_op);
1901        } else {
1902                BUG();
1903        }
1904        return ret;
1905}
1906
1907static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1908                                    struct extent_buffer *leaf,
1909                                    struct btrfs_extent_item *ei)
1910{
1911        u64 flags = btrfs_extent_flags(leaf, ei);
1912        if (extent_op->update_flags) {
1913                flags |= extent_op->flags_to_set;
1914                btrfs_set_extent_flags(leaf, ei, flags);
1915        }
1916
1917        if (extent_op->update_key) {
1918                struct btrfs_tree_block_info *bi;
1919                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1920                bi = (struct btrfs_tree_block_info *)(ei + 1);
1921                btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1922        }
1923}
1924
1925static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1926                                 struct btrfs_root *root,
1927                                 struct btrfs_delayed_ref_node *node,
1928                                 struct btrfs_delayed_extent_op *extent_op)
1929{
1930        struct btrfs_key key;
1931        struct btrfs_path *path;
1932        struct btrfs_extent_item *ei;
1933        struct extent_buffer *leaf;
1934        u32 item_size;
1935        int ret;
1936        int err = 0;
1937
1938        path = btrfs_alloc_path();
1939        if (!path)
1940                return -ENOMEM;
1941
1942        key.objectid = node->bytenr;
1943        key.type = BTRFS_EXTENT_ITEM_KEY;
1944        key.offset = node->num_bytes;
1945
1946        path->reada = 1;
1947        path->leave_spinning = 1;
1948        ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
1949                                path, 0, 1);
1950        if (ret < 0) {
1951                err = ret;
1952                goto out;
1953        }
1954        if (ret > 0) {
1955                err = -EIO;
1956                goto out;
1957        }
1958
1959        leaf = path->nodes[0];
1960        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1961#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1962        if (item_size < sizeof(*ei)) {
1963                ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
1964                                             path, (u64)-1, 0);
1965                if (ret < 0) {
1966                        err = ret;
1967                        goto out;
1968                }
1969                leaf = path->nodes[0];
1970                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1971        }
1972#endif
1973        BUG_ON(item_size < sizeof(*ei));
1974        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1975        __run_delayed_extent_op(extent_op, leaf, ei);
1976
1977        btrfs_mark_buffer_dirty(leaf);
1978out:
1979        btrfs_free_path(path);
1980        return err;
1981}
1982
1983static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1984                                struct btrfs_root *root,
1985                                struct btrfs_delayed_ref_node *node,
1986                                struct btrfs_delayed_extent_op *extent_op,
1987                                int insert_reserved)
1988{
1989        int ret = 0;
1990        struct btrfs_delayed_tree_ref *ref;
1991        struct btrfs_key ins;
1992        u64 parent = 0;
1993        u64 ref_root = 0;
1994
1995        ins.objectid = node->bytenr;
1996        ins.offset = node->num_bytes;
1997        ins.type = BTRFS_EXTENT_ITEM_KEY;
1998
1999        ref = btrfs_delayed_node_to_tree_ref(node);
2000        if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2001                parent = ref->parent;
2002        else
2003                ref_root = ref->root;
2004
2005        BUG_ON(node->ref_mod != 1);
2006        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2007                BUG_ON(!extent_op || !extent_op->update_flags ||
2008                       !extent_op->update_key);
2009                ret = alloc_reserved_tree_block(trans, root,
2010                                                parent, ref_root,
2011                                                extent_op->flags_to_set,
2012                                                &extent_op->key,
2013                                                ref->level, &ins);
2014        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2015                ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2016                                             node->num_bytes, parent, ref_root,
2017                                             ref->level, 0, 1, extent_op);
2018        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2019                ret = __btrfs_free_extent(trans, root, node->bytenr,
2020                                          node->num_bytes, parent, ref_root,
2021                                          ref->level, 0, 1, extent_op);
2022        } else {
2023                BUG();
2024        }
2025        return ret;
2026}
2027
2028/* helper function to actually process a single delayed ref entry */
2029static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2030                               struct btrfs_root *root,
2031                               struct btrfs_delayed_ref_node *node,
2032                               struct btrfs_delayed_extent_op *extent_op,
2033                               int insert_reserved)
2034{
2035        int ret;
2036        if (btrfs_delayed_ref_is_head(node)) {
2037                struct btrfs_delayed_ref_head *head;
2038                /*
2039                 * we've hit the end of the chain and we were supposed
2040                 * to insert this extent into the tree.  But, it got
2041                 * deleted before we ever needed to insert it, so all
2042                 * we have to do is clean up the accounting
2043                 */
2044                BUG_ON(extent_op);
2045                head = btrfs_delayed_node_to_head(node);
2046                if (insert_reserved) {
2047                        btrfs_pin_extent(root, node->bytenr,
2048                                         node->num_bytes, 1);
2049                        if (head->is_data) {
2050                                ret = btrfs_del_csums(trans, root,
2051                                                      node->bytenr,
2052                                                      node->num_bytes);
2053                                BUG_ON(ret);
2054                        }
2055                }
2056                mutex_unlock(&head->mutex);
2057                return 0;
2058        }
2059
2060        if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2061            node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2062                ret = run_delayed_tree_ref(trans, root, node, extent_op,
2063                                           insert_reserved);
2064        else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2065                 node->type == BTRFS_SHARED_DATA_REF_KEY)
2066                ret = run_delayed_data_ref(trans, root, node, extent_op,
2067                                           insert_reserved);
2068        else
2069                BUG();
2070        return ret;
2071}
2072
2073static noinline struct btrfs_delayed_ref_node *
2074select_delayed_ref(struct btrfs_delayed_ref_head *head)
2075{
2076        struct rb_node *node;
2077        struct btrfs_delayed_ref_node *ref;
2078        int action = BTRFS_ADD_DELAYED_REF;
2079again:
2080        /*
2081         * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
2082         * this prevents ref count from going down to zero when
2083         * there still are pending delayed ref.
2084         */
2085        node = rb_prev(&head->node.rb_node);
2086        while (1) {
2087                if (!node)
2088                        break;
2089                ref = rb_entry(node, struct btrfs_delayed_ref_node,
2090                                rb_node);
2091                if (ref->bytenr != head->node.bytenr)
2092                        break;
2093                if (ref->action == action)
2094                        return ref;
2095                node = rb_prev(node);
2096        }
2097        if (action == BTRFS_ADD_DELAYED_REF) {
2098                action = BTRFS_DROP_DELAYED_REF;
2099                goto again;
2100        }
2101        return NULL;
2102}
2103
2104static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2105                                       struct btrfs_root *root,
2106                                       struct list_head *cluster)
2107{
2108        struct btrfs_delayed_ref_root *delayed_refs;
2109        struct btrfs_delayed_ref_node *ref;
2110        struct btrfs_delayed_ref_head *locked_ref = NULL;
2111        struct btrfs_delayed_extent_op *extent_op;
2112        int ret;
2113        int count = 0;
2114        int must_insert_reserved = 0;
2115
2116        delayed_refs = &trans->transaction->delayed_refs;
2117        while (1) {
2118                if (!locked_ref) {
2119                        /* pick a new head ref from the cluster list */
2120                        if (list_empty(cluster))
2121                                break;
2122
2123                        locked_ref = list_entry(cluster->next,
2124                                     struct btrfs_delayed_ref_head, cluster);
2125
2126                        /* grab the lock that says we are going to process
2127                         * all the refs for this head */
2128                        ret = btrfs_delayed_ref_lock(trans, locked_ref);
2129
2130                        /*
2131                         * we may have dropped the spin lock to get the head
2132                         * mutex lock, and that might have given someone else
2133                         * time to free the head.  If that's true, it has been
2134                         * removed from our list and we can move on.
2135                         */
2136                        if (ret == -EAGAIN) {
2137                                locked_ref = NULL;
2138                                count++;
2139                                continue;
2140                        }
2141                }
2142
2143                /*
2144                 * record the must insert reserved flag before we
2145                 * drop the spin lock.
2146                 */
2147                must_insert_reserved = locked_ref->must_insert_reserved;
2148                locked_ref->must_insert_reserved = 0;
2149
2150                extent_op = locked_ref->extent_op;
2151                locked_ref->extent_op = NULL;
2152
2153                /*
2154                 * locked_ref is the head node, so we have to go one
2155                 * node back for any delayed ref updates
2156                 */
2157                ref = select_delayed_ref(locked_ref);
2158                if (!ref) {
2159                        /* All delayed refs have been processed, Go ahead
2160                         * and send the head node to run_one_delayed_ref,
2161                         * so that any accounting fixes can happen
2162                         */
2163                        ref = &locked_ref->node;
2164
2165                        if (extent_op && must_insert_reserved) {
2166                                kfree(extent_op);
2167                                extent_op = NULL;
2168                        }
2169
2170                        if (extent_op) {
2171                                spin_unlock(&delayed_refs->lock);
2172
2173                                ret = run_delayed_extent_op(trans, root,
2174                                                            ref, extent_op);
2175                                BUG_ON(ret);
2176                                kfree(extent_op);
2177
2178                                cond_resched();
2179                                spin_lock(&delayed_refs->lock);
2180                                continue;
2181                        }
2182
2183                        list_del_init(&locked_ref->cluster);
2184                        locked_ref = NULL;
2185                }
2186
2187                ref->in_tree = 0;
2188                rb_erase(&ref->rb_node, &delayed_refs->root);
2189                delayed_refs->num_entries--;
2190
2191                spin_unlock(&delayed_refs->lock);
2192
2193                ret = run_one_delayed_ref(trans, root, ref, extent_op,
2194                                          must_insert_reserved);
2195                BUG_ON(ret);
2196
2197                btrfs_put_delayed_ref(ref);
2198                kfree(extent_op);
2199                count++;
2200
2201                cond_resched();
2202                spin_lock(&delayed_refs->lock);
2203        }
2204        return count;
2205}
2206
2207/*
2208 * this starts processing the delayed reference count updates and
2209 * extent insertions we have queued up so far.  count can be
2210 * 0, which means to process everything in the tree at the start
2211 * of the run (but not newly added entries), or it can be some target
2212 * number you'd like to process.
2213 */
2214int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2215                           struct btrfs_root *root, unsigned long count)
2216{
2217        struct rb_node *node;
2218        struct btrfs_delayed_ref_root *delayed_refs;
2219        struct btrfs_delayed_ref_node *ref;
2220        struct list_head cluster;
2221        int ret;
2222        int run_all = count == (unsigned long)-1;
2223        int run_most = 0;
2224
2225        if (root == root->fs_info->extent_root)
2226                root = root->fs_info->tree_root;
2227
2228        delayed_refs = &trans->transaction->delayed_refs;
2229        INIT_LIST_HEAD(&cluster);
2230again:
2231        spin_lock(&delayed_refs->lock);
2232        if (count == 0) {
2233                count = delayed_refs->num_entries * 2;
2234                run_most = 1;
2235        }
2236        while (1) {
2237                if (!(run_all || run_most) &&
2238                    delayed_refs->num_heads_ready < 64)
2239                        break;
2240
2241                /*
2242                 * go find something we can process in the rbtree.  We start at
2243                 * the beginning of the tree, and then build a cluster
2244                 * of refs to process starting at the first one we are able to
2245                 * lock
2246                 */
2247                ret = btrfs_find_ref_cluster(trans, &cluster,
2248                                             delayed_refs->run_delayed_start);
2249                if (ret)
2250                        break;
2251
2252                ret = run_clustered_refs(trans, root, &cluster);
2253                BUG_ON(ret < 0);
2254
2255                count -= min_t(unsigned long, ret, count);
2256
2257                if (count == 0)
2258                        break;
2259        }
2260
2261        if (run_all) {
2262                node = rb_first(&delayed_refs->root);
2263                if (!node)
2264                        goto out;
2265                count = (unsigned long)-1;
2266
2267                while (node) {
2268                        ref = rb_entry(node, struct btrfs_delayed_ref_node,
2269                                       rb_node);
2270                        if (btrfs_delayed_ref_is_head(ref)) {
2271                                struct btrfs_delayed_ref_head *head;
2272
2273                                head = btrfs_delayed_node_to_head(ref);
2274                                atomic_inc(&ref->refs);
2275
2276                                spin_unlock(&delayed_refs->lock);
2277                                mutex_lock(&head->mutex);
2278                                mutex_unlock(&head->mutex);
2279
2280                                btrfs_put_delayed_ref(ref);
2281                                cond_resched();
2282                                goto again;
2283                        }
2284                        node = rb_next(node);
2285                }
2286                spin_unlock(&delayed_refs->lock);
2287                schedule_timeout(1);
2288                goto again;
2289        }
2290out:
2291        spin_unlock(&delayed_refs->lock);
2292        return 0;
2293}
2294
2295int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2296                                struct btrfs_root *root,
2297                                u64 bytenr, u64 num_bytes, u64 flags,
2298                                int is_data)
2299{
2300        struct btrfs_delayed_extent_op *extent_op;
2301        int ret;
2302
2303        extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2304        if (!extent_op)
2305                return -ENOMEM;
2306
2307        extent_op->flags_to_set = flags;
2308        extent_op->update_flags = 1;
2309        extent_op->update_key = 0;
2310        extent_op->is_data = is_data ? 1 : 0;
2311
2312        ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2313        if (ret)
2314                kfree(extent_op);
2315        return ret;
2316}
2317
2318static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2319                                      struct btrfs_root *root,
2320                                      struct btrfs_path *path,
2321                                      u64 objectid, u64 offset, u64 bytenr)
2322{
2323        struct btrfs_delayed_ref_head *head;
2324        struct btrfs_delayed_ref_node *ref;
2325        struct btrfs_delayed_data_ref *data_ref;
2326        struct btrfs_delayed_ref_root *delayed_refs;
2327        struct rb_node *node;
2328        int ret = 0;
2329
2330        ret = -ENOENT;
2331        delayed_refs = &trans->transaction->delayed_refs;
2332        spin_lock(&delayed_refs->lock);
2333        head = btrfs_find_delayed_ref_head(trans, bytenr);
2334        if (!head)
2335                goto out;
2336
2337        if (!mutex_trylock(&head->mutex)) {
2338                atomic_inc(&head->node.refs);
2339                spin_unlock(&delayed_refs->lock);
2340
2341                btrfs_release_path(root->fs_info->extent_root, path);
2342
2343                mutex_lock(&head->mutex);
2344                mutex_unlock(&head->mutex);
2345                btrfs_put_delayed_ref(&head->node);
2346                return -EAGAIN;
2347        }
2348
2349        node = rb_prev(&head->node.rb_node);
2350        if (!node)
2351                goto out_unlock;
2352
2353        ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2354
2355        if (ref->bytenr != bytenr)
2356                goto out_unlock;
2357
2358        ret = 1;
2359        if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2360                goto out_unlock;
2361
2362        data_ref = btrfs_delayed_node_to_data_ref(ref);
2363
2364        node = rb_prev(node);
2365        if (node) {
2366                ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2367                if (ref->bytenr == bytenr)
2368                        goto out_unlock;
2369        }
2370
2371        if (data_ref->root != root->root_key.objectid ||
2372            data_ref->objectid != objectid || data_ref->offset != offset)
2373                goto out_unlock;
2374
2375        ret = 0;
2376out_unlock:
2377        mutex_unlock(&head->mutex);
2378out:
2379        spin_unlock(&delayed_refs->lock);
2380        return ret;
2381}
2382
2383static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2384                                        struct btrfs_root *root,
2385                                        struct btrfs_path *path,
2386                                        u64 objectid, u64 offset, u64 bytenr)
2387{
2388        struct btrfs_root *extent_root = root->fs_info->extent_root;
2389        struct extent_buffer *leaf;
2390        struct btrfs_extent_data_ref *ref;
2391        struct btrfs_extent_inline_ref *iref;
2392        struct btrfs_extent_item *ei;
2393        struct btrfs_key key;
2394        u32 item_size;
2395        int ret;
2396
2397        key.objectid = bytenr;
2398        key.offset = (u64)-1;
2399        key.type = BTRFS_EXTENT_ITEM_KEY;
2400
2401        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2402        if (ret < 0)
2403                goto out;
2404        BUG_ON(ret == 0);
2405
2406        ret = -ENOENT;
2407        if (path->slots[0] == 0)
2408                goto out;
2409
2410        path->slots[0]--;
2411        leaf = path->nodes[0];
2412        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2413
2414        if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2415                goto out;
2416
2417        ret = 1;
2418        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2419#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2420        if (item_size < sizeof(*ei)) {
2421                WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2422                goto out;
2423        }
2424#endif
2425        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2426
2427        if (item_size != sizeof(*ei) +
2428            btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2429                goto out;
2430
2431        if (btrfs_extent_generation(leaf, ei) <=
2432            btrfs_root_last_snapshot(&root->root_item))
2433                goto out;
2434
2435        iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2436        if (btrfs_extent_inline_ref_type(leaf, iref) !=
2437            BTRFS_EXTENT_DATA_REF_KEY)
2438                goto out;
2439
2440        ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2441        if (btrfs_extent_refs(leaf, ei) !=
2442            btrfs_extent_data_ref_count(leaf, ref) ||
2443            btrfs_extent_data_ref_root(leaf, ref) !=
2444            root->root_key.objectid ||
2445            btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2446            btrfs_extent_data_ref_offset(leaf, ref) != offset)
2447                goto out;
2448
2449        ret = 0;
2450out:
2451        return ret;
2452}
2453
2454int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2455                          struct btrfs_root *root,
2456                          u64 objectid, u64 offset, u64 bytenr)
2457{
2458        struct btrfs_path *path;
2459        int ret;
2460        int ret2;
2461
2462        path = btrfs_alloc_path();
2463        if (!path)
2464                return -ENOENT;
2465
2466        do {
2467                ret = check_committed_ref(trans, root, path, objectid,
2468                                          offset, bytenr);
2469                if (ret && ret != -ENOENT)
2470                        goto out;
2471
2472                ret2 = check_delayed_ref(trans, root, path, objectid,
2473                                         offset, bytenr);
2474        } while (ret2 == -EAGAIN);
2475
2476        if (ret2 && ret2 != -ENOENT) {
2477                ret = ret2;
2478                goto out;
2479        }
2480
2481        if (ret != -ENOENT || ret2 != -ENOENT)
2482                ret = 0;
2483out:
2484        btrfs_free_path(path);
2485        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2486                WARN_ON(ret > 0);
2487        return ret;
2488}
2489
2490#if 0
2491int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2492                    struct extent_buffer *buf, u32 nr_extents)
2493{
2494        struct btrfs_key key;
2495        struct btrfs_file_extent_item *fi;
2496        u64 root_gen;
2497        u32 nritems;
2498        int i;
2499        int level;
2500        int ret = 0;
2501        int shared = 0;
2502
2503        if (!root->ref_cows)
2504                return 0;
2505
2506        if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
2507                shared = 0;
2508                root_gen = root->root_key.offset;
2509        } else {
2510                shared = 1;
2511                root_gen = trans->transid - 1;
2512        }
2513
2514        level = btrfs_header_level(buf);
2515        nritems = btrfs_header_nritems(buf);
2516
2517        if (level == 0) {
2518                struct btrfs_leaf_ref *ref;
2519                struct btrfs_extent_info *info;
2520
2521                ref = btrfs_alloc_leaf_ref(root, nr_extents);
2522                if (!ref) {
2523                        ret = -ENOMEM;
2524                        goto out;
2525                }
2526
2527                ref->root_gen = root_gen;
2528                ref->bytenr = buf->start;
2529                ref->owner = btrfs_header_owner(buf);
2530                ref->generation = btrfs_header_generation(buf);
2531                ref->nritems = nr_extents;
2532                info = ref->extents;
2533
2534                for (i = 0; nr_extents > 0 && i < nritems; i++) {
2535                        u64 disk_bytenr;
2536                        btrfs_item_key_to_cpu(buf, &key, i);
2537                        if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2538                                continue;
2539                        fi = btrfs_item_ptr(buf, i,
2540                                            struct btrfs_file_extent_item);
2541                        if (btrfs_file_extent_type(buf, fi) ==
2542                            BTRFS_FILE_EXTENT_INLINE)
2543                                continue;
2544                        disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2545                        if (disk_bytenr == 0)
2546                                continue;
2547
2548                        info->bytenr = disk_bytenr;
2549                        info->num_bytes =
2550                                btrfs_file_extent_disk_num_bytes(buf, fi);
2551                        info->objectid = key.objectid;
2552                        info->offset = key.offset;
2553                        info++;
2554                }
2555
2556                ret = btrfs_add_leaf_ref(root, ref, shared);
2557                if (ret == -EEXIST && shared) {
2558                        struct btrfs_leaf_ref *old;
2559                        old = btrfs_lookup_leaf_ref(root, ref->bytenr);
2560                        BUG_ON(!old);
2561                        btrfs_remove_leaf_ref(root, old);
2562                        btrfs_free_leaf_ref(root, old);
2563                        ret = btrfs_add_leaf_ref(root, ref, shared);
2564                }
2565                WARN_ON(ret);
2566                btrfs_free_leaf_ref(root, ref);
2567        }
2568out:
2569        return ret;
2570}
2571
2572/* when a block goes through cow, we update the reference counts of
2573 * everything that block points to.  The internal pointers of the block
2574 * can be in just about any order, and it is likely to have clusters of
2575 * things that are close together and clusters of things that are not.
2576 *
2577 * To help reduce the seeks that come with updating all of these reference
2578 * counts, sort them by byte number before actual updates are done.
2579 *
2580 * struct refsort is used to match byte number to slot in the btree block.
2581 * we sort based on the byte number and then use the slot to actually
2582 * find the item.
2583 *
2584 * struct refsort is smaller than strcut btrfs_item and smaller than
2585 * struct btrfs_key_ptr.  Since we're currently limited to the page size
2586 * for a btree block, there's no way for a kmalloc of refsorts for a
2587 * single node to be bigger than a page.
2588 */
2589struct refsort {
2590        u64 bytenr;
2591        u32 slot;
2592};
2593
2594/*
2595 * for passing into sort()
2596 */
2597static int refsort_cmp(const void *a_void, const void *b_void)
2598{
2599        const struct refsort *a = a_void;
2600        const struct refsort *b = b_void;
2601
2602        if (a->bytenr < b->bytenr)
2603                return -1;
2604        if (a->bytenr > b->bytenr)
2605                return 1;
2606        return 0;
2607}
2608#endif
2609
2610static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2611                           struct btrfs_root *root,
2612                           struct extent_buffer *buf,
2613                           int full_backref, int inc)
2614{
2615        u64 bytenr;
2616        u64 num_bytes;
2617        u64 parent;
2618        u64 ref_root;
2619        u32 nritems;
2620        struct btrfs_key key;
2621        struct btrfs_file_extent_item *fi;
2622        int i;
2623        int level;
2624        int ret = 0;
2625        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2626                            u64, u64, u64, u64, u64, u64);
2627
2628        ref_root = btrfs_header_owner(buf);
2629        nritems = btrfs_header_nritems(buf);
2630        level = btrfs_header_level(buf);
2631
2632        if (!root->ref_cows && level == 0)
2633                return 0;
2634
2635        if (inc)
2636                process_func = btrfs_inc_extent_ref;
2637        else
2638                process_func = btrfs_free_extent;
2639
2640        if (full_backref)
2641                parent = buf->start;
2642        else
2643                parent = 0;
2644
2645        for (i = 0; i < nritems; i++) {
2646                if (level == 0) {
2647                        btrfs_item_key_to_cpu(buf, &key, i);
2648                        if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2649                                continue;
2650                        fi = btrfs_item_ptr(buf, i,
2651                                            struct btrfs_file_extent_item);
2652                        if (btrfs_file_extent_type(buf, fi) ==
2653                            BTRFS_FILE_EXTENT_INLINE)
2654                                continue;
2655                        bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2656                        if (bytenr == 0)
2657                                continue;
2658
2659                        num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2660                        key.offset -= btrfs_file_extent_offset(buf, fi);
2661                        ret = process_func(trans, root, bytenr, num_bytes,
2662                                           parent, ref_root, key.objectid,
2663                                           key.offset);
2664                        if (ret)
2665                                goto fail;
2666                } else {
2667                        bytenr = btrfs_node_blockptr(buf, i);
2668                        num_bytes = btrfs_level_size(root, level - 1);
2669                        ret = process_func(trans, root, bytenr, num_bytes,
2670                                           parent, ref_root, level - 1, 0);
2671                        if (ret)
2672                                goto fail;
2673                }
2674        }
2675        return 0;
2676fail:
2677        BUG();
2678        return ret;
2679}
2680
2681int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2682                  struct extent_buffer *buf, int full_backref)
2683{
2684        return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2685}
2686
2687int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2688                  struct extent_buffer *buf, int full_backref)
2689{
2690        return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2691}
2692
2693static int write_one_cache_group(struct btrfs_trans_handle *trans,
2694                                 struct btrfs_root *root,
2695                                 struct btrfs_path *path,
2696                                 struct btrfs_block_group_cache *cache)
2697{
2698        int ret;
2699        struct btrfs_root *extent_root = root->fs_info->extent_root;
2700        unsigned long bi;
2701        struct extent_buffer *leaf;
2702
2703        ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2704        if (ret < 0)
2705                goto fail;
2706        BUG_ON(ret);
2707
2708        leaf = path->nodes[0];
2709        bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2710        write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2711        btrfs_mark_buffer_dirty(leaf);
2712        btrfs_release_path(extent_root, path);
2713fail:
2714        if (ret)
2715                return ret;
2716        return 0;
2717
2718}
2719
2720static struct btrfs_block_group_cache *
2721next_block_group(struct btrfs_root *root,
2722                 struct btrfs_block_group_cache *cache)
2723{
2724        struct rb_node *node;
2725        spin_lock(&root->fs_info->block_group_cache_lock);
2726        node = rb_next(&cache->cache_node);
2727        btrfs_put_block_group(cache);
2728        if (node) {
2729                cache = rb_entry(node, struct btrfs_block_group_cache,
2730                                 cache_node);
2731                btrfs_get_block_group(cache);
2732        } else
2733                cache = NULL;
2734        spin_unlock(&root->fs_info->block_group_cache_lock);
2735        return cache;
2736}
2737
2738static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2739                            struct btrfs_trans_handle *trans,
2740                            struct btrfs_path *path)
2741{
2742        struct btrfs_root *root = block_group->fs_info->tree_root;
2743        struct inode *inode = NULL;
2744        u64 alloc_hint = 0;
2745        int dcs = BTRFS_DC_ERROR;
2746        int num_pages = 0;
2747        int retries = 0;
2748        int ret = 0;
2749
2750        /*
2751         * If this block group is smaller than 100 megs don't bother caching the
2752         * block group.
2753         */
2754        if (block_group->key.offset < (100 * 1024 * 1024)) {
2755                spin_lock(&block_group->lock);
2756                block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2757                spin_unlock(&block_group->lock);
2758                return 0;
2759        }
2760
2761again:
2762        inode = lookup_free_space_inode(root, block_group, path);
2763        if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2764                ret = PTR_ERR(inode);
2765                btrfs_release_path(root, path);
2766                goto out;
2767        }
2768
2769        if (IS_ERR(inode)) {
2770                BUG_ON(retries);
2771                retries++;
2772
2773                if (block_group->ro)
2774                        goto out_free;
2775
2776                ret = create_free_space_inode(root, trans, block_group, path);
2777                if (ret)
2778                        goto out_free;
2779                goto again;
2780        }
2781
2782        /*
2783         * We want to set the generation to 0, that way if anything goes wrong
2784         * from here on out we know not to trust this cache when we load up next
2785         * time.
2786         */
2787        BTRFS_I(inode)->generation = 0;
2788        ret = btrfs_update_inode(trans, root, inode);
2789        WARN_ON(ret);
2790
2791        if (i_size_read(inode) > 0) {
2792                ret = btrfs_truncate_free_space_cache(root, trans, path,
2793                                                      inode);
2794                if (ret)
2795                        goto out_put;
2796        }
2797
2798        spin_lock(&block_group->lock);
2799        if (block_group->cached != BTRFS_CACHE_FINISHED) {
2800                /* We're not cached, don't bother trying to write stuff out */
2801                dcs = BTRFS_DC_WRITTEN;
2802                spin_unlock(&block_group->lock);
2803                goto out_put;
2804        }
2805        spin_unlock(&block_group->lock);
2806
2807        num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
2808        if (!num_pages)
2809                num_pages = 1;
2810
2811        /*
2812         * Just to make absolutely sure we have enough space, we're going to
2813         * preallocate 12 pages worth of space for each block group.  In
2814         * practice we ought to use at most 8, but we need extra space so we can
2815         * add our header and have a terminator between the extents and the
2816         * bitmaps.
2817         */
2818        num_pages *= 16;
2819        num_pages *= PAGE_CACHE_SIZE;
2820
2821        ret = btrfs_check_data_free_space(inode, num_pages);
2822        if (ret)
2823                goto out_put;
2824
2825        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2826                                              num_pages, num_pages,
2827                                              &alloc_hint);
2828        if (!ret)
2829                dcs = BTRFS_DC_SETUP;
2830        btrfs_free_reserved_data_space(inode, num_pages);
2831out_put:
2832        iput(inode);
2833out_free:
2834        btrfs_release_path(root, path);
2835out:
2836        spin_lock(&block_group->lock);
2837        block_group->disk_cache_state = dcs;
2838        spin_unlock(&block_group->lock);
2839
2840        return ret;
2841}
2842
2843int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2844                                   struct btrfs_root *root)
2845{
2846        struct btrfs_block_group_cache *cache;
2847        int err = 0;
2848        struct btrfs_path *path;
2849        u64 last = 0;
2850
2851        path = btrfs_alloc_path();
2852        if (!path)
2853                return -ENOMEM;
2854
2855again:
2856        while (1) {
2857                cache = btrfs_lookup_first_block_group(root->fs_info, last);
2858                while (cache) {
2859                        if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2860                                break;
2861                        cache = next_block_group(root, cache);
2862                }
2863                if (!cache) {
2864                        if (last == 0)
2865                                break;
2866                        last = 0;
2867                        continue;
2868                }
2869                err = cache_save_setup(cache, trans, path);
2870                last = cache->key.objectid + cache->key.offset;
2871                btrfs_put_block_group(cache);
2872        }
2873
2874        while (1) {
2875                if (last == 0) {
2876                        err = btrfs_run_delayed_refs(trans, root,
2877                                                     (unsigned long)-1);
2878                        BUG_ON(err);
2879                }
2880
2881                cache = btrfs_lookup_first_block_group(root->fs_info, last);
2882                while (cache) {
2883                        if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
2884                                btrfs_put_block_group(cache);
2885                                goto again;
2886                        }
2887
2888                        if (cache->dirty)
2889                                break;
2890                        cache = next_block_group(root, cache);
2891                }
2892                if (!cache) {
2893                        if (last == 0)
2894                                break;
2895                        last = 0;
2896                        continue;
2897                }
2898
2899                if (cache->disk_cache_state == BTRFS_DC_SETUP)
2900                        cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
2901                cache->dirty = 0;
2902                last = cache->key.objectid + cache->key.offset;
2903
2904                err = write_one_cache_group(trans, root, path, cache);
2905                BUG_ON(err);
2906                btrfs_put_block_group(cache);
2907        }
2908
2909        while (1) {
2910                /*
2911                 * I don't think this is needed since we're just marking our
2912                 * preallocated extent as written, but just in case it can't
2913                 * hurt.
2914                 */
2915                if (last == 0) {
2916                        err = btrfs_run_delayed_refs(trans, root,
2917                                                     (unsigned long)-1);
2918                        BUG_ON(err);
2919                }
2920
2921                cache = btrfs_lookup_first_block_group(root->fs_info, last);
2922                while (cache) {
2923                        /*
2924                         * Really this shouldn't happen, but it could if we
2925                         * couldn't write the entire preallocated extent and
2926                         * splitting the extent resulted in a new block.
2927                         */
2928                        if (cache->dirty) {
2929                                btrfs_put_block_group(cache);
2930                                goto again;
2931                        }
2932                        if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2933                                break;
2934                        cache = next_block_group(root, cache);
2935                }
2936                if (!cache) {
2937                        if (last == 0)
2938                                break;
2939                        last = 0;
2940                        continue;
2941                }
2942
2943                btrfs_write_out_cache(root, trans, cache, path);
2944
2945                /*
2946                 * If we didn't have an error then the cache state is still
2947                 * NEED_WRITE, so we can set it to WRITTEN.
2948                 */
2949                if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2950                        cache->disk_cache_state = BTRFS_DC_WRITTEN;
2951                last = cache->key.objectid + cache->key.offset;
2952                btrfs_put_block_group(cache);
2953        }
2954
2955        btrfs_free_path(path);
2956        return 0;
2957}
2958
2959int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
2960{
2961        struct btrfs_block_group_cache *block_group;
2962        int readonly = 0;
2963
2964        block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
2965        if (!block_group || block_group->ro)
2966                readonly = 1;
2967        if (block_group)
2968                btrfs_put_block_group(block_group);
2969        return readonly;
2970}
2971
2972static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2973                             u64 total_bytes, u64 bytes_used,
2974                             struct btrfs_space_info **space_info)
2975{
2976        struct btrfs_space_info *found;
2977        int i;
2978        int factor;
2979
2980        if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2981                     BTRFS_BLOCK_GROUP_RAID10))
2982                factor = 2;
2983        else
2984                factor = 1;
2985
2986        found = __find_space_info(info, flags);
2987        if (found) {
2988                spin_lock(&found->lock);
2989                found->total_bytes += total_bytes;
2990                found->disk_total += total_bytes * factor;
2991                found->bytes_used += bytes_used;
2992                found->disk_used += bytes_used * factor;
2993                found->full = 0;
2994                spin_unlock(&found->lock);
2995                *space_info = found;
2996                return 0;
2997        }
2998        found = kzalloc(sizeof(*found), GFP_NOFS);
2999        if (!found)
3000                return -ENOMEM;
3001
3002        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3003                INIT_LIST_HEAD(&found->block_groups[i]);
3004        init_rwsem(&found->groups_sem);
3005        spin_lock_init(&found->lock);
3006        found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
3007                                BTRFS_BLOCK_GROUP_SYSTEM |
3008                                BTRFS_BLOCK_GROUP_METADATA);
3009        found->total_bytes = total_bytes;
3010        found->disk_total = total_bytes * factor;
3011        found->bytes_used = bytes_used;
3012        found->disk_used = bytes_used * factor;
3013        found->bytes_pinned = 0;
3014        found->bytes_reserved = 0;
3015        found->bytes_readonly = 0;
3016        found->bytes_may_use = 0;
3017        found->full = 0;
3018        found->force_alloc = 0;
3019        *space_info = found;
3020        list_add_rcu(&found->list, &info->space_info);
3021        atomic_set(&found->caching_threads, 0);
3022        return 0;
3023}
3024
3025static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3026{
3027        u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
3028                                   BTRFS_BLOCK_GROUP_RAID1 |
3029                                   BTRFS_BLOCK_GROUP_RAID10 |
3030                                   BTRFS_BLOCK_GROUP_DUP);
3031        if (extra_flags) {
3032                if (flags & BTRFS_BLOCK_GROUP_DATA)
3033                        fs_info->avail_data_alloc_bits |= extra_flags;
3034                if (flags & BTRFS_BLOCK_GROUP_METADATA)
3035                        fs_info->avail_metadata_alloc_bits |= extra_flags;
3036                if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3037                        fs_info->avail_system_alloc_bits |= extra_flags;
3038        }
3039}
3040
3041u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3042{
3043        /*
3044         * we add in the count of missing devices because we want
3045         * to make sure that any RAID levels on a degraded FS
3046         * continue to be honored.
3047         */
3048        u64 num_devices = root->fs_info->fs_devices->rw_devices +
3049                root->fs_info->fs_devices->missing_devices;
3050
3051        if (num_devices == 1)
3052                flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
3053        if (num_devices < 4)
3054                flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3055
3056        if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
3057            (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3058                      BTRFS_BLOCK_GROUP_RAID10))) {
3059                flags &= ~BTRFS_BLOCK_GROUP_DUP;
3060        }
3061
3062        if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3063            (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3064                flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3065        }
3066
3067        if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
3068            ((flags & BTRFS_BLOCK_GROUP_RAID1) |
3069             (flags & BTRFS_BLOCK_GROUP_RAID10) |
3070             (flags & BTRFS_BLOCK_GROUP_DUP)))
3071                flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3072        return flags;
3073}
3074
3075static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3076{
3077        if (flags & BTRFS_BLOCK_GROUP_DATA)
3078                flags |= root->fs_info->avail_data_alloc_bits &
3079                         root->fs_info->data_alloc_profile;
3080        else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3081                flags |= root->fs_info->avail_system_alloc_bits &
3082                         root->fs_info->system_alloc_profile;
3083        else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3084                flags |= root->fs_info->avail_metadata_alloc_bits &
3085                         root->fs_info->metadata_alloc_profile;
3086        return btrfs_reduce_alloc_profile(root, flags);
3087}
3088
3089u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3090{
3091        u64 flags;
3092
3093        if (data)
3094                flags = BTRFS_BLOCK_GROUP_DATA;
3095        else if (root == root->fs_info->chunk_root)
3096                flags = BTRFS_BLOCK_GROUP_SYSTEM;
3097        else
3098                flags = BTRFS_BLOCK_GROUP_METADATA;
3099
3100        return get_alloc_profile(root, flags);
3101}
3102
3103void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
3104{
3105        BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
3106                                                       BTRFS_BLOCK_GROUP_DATA);
3107}
3108
3109/*
3110 * This will check the space that the inode allocates from to make sure we have
3111 * enough space for bytes.
3112 */
3113int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3114{
3115        struct btrfs_space_info *data_sinfo;
3116        struct btrfs_root *root = BTRFS_I(inode)->root;
3117        u64 used;
3118        int ret = 0, committed = 0, alloc_chunk = 1;
3119
3120        /* make sure bytes are sectorsize aligned */
3121        bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3122
3123        if (root == root->fs_info->tree_root) {
3124                alloc_chunk = 0;
3125                committed = 1;
3126        }
3127
3128        data_sinfo = BTRFS_I(inode)->space_info;
3129        if (!data_sinfo)
3130                goto alloc;
3131
3132again:
3133        /* make sure we have enough space to handle the data first */
3134        spin_lock(&data_sinfo->lock);
3135        used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3136                data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3137                data_sinfo->bytes_may_use;
3138
3139        if (used + bytes > data_sinfo->total_bytes) {
3140                struct btrfs_trans_handle *trans;
3141
3142                /*
3143                 * if we don't have enough free bytes in this space then we need
3144                 * to alloc a new chunk.
3145                 */
3146                if (!data_sinfo->full && alloc_chunk) {
3147                        u64 alloc_target;
3148
3149                        data_sinfo->force_alloc = 1;
3150                        spin_unlock(&data_sinfo->lock);
3151alloc:
3152                        alloc_target = btrfs_get_alloc_profile(root, 1);
3153                        trans = btrfs_join_transaction(root, 1);
3154                        if (IS_ERR(trans))
3155                                return PTR_ERR(trans);
3156
3157                        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3158                                             bytes + 2 * 1024 * 1024,
3159                                             alloc_target, 0);
3160                        btrfs_end_transaction(trans, root);
3161                        if (ret < 0) {
3162                                if (ret != -ENOSPC)
3163                                        return ret;
3164                                else
3165                                        goto commit_trans;
3166                        }
3167
3168                        if (!data_sinfo) {
3169                                btrfs_set_inode_space_info(root, inode);
3170                                data_sinfo = BTRFS_I(inode)->space_info;
3171                        }
3172                        goto again;
3173                }
3174                spin_unlock(&data_sinfo->lock);
3175
3176                /* commit the current transaction and try again */
3177commit_trans:
3178                if (!committed && !root->fs_info->open_ioctl_trans) {
3179                        committed = 1;
3180                        trans = btrfs_join_transaction(root, 1);
3181                        if (IS_ERR(trans))
3182                                return PTR_ERR(trans);
3183                        ret = btrfs_commit_transaction(trans, root);
3184                        if (ret)
3185                                return ret;
3186                        goto again;
3187                }
3188
3189#if 0 /* I hope we never need this code again, just in case */
3190                printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
3191                       "%llu bytes_reserved, " "%llu bytes_pinned, "
3192                       "%llu bytes_readonly, %llu may use %llu total\n",
3193                       (unsigned long long)bytes,
3194                       (unsigned long long)data_sinfo->bytes_used,
3195                       (unsigned long long)data_sinfo->bytes_reserved,
3196                       (unsigned long long)data_sinfo->bytes_pinned,
3197                       (unsigned long long)data_sinfo->bytes_readonly,
3198                       (unsigned long long)data_sinfo->bytes_may_use,
3199                       (unsigned long long)data_sinfo->total_bytes);
3200#endif
3201                return -ENOSPC;
3202        }
3203        data_sinfo->bytes_may_use += bytes;
3204        BTRFS_I(inode)->reserved_bytes += bytes;
3205        spin_unlock(&data_sinfo->lock);
3206
3207        return 0;
3208}
3209
3210/*
3211 * called when we are clearing an delalloc extent from the
3212 * inode's io_tree or there was an error for whatever reason
3213 * after calling btrfs_check_data_free_space
3214 */
3215void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3216{
3217        struct btrfs_root *root = BTRFS_I(inode)->root;
3218        struct btrfs_space_info *data_sinfo;
3219
3220        /* make sure bytes are sectorsize aligned */
3221        bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3222
3223        data_sinfo = BTRFS_I(inode)->space_info;
3224        spin_lock(&data_sinfo->lock);
3225        data_sinfo->bytes_may_use -= bytes;
3226        BTRFS_I(inode)->reserved_bytes -= bytes;
3227        spin_unlock(&data_sinfo->lock);
3228}
3229
3230static void force_metadata_allocation(struct btrfs_fs_info *info)
3231{
3232        struct list_head *head = &info->space_info;
3233        struct btrfs_space_info *found;
3234
3235        rcu_read_lock();
3236        list_for_each_entry_rcu(found, head, list) {
3237                if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3238                        found->force_alloc = 1;
3239        }
3240        rcu_read_unlock();
3241}
3242
3243static int should_alloc_chunk(struct btrfs_root *root,
3244                              struct btrfs_space_info *sinfo, u64 alloc_bytes)
3245{
3246        u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3247        u64 thresh;
3248
3249        if (sinfo->bytes_used + sinfo->bytes_reserved +
3250            alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3251                return 0;
3252
3253        if (sinfo->bytes_used + sinfo->bytes_reserved +
3254            alloc_bytes < div_factor(num_bytes, 8))
3255                return 0;
3256
3257        thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
3258        thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
3259
3260        if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
3261                return 0;
3262
3263        return 1;
3264}
3265
3266static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3267                          struct btrfs_root *extent_root, u64 alloc_bytes,
3268                          u64 flags, int force)
3269{
3270        struct btrfs_space_info *space_info;
3271        struct btrfs_fs_info *fs_info = extent_root->fs_info;
3272        int ret = 0;
3273
3274        mutex_lock(&fs_info->chunk_mutex);
3275
3276        flags = btrfs_reduce_alloc_profile(extent_root, flags);
3277
3278        space_info = __find_space_info(extent_root->fs_info, flags);
3279        if (!space_info) {
3280                ret = update_space_info(extent_root->fs_info, flags,
3281                                        0, 0, &space_info);
3282                BUG_ON(ret);
3283        }
3284        BUG_ON(!space_info);
3285
3286        spin_lock(&space_info->lock);
3287        if (space_info->force_alloc)
3288                force = 1;
3289        if (space_info->full) {
3290                spin_unlock(&space_info->lock);
3291                goto out;
3292        }
3293
3294        if (!force && !should_alloc_chunk(extent_root, space_info,
3295                                          alloc_bytes)) {
3296                spin_unlock(&space_info->lock);
3297                goto out;
3298        }
3299        spin_unlock(&space_info->lock);
3300
3301        /*
3302         * If we have mixed data/metadata chunks we want to make sure we keep
3303         * allocating mixed chunks instead of individual chunks.
3304         */
3305        if (btrfs_mixed_space_info(space_info))
3306                flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3307
3308        /*
3309         * if we're doing a data chunk, go ahead and make sure that
3310         * we keep a reasonable number of metadata chunks allocated in the
3311         * FS as well.
3312         */
3313        if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3314                fs_info->data_chunk_allocations++;
3315                if (!(fs_info->data_chunk_allocations %
3316                      fs_info->metadata_ratio))
3317                        force_metadata_allocation(fs_info);
3318        }
3319
3320        ret = btrfs_alloc_chunk(trans, extent_root, flags);
3321        spin_lock(&space_info->lock);
3322        if (ret)
3323                space_info->full = 1;
3324        else
3325                ret = 1;
3326        space_info->force_alloc = 0;
3327        spin_unlock(&space_info->lock);
3328out:
3329        mutex_unlock(&extent_root->fs_info->chunk_mutex);
3330        return ret;
3331}
3332
3333/*
3334 * shrink metadata reservation for delalloc
3335 */
3336static int shrink_delalloc(struct btrfs_trans_handle *trans,
3337                           struct btrfs_root *root, u64 to_reclaim, int sync)
3338{
3339        struct btrfs_block_rsv *block_rsv;
3340        struct btrfs_space_info *space_info;
3341        u64 reserved;
3342        u64 max_reclaim;
3343        u64 reclaimed = 0;
3344        long time_left;
3345        int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3346        int loops = 0;
3347        unsigned long progress;
3348
3349        block_rsv = &root->fs_info->delalloc_block_rsv;
3350        space_info = block_rsv->space_info;
3351
3352        smp_mb();
3353        reserved = space_info->bytes_reserved;
3354        progress = space_info->reservation_progress;
3355
3356        if (reserved == 0)
3357                return 0;
3358
3359        max_reclaim = min(reserved, to_reclaim);
3360
3361        while (loops < 1024) {
3362                /* have the flusher threads jump in and do some IO */
3363                smp_mb();
3364                nr_pages = min_t(unsigned long, nr_pages,
3365                       root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3366                writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
3367
3368                spin_lock(&space_info->lock);
3369                if (reserved > space_info->bytes_reserved)
3370                        reclaimed += reserved - space_info->bytes_reserved;
3371                reserved = space_info->bytes_reserved;
3372                spin_unlock(&space_info->lock);
3373
3374                loops++;
3375
3376                if (reserved == 0 || reclaimed >= max_reclaim)
3377                        break;
3378
3379                if (trans && trans->transaction->blocked)
3380                        return -EAGAIN;
3381
3382                time_left = schedule_timeout_interruptible(1);
3383
3384                /* We were interrupted, exit */
3385                if (time_left)
3386                        break;
3387
3388                /* we've kicked the IO a few times, if anything has been freed,
3389                 * exit.  There is no sense in looping here for a long time
3390                 * when we really need to commit the transaction, or there are
3391                 * just too many writers without enough free space
3392                 */
3393
3394                if (loops > 3) {
3395                        smp_mb();
3396                        if (progress != space_info->reservation_progress)
3397                                break;
3398                }
3399
3400        }
3401        return reclaimed >= to_reclaim;
3402}
3403
3404/*
3405 * Retries tells us how many times we've called reserve_metadata_bytes.  The
3406 * idea is if this is the first call (retries == 0) then we will add to our
3407 * reserved count if we can't make the allocation in order to hold our place
3408 * while we go and try and free up space.  That way for retries > 1 we don't try
3409 * and add space, we just check to see if the amount of unused space is >= the
3410 * total space, meaning that our reservation is valid.
3411 *
3412 * However if we don't intend to retry this reservation, pass -1 as retries so
3413 * that it short circuits this logic.
3414 */
3415static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
3416                                  struct btrfs_root *root,
3417                                  struct btrfs_block_rsv *block_rsv,
3418                                  u64 orig_bytes, int flush)
3419{
3420        struct btrfs_space_info *space_info = block_rsv->space_info;
3421        u64 unused;
3422        u64 num_bytes = orig_bytes;
3423        int retries = 0;
3424        int ret = 0;
3425        bool reserved = false;
3426        bool committed = false;
3427
3428again:
3429        ret = -ENOSPC;
3430        if (reserved)
3431                num_bytes = 0;
3432
3433        spin_lock(&space_info->lock);
3434        unused = space_info->bytes_used + space_info->bytes_reserved +
3435                 space_info->bytes_pinned + space_info->bytes_readonly +
3436                 space_info->bytes_may_use;
3437
3438        /*
3439         * The idea here is that we've not already over-reserved the block group
3440         * then we can go ahead and save our reservation first and then start
3441         * flushing if we need to.  Otherwise if we've already overcommitted
3442         * lets start flushing stuff first and then come back and try to make
3443         * our reservation.
3444         */
3445        if (unused <= space_info->total_bytes) {
3446                unused = space_info->total_bytes - unused;
3447                if (unused >= num_bytes) {
3448                        if (!reserved)
3449                                space_info->bytes_reserved += orig_bytes;
3450                        ret = 0;
3451                } else {
3452                        /*
3453                         * Ok set num_bytes to orig_bytes since we aren't
3454                         * overocmmitted, this way we only try and reclaim what
3455                         * we need.
3456                         */
3457                        num_bytes = orig_bytes;
3458                }
3459        } else {
3460                /*
3461                 * Ok we're over committed, set num_bytes to the overcommitted
3462                 * amount plus the amount of bytes that we need for this
3463                 * reservation.
3464                 */
3465                num_bytes = unused - space_info->total_bytes +
3466                        (orig_bytes * (retries + 1));
3467        }
3468
3469        /*
3470         * Couldn't make our reservation, save our place so while we're trying
3471         * to reclaim space we can actually use it instead of somebody else
3472         * stealing it from us.
3473         */
3474        if (ret && !reserved) {
3475                space_info->bytes_reserved += orig_bytes;
3476                reserved = true;
3477        }
3478
3479        spin_unlock(&space_info->lock);
3480
3481        if (!ret)
3482                return 0;
3483
3484        if (!flush)
3485                goto out;
3486
3487        /*
3488         * We do synchronous shrinking since we don't actually unreserve
3489         * metadata until after the IO is completed.
3490         */
3491        ret = shrink_delalloc(trans, root, num_bytes, 1);
3492        if (ret > 0)
3493                return 0;
3494        else if (ret < 0)
3495                goto out;
3496
3497        /*
3498         * So if we were overcommitted it's possible that somebody else flushed
3499         * out enough space and we simply didn't have enough space to reclaim,
3500         * so go back around and try again.
3501         */
3502        if (retries < 2) {
3503                retries++;
3504                goto again;
3505        }
3506
3507        spin_lock(&space_info->lock);
3508        /*
3509         * Not enough space to be reclaimed, don't bother committing the
3510         * transaction.
3511         */
3512        if (space_info->bytes_pinned < orig_bytes)
3513                ret = -ENOSPC;
3514        spin_unlock(&space_info->lock);
3515        if (ret)
3516                goto out;
3517
3518        ret = -EAGAIN;
3519        if (trans || committed)
3520                goto out;
3521
3522        ret = -ENOSPC;
3523        trans = btrfs_join_transaction(root, 1);
3524        if (IS_ERR(trans))
3525                goto out;
3526        ret = btrfs_commit_transaction(trans, root);
3527        if (!ret) {
3528                trans = NULL;
3529                committed = true;
3530                goto again;
3531        }
3532
3533out:
3534        if (reserved) {
3535                spin_lock(&space_info->lock);
3536                space_info->bytes_reserved -= orig_bytes;
3537                spin_unlock(&space_info->lock);
3538        }
3539
3540        return ret;
3541}
3542
3543static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3544                                             struct btrfs_root *root)
3545{
3546        struct btrfs_block_rsv *block_rsv;
3547        if (root->ref_cows)
3548                block_rsv = trans->block_rsv;
3549        else
3550                block_rsv = root->block_rsv;
3551
3552        if (!block_rsv)
3553                block_rsv = &root->fs_info->empty_block_rsv;
3554
3555        return block_rsv;
3556}
3557
3558static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3559                               u64 num_bytes)
3560{
3561        int ret = -ENOSPC;
3562        spin_lock(&block_rsv->lock);
3563        if (block_rsv->reserved >= num_bytes) {
3564                block_rsv->reserved -= num_bytes;
3565                if (block_rsv->reserved < block_rsv->size)
3566                        block_rsv->full = 0;
3567                ret = 0;
3568        }
3569        spin_unlock(&block_rsv->lock);
3570        return ret;
3571}
3572
3573static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3574                                u64 num_bytes, int update_size)
3575{
3576        spin_lock(&block_rsv->lock);
3577        block_rsv->reserved += num_bytes;
3578        if (update_size)
3579                block_rsv->size += num_bytes;
3580        else if (block_rsv->reserved >= block_rsv->size)
3581                block_rsv->full = 1;
3582        spin_unlock(&block_rsv->lock);
3583}
3584
3585void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3586                             struct btrfs_block_rsv *dest, u64 num_bytes)
3587{
3588        struct btrfs_space_info *space_info = block_rsv->space_info;
3589
3590        spin_lock(&block_rsv->lock);
3591        if (num_bytes == (u64)-1)
3592                num_bytes = block_rsv->size;
3593        block_rsv->size -= num_bytes;
3594        if (block_rsv->reserved >= block_rsv->size) {
3595                num_bytes = block_rsv->reserved - block_rsv->size;
3596                block_rsv->reserved = block_rsv->size;
3597                block_rsv->full = 1;
3598        } else {
3599                num_bytes = 0;
3600        }
3601        spin_unlock(&block_rsv->lock);
3602
3603        if (num_bytes > 0) {
3604                if (dest) {
3605                        spin_lock(&dest->lock);
3606                        if (!dest->full) {
3607                                u64 bytes_to_add;
3608
3609                                bytes_to_add = dest->size - dest->reserved;
3610                                bytes_to_add = min(num_bytes, bytes_to_add);
3611                                dest->reserved += bytes_to_add;
3612                                if (dest->reserved >= dest->size)
3613                                        dest->full = 1;
3614                                num_bytes -= bytes_to_add;
3615                        }
3616                        spin_unlock(&dest->lock);
3617                }
3618                if (num_bytes) {
3619                        spin_lock(&space_info->lock);
3620                        space_info->bytes_reserved -= num_bytes;
3621                        space_info->reservation_progress++;
3622                        spin_unlock(&space_info->lock);
3623                }
3624        }
3625}
3626
3627static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3628                                   struct btrfs_block_rsv *dst, u64 num_bytes)
3629{
3630        int ret;
3631
3632        ret = block_rsv_use_bytes(src, num_bytes);
3633        if (ret)
3634                return ret;
3635
3636        block_rsv_add_bytes(dst, num_bytes, 1);
3637        return 0;
3638}
3639
3640void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3641{
3642        memset(rsv, 0, sizeof(*rsv));
3643        spin_lock_init(&rsv->lock);
3644        atomic_set(&rsv->usage, 1);
3645        rsv->priority = 6;
3646        INIT_LIST_HEAD(&rsv->list);
3647}
3648
3649struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3650{
3651        struct btrfs_block_rsv *block_rsv;
3652        struct btrfs_fs_info *fs_info = root->fs_info;
3653
3654        block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3655        if (!block_rsv)
3656                return NULL;
3657
3658        btrfs_init_block_rsv(block_rsv);
3659        block_rsv->space_info = __find_space_info(fs_info,
3660                                                  BTRFS_BLOCK_GROUP_METADATA);
3661        return block_rsv;
3662}
3663
3664void btrfs_free_block_rsv(struct btrfs_root *root,
3665                          struct btrfs_block_rsv *rsv)
3666{
3667        if (rsv && atomic_dec_and_test(&rsv->usage)) {
3668                btrfs_block_rsv_release(root, rsv, (u64)-1);
3669                if (!rsv->durable)
3670                        kfree(rsv);
3671        }
3672}
3673
3674/*
3675 * make the block_rsv struct be able to capture freed space.
3676 * the captured space will re-add to the the block_rsv struct
3677 * after transaction commit
3678 */
3679void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3680                                 struct btrfs_block_rsv *block_rsv)
3681{
3682        block_rsv->durable = 1;
3683        mutex_lock(&fs_info->durable_block_rsv_mutex);
3684        list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3685        mutex_unlock(&fs_info->durable_block_rsv_mutex);
3686}
3687
3688int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3689                        struct btrfs_root *root,
3690                        struct btrfs_block_rsv *block_rsv,
3691                        u64 num_bytes)
3692{
3693        int ret;
3694
3695        if (num_bytes == 0)
3696                return 0;
3697
3698        ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
3699        if (!ret) {
3700                block_rsv_add_bytes(block_rsv, num_bytes, 1);
3701                return 0;
3702        }
3703
3704        return ret;
3705}
3706
3707int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3708                          struct btrfs_root *root,
3709                          struct btrfs_block_rsv *block_rsv,
3710                          u64 min_reserved, int min_factor)
3711{
3712        u64 num_bytes = 0;
3713        int commit_trans = 0;
3714        int ret = -ENOSPC;
3715
3716        if (!block_rsv)
3717                return 0;
3718
3719        spin_lock(&block_rsv->lock);
3720        if (min_factor > 0)
3721                num_bytes = div_factor(block_rsv->size, min_factor);
3722        if (min_reserved > num_bytes)
3723                num_bytes = min_reserved;
3724
3725        if (block_rsv->reserved >= num_bytes) {
3726                ret = 0;
3727        } else {
3728                num_bytes -= block_rsv->reserved;
3729                if (block_rsv->durable &&
3730                    block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3731                        commit_trans = 1;
3732        }
3733        spin_unlock(&block_rsv->lock);
3734        if (!ret)
3735                return 0;
3736
3737        if (block_rsv->refill_used) {
3738                ret = reserve_metadata_bytes(trans, root, block_rsv,
3739                                             num_bytes, 0);
3740                if (!ret) {
3741                        block_rsv_add_bytes(block_rsv, num_bytes, 0);
3742                        return 0;
3743                }
3744        }
3745
3746        if (commit_trans) {
3747                if (trans)
3748                        return -EAGAIN;
3749
3750                trans = btrfs_join_transaction(root, 1);
3751                BUG_ON(IS_ERR(trans));
3752                ret = btrfs_commit_transaction(trans, root);
3753                return 0;
3754        }
3755
3756        return -ENOSPC;
3757}
3758
3759int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3760                            struct btrfs_block_rsv *dst_rsv,
3761                            u64 num_bytes)
3762{
3763        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3764}
3765
3766void btrfs_block_rsv_release(struct btrfs_root *root,
3767                             struct btrfs_block_rsv *block_rsv,
3768                             u64 num_bytes)
3769{
3770        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3771        if (global_rsv->full || global_rsv == block_rsv ||
3772            block_rsv->space_info != global_rsv->space_info)
3773                global_rsv = NULL;
3774        block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3775}
3776
3777/*
3778 * helper to calculate size of global block reservation.
3779 * the desired value is sum of space used by extent tree,
3780 * checksum tree and root tree
3781 */
3782static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3783{
3784        struct btrfs_space_info *sinfo;
3785        u64 num_bytes;
3786        u64 meta_used;
3787        u64 data_used;
3788        int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
3789#if 0
3790        /*
3791         * per tree used space accounting can be inaccuracy, so we
3792         * can't rely on it.
3793         */
3794        spin_lock(&fs_info->extent_root->accounting_lock);
3795        num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
3796        spin_unlock(&fs_info->extent_root->accounting_lock);
3797
3798        spin_lock(&fs_info->csum_root->accounting_lock);
3799        num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
3800        spin_unlock(&fs_info->csum_root->accounting_lock);
3801
3802        spin_lock(&fs_info->tree_root->accounting_lock);
3803        num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
3804        spin_unlock(&fs_info->tree_root->accounting_lock);
3805#endif
3806        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3807        spin_lock(&sinfo->lock);
3808        data_used = sinfo->bytes_used;
3809        spin_unlock(&sinfo->lock);
3810
3811        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3812        spin_lock(&sinfo->lock);
3813        if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
3814                data_used = 0;
3815        meta_used = sinfo->bytes_used;
3816        spin_unlock(&sinfo->lock);
3817
3818        num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
3819                    csum_size * 2;
3820        num_bytes += div64_u64(data_used + meta_used, 50);
3821
3822        if (num_bytes * 3 > meta_used)
3823                num_bytes = div64_u64(meta_used, 3);
3824
3825        return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
3826}
3827
3828static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3829{
3830        struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3831        struct btrfs_space_info *sinfo = block_rsv->space_info;
3832        u64 num_bytes;
3833
3834        num_bytes = calc_global_metadata_size(fs_info);
3835
3836        spin_lock(&block_rsv->lock);
3837        spin_lock(&sinfo->lock);
3838
3839        block_rsv->size = num_bytes;
3840
3841        num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3842                    sinfo->bytes_reserved + sinfo->bytes_readonly +
3843                    sinfo->bytes_may_use;
3844
3845        if (sinfo->total_bytes > num_bytes) {
3846                num_bytes = sinfo->total_bytes - num_bytes;
3847                block_rsv->reserved += num_bytes;
3848                sinfo->bytes_reserved += num_bytes;
3849        }
3850
3851        if (block_rsv->reserved >= block_rsv->size) {
3852                num_bytes = block_rsv->reserved - block_rsv->size;
3853                sinfo->bytes_reserved -= num_bytes;
3854                sinfo->reservation_progress++;
3855                block_rsv->reserved = block_rsv->size;
3856                block_rsv->full = 1;
3857        }
3858#if 0
3859        printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
3860                block_rsv->size, block_rsv->reserved);
3861#endif
3862        spin_unlock(&sinfo->lock);
3863        spin_unlock(&block_rsv->lock);
3864}
3865
3866static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3867{
3868        struct btrfs_space_info *space_info;
3869
3870        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3871        fs_info->chunk_block_rsv.space_info = space_info;
3872        fs_info->chunk_block_rsv.priority = 10;
3873
3874        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3875        fs_info->global_block_rsv.space_info = space_info;
3876        fs_info->global_block_rsv.priority = 10;
3877        fs_info->global_block_rsv.refill_used = 1;
3878        fs_info->delalloc_block_rsv.space_info = space_info;
3879        fs_info->trans_block_rsv.space_info = space_info;
3880        fs_info->empty_block_rsv.space_info = space_info;
3881        fs_info->empty_block_rsv.priority = 10;
3882
3883        fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3884        fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
3885        fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
3886        fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3887        fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3888
3889        btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3890
3891        btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3892
3893        update_global_block_rsv(fs_info);
3894}
3895
3896static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3897{
3898        block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
3899        WARN_ON(fs_info->delalloc_block_rsv.size > 0);
3900        WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
3901        WARN_ON(fs_info->trans_block_rsv.size > 0);
3902        WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3903        WARN_ON(fs_info->chunk_block_rsv.size > 0);
3904        WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3905}
3906
3907static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
3908{
3909        return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
3910                3 * num_items;
3911}
3912
3913int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3914                                 struct btrfs_root *root,
3915                                 int num_items)
3916{
3917        u64 num_bytes;
3918        int ret;
3919
3920        if (num_items == 0 || root->fs_info->chunk_root == root)
3921                return 0;
3922
3923        num_bytes = calc_trans_metadata_size(root, num_items);
3924        ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3925                                  num_bytes);
3926        if (!ret) {
3927                trans->bytes_reserved += num_bytes;
3928                trans->block_rsv = &root->fs_info->trans_block_rsv;
3929        }
3930        return ret;
3931}
3932
3933void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3934                                  struct btrfs_root *root)
3935{
3936        if (!trans->bytes_reserved)
3937                return;
3938
3939        BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
3940        btrfs_block_rsv_release(root, trans->block_rsv,
3941                                trans->bytes_reserved);
3942        trans->bytes_reserved = 0;
3943}
3944
3945int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
3946                                  struct inode *inode)
3947{
3948        struct btrfs_root *root = BTRFS_I(inode)->root;
3949        struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3950        struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
3951
3952        /*
3953         * one for deleting orphan item, one for updating inode and
3954         * two for calling btrfs_truncate_inode_items.
3955         *
3956         * btrfs_truncate_inode_items is a delete operation, it frees
3957         * more space than it uses in most cases. So two units of
3958         * metadata space should be enough for calling it many times.
3959         * If all of the metadata space is used, we can commit
3960         * transaction and use space it freed.
3961         */
3962        u64 num_bytes = calc_trans_metadata_size(root, 4);
3963        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3964}
3965
3966void btrfs_orphan_release_metadata(struct inode *inode)
3967{
3968        struct btrfs_root *root = BTRFS_I(inode)->root;
3969        u64 num_bytes = calc_trans_metadata_size(root, 4);
3970        btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
3971}
3972
3973int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3974                                struct btrfs_pending_snapshot *pending)
3975{
3976        struct btrfs_root *root = pending->root;
3977        struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3978        struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
3979        /*
3980         * two for root back/forward refs, two for directory entries
3981         * and one for root of the snapshot.
3982         */
3983        u64 num_bytes = calc_trans_metadata_size(root, 5);
3984        dst_rsv->space_info = src_rsv->space_info;
3985        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3986}
3987
3988static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
3989{
3990        return num_bytes >>= 3;
3991}
3992
3993int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3994{
3995        struct btrfs_root *root = BTRFS_I(inode)->root;
3996        struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3997        u64 to_reserve;
3998        int nr_extents;
3999        int ret;
4000
4001        if (btrfs_transaction_in_commit(root->fs_info))
4002                schedule_timeout(1);
4003
4004        num_bytes = ALIGN(num_bytes, root->sectorsize);
4005
4006        spin_lock(&BTRFS_I(inode)->accounting_lock);
4007        nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
4008        if (nr_extents > BTRFS_I(inode)->reserved_extents) {
4009                nr_extents -= BTRFS_I(inode)->reserved_extents;
4010                to_reserve = calc_trans_metadata_size(root, nr_extents);
4011        } else {
4012                nr_extents = 0;
4013                to_reserve = 0;
4014        }
4015        spin_unlock(&BTRFS_I(inode)->accounting_lock);
4016        to_reserve += calc_csum_metadata_size(inode, num_bytes);
4017        ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
4018        if (ret)
4019                return ret;
4020
4021        spin_lock(&BTRFS_I(inode)->accounting_lock);
4022        BTRFS_I(inode)->reserved_extents += nr_extents;
4023        atomic_inc(&BTRFS_I(inode)->outstanding_extents);
4024        spin_unlock(&BTRFS_I(inode)->accounting_lock);
4025
4026        block_rsv_add_bytes(block_rsv, to_reserve, 1);
4027
4028        if (block_rsv->size > 512 * 1024 * 1024)
4029                shrink_delalloc(NULL, root, to_reserve, 0);
4030
4031        return 0;
4032}
4033
4034void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4035{
4036        struct btrfs_root *root = BTRFS_I(inode)->root;
4037        u64 to_free;
4038        int nr_extents;
4039
4040        num_bytes = ALIGN(num_bytes, root->sectorsize);
4041        atomic_dec(&BTRFS_I(inode)->outstanding_extents);
4042        WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
4043
4044        spin_lock(&BTRFS_I(inode)->accounting_lock);
4045        nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
4046        if (nr_extents < BTRFS_I(inode)->reserved_extents) {
4047                nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
4048                BTRFS_I(inode)->reserved_extents -= nr_extents;
4049        } else {
4050                nr_extents = 0;
4051        }
4052        spin_unlock(&BTRFS_I(inode)->accounting_lock);
4053
4054        to_free = calc_csum_metadata_size(inode, num_bytes);
4055        if (nr_extents > 0)
4056                to_free += calc_trans_metadata_size(root, nr_extents);
4057
4058        btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4059                                to_free);
4060}
4061
4062int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4063{
4064        int ret;
4065
4066        ret = btrfs_check_data_free_space(inode, num_bytes);
4067        if (ret)
4068                return ret;
4069
4070        ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
4071        if (ret) {
4072                btrfs_free_reserved_data_space(inode, num_bytes);
4073                return ret;
4074        }
4075
4076        return 0;
4077}
4078
4079void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4080{
4081        btrfs_delalloc_release_metadata(inode, num_bytes);
4082        btrfs_free_reserved_data_space(inode, num_bytes);
4083}
4084
4085static int update_block_group(struct btrfs_trans_handle *trans,
4086                              struct btrfs_root *root,
4087                              u64 bytenr, u64 num_bytes, int alloc)
4088{
4089        struct btrfs_block_group_cache *cache = NULL;
4090        struct btrfs_fs_info *info = root->fs_info;
4091        u64 total = num_bytes;
4092        u64 old_val;
4093        u64 byte_in_group;
4094        int factor;
4095
4096        /* block accounting for super block */
4097        spin_lock(&info->delalloc_lock);
4098        old_val = btrfs_super_bytes_used(&info->super_copy);
4099        if (alloc)
4100                old_val += num_bytes;
4101        else
4102                old_val -= num_bytes;
4103        btrfs_set_super_bytes_used(&info->super_copy, old_val);
4104        spin_unlock(&info->delalloc_lock);
4105
4106        while (total) {
4107                cache = btrfs_lookup_block_group(info, bytenr);
4108                if (!cache)
4109                        return -1;
4110                if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
4111                                    BTRFS_BLOCK_GROUP_RAID1 |
4112                                    BTRFS_BLOCK_GROUP_RAID10))
4113                        factor = 2;
4114                else
4115                        factor = 1;
4116                /*
4117                 * If this block group has free space cache written out, we
4118                 * need to make sure to load it if we are removing space.  This
4119                 * is because we need the unpinning stage to actually add the
4120                 * space back to the block group, otherwise we will leak space.
4121                 */
4122                if (!alloc && cache->cached == BTRFS_CACHE_NO)
4123                        cache_block_group(cache, trans, NULL, 1);
4124
4125                byte_in_group = bytenr - cache->key.objectid;
4126                WARN_ON(byte_in_group > cache->key.offset);
4127
4128                spin_lock(&cache->space_info->lock);
4129                spin_lock(&cache->lock);
4130
4131                if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
4132                    cache->disk_cache_state < BTRFS_DC_CLEAR)
4133                        cache->disk_cache_state = BTRFS_DC_CLEAR;
4134
4135                cache->dirty = 1;
4136                old_val = btrfs_block_group_used(&cache->item);
4137                num_bytes = min(total, cache->key.offset - byte_in_group);
4138                if (alloc) {
4139                        old_val += num_bytes;
4140                        btrfs_set_block_group_used(&cache->item, old_val);
4141                        cache->reserved -= num_bytes;
4142                        cache->space_info->bytes_reserved -= num_bytes;
4143                        cache->space_info->reservation_progress++;
4144                        cache->space_info->bytes_used += num_bytes;
4145                        cache->space_info->disk_used += num_bytes * factor;
4146                        spin_unlock(&cache->lock);
4147                        spin_unlock(&cache->space_info->lock);
4148                } else {
4149                        old_val -= num_bytes;
4150                        btrfs_set_block_group_used(&cache->item, old_val);
4151                        cache->pinned += num_bytes;
4152                        cache->space_info->bytes_pinned += num_bytes;
4153                        cache->space_info->bytes_used -= num_bytes;
4154                        cache->space_info->disk_used -= num_bytes * factor;
4155                        spin_unlock(&cache->lock);
4156                        spin_unlock(&cache->space_info->lock);
4157
4158                        set_extent_dirty(info->pinned_extents,
4159                                         bytenr, bytenr + num_bytes - 1,
4160                                         GFP_NOFS | __GFP_NOFAIL);
4161                }
4162                btrfs_put_block_group(cache);
4163                total -= num_bytes;
4164                bytenr += num_bytes;
4165        }
4166        return 0;
4167}
4168
4169static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
4170{
4171        struct btrfs_block_group_cache *cache;
4172        u64 bytenr;
4173
4174        cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
4175        if (!cache)
4176                return 0;
4177
4178        bytenr = cache->key.objectid;
4179        btrfs_put_block_group(cache);
4180
4181        return bytenr;
4182}
4183
4184static int pin_down_extent(struct btrfs_root *root,
4185                           struct btrfs_block_group_cache *cache,
4186                           u64 bytenr, u64 num_bytes, int reserved)
4187{
4188        spin_lock(&cache->space_info->lock);
4189        spin_lock(&cache->lock);
4190        cache->pinned += num_bytes;
4191        cache->space_info->bytes_pinned += num_bytes;
4192        if (reserved) {
4193                cache->reserved -= num_bytes;
4194                cache->space_info->bytes_reserved -= num_bytes;
4195                cache->space_info->reservation_progress++;
4196        }
4197        spin_unlock(&cache->lock);
4198        spin_unlock(&cache->space_info->lock);
4199
4200        set_extent_dirty(root->fs_info->pinned_extents, bytenr,
4201                         bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
4202        return 0;
4203}
4204
4205/*
4206 * this function must be called within transaction
4207 */
4208int btrfs_pin_extent(struct btrfs_root *root,
4209                     u64 bytenr, u64 num_bytes, int reserved)
4210{
4211        struct btrfs_block_group_cache *cache;
4212
4213        cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4214        BUG_ON(!cache);
4215
4216        pin_down_extent(root, cache, bytenr, num_bytes, reserved);
4217
4218        btrfs_put_block_group(cache);
4219        return 0;
4220}
4221
4222/*
4223 * update size of reserved extents. this function may return -EAGAIN
4224 * if 'reserve' is true or 'sinfo' is false.
4225 */
4226static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
4227                                 u64 num_bytes, int reserve, int sinfo)
4228{
4229        int ret = 0;
4230        if (sinfo) {
4231                struct btrfs_space_info *space_info = cache->space_info;
4232                spin_lock(&space_info->lock);
4233                spin_lock(&cache->lock);
4234                if (reserve) {
4235                        if (cache->ro) {
4236                                ret = -EAGAIN;
4237                        } else {
4238                                cache->reserved += num_bytes;
4239                                space_info->bytes_reserved += num_bytes;
4240                        }
4241                } else {
4242                        if (cache->ro)
4243                                space_info->bytes_readonly += num_bytes;
4244                        cache->reserved -= num_bytes;
4245                        space_info->bytes_reserved -= num_bytes;
4246                        space_info->reservation_progress++;
4247                }
4248                spin_unlock(&cache->lock);
4249                spin_unlock(&space_info->lock);
4250        } else {
4251                spin_lock(&cache->lock);
4252                if (cache->ro) {
4253                        ret = -EAGAIN;
4254                } else {
4255                        if (reserve)
4256                                cache->reserved += num_bytes;
4257                        else
4258                                cache->reserved -= num_bytes;
4259                }
4260                spin_unlock(&cache->lock);
4261        }
4262        return ret;
4263}
4264
4265int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
4266                                struct btrfs_root *root)
4267{
4268        struct btrfs_fs_info *fs_info = root->fs_info;
4269        struct btrfs_caching_control *next;
4270        struct btrfs_caching_control *caching_ctl;
4271        struct btrfs_block_group_cache *cache;
4272
4273        down_write(&fs_info->extent_commit_sem);
4274
4275        list_for_each_entry_safe(caching_ctl, next,
4276                                 &fs_info->caching_block_groups, list) {
4277                cache = caching_ctl->block_group;
4278                if (block_group_cache_done(cache)) {
4279                        cache->last_byte_to_unpin = (u64)-1;
4280                        list_del_init(&caching_ctl->list);
4281                        put_caching_control(caching_ctl);
4282                } else {
4283                        cache->last_byte_to_unpin = caching_ctl->progress;
4284                }
4285        }
4286
4287        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4288                fs_info->pinned_extents = &fs_info->freed_extents[1];
4289        else
4290                fs_info->pinned_extents = &fs_info->freed_extents[0];
4291
4292        up_write(&fs_info->extent_commit_sem);
4293
4294        update_global_block_rsv(fs_info);
4295        return 0;
4296}
4297
4298static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4299{
4300        struct btrfs_fs_info *fs_info = root->fs_info;
4301        struct btrfs_block_group_cache *cache = NULL;
4302        u64 len;
4303
4304        while (start <= end) {
4305                if (!cache ||
4306                    start >= cache->key.objectid + cache->key.offset) {
4307                        if (cache)
4308                                btrfs_put_block_group(cache);
4309                        cache = btrfs_lookup_block_group(fs_info, start);
4310                        BUG_ON(!cache);
4311                }
4312
4313                len = cache->key.objectid + cache->key.offset - start;
4314                len = min(len, end + 1 - start);
4315
4316                if (start < cache->last_byte_to_unpin) {
4317                        len = min(len, cache->last_byte_to_unpin - start);
4318                        btrfs_add_free_space(cache, start, len);
4319                }
4320
4321                start += len;
4322
4323                spin_lock(&cache->space_info->lock);
4324                spin_lock(&cache->lock);
4325                cache->pinned -= len;
4326                cache->space_info->bytes_pinned -= len;
4327                if (cache->ro) {
4328                        cache->space_info->bytes_readonly += len;
4329                } else if (cache->reserved_pinned > 0) {
4330                        len = min(len, cache->reserved_pinned);
4331                        cache->reserved_pinned -= len;
4332                        cache->space_info->bytes_reserved += len;
4333                }
4334                spin_unlock(&cache->lock);
4335                spin_unlock(&cache->space_info->lock);
4336        }
4337
4338        if (cache)
4339                btrfs_put_block_group(cache);
4340        return 0;
4341}
4342
4343int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4344                               struct btrfs_root *root)
4345{
4346        struct btrfs_fs_info *fs_info = root->fs_info;
4347        struct extent_io_tree *unpin;
4348        struct btrfs_block_rsv *block_rsv;
4349        struct btrfs_block_rsv *next_rsv;
4350        u64 start;
4351        u64 end;
4352        int idx;
4353        int ret;
4354
4355        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4356                unpin = &fs_info->freed_extents[1];
4357        else
4358                unpin = &fs_info->freed_extents[0];
4359
4360        while (1) {
4361                ret = find_first_extent_bit(unpin, 0, &start, &end,
4362                                            EXTENT_DIRTY);
4363                if (ret)
4364                        break;
4365
4366                ret = btrfs_discard_extent(root, start, end + 1 - start);
4367
4368                clear_extent_dirty(unpin, start, end, GFP_NOFS);
4369                unpin_extent_range(root, start, end);
4370                cond_resched();
4371        }
4372
4373        mutex_lock(&fs_info->durable_block_rsv_mutex);
4374        list_for_each_entry_safe(block_rsv, next_rsv,
4375                                 &fs_info->durable_block_rsv_list, list) {
4376
4377                idx = trans->transid & 0x1;
4378                if (block_rsv->freed[idx] > 0) {
4379                        block_rsv_add_bytes(block_rsv,
4380                                            block_rsv->freed[idx], 0);
4381                        block_rsv->freed[idx] = 0;
4382                }
4383                if (atomic_read(&block_rsv->usage) == 0) {
4384                        btrfs_block_rsv_release(root, block_rsv, (u64)-1);
4385
4386                        if (block_rsv->freed[0] == 0 &&
4387                            block_rsv->freed[1] == 0) {
4388                                list_del_init(&block_rsv->list);
4389                                kfree(block_rsv);
4390                        }
4391                } else {
4392                        btrfs_block_rsv_release(root, block_rsv, 0);
4393                }
4394        }
4395        mutex_unlock(&fs_info->durable_block_rsv_mutex);
4396
4397        return 0;
4398}
4399
4400static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
4401                                struct btrfs_root *root,
4402                                u64 bytenr, u64 num_bytes, u64 parent,
4403                                u64 root_objectid, u64 owner_objectid,
4404                                u64 owner_offset, int refs_to_drop,
4405                                struct btrfs_delayed_extent_op *extent_op)
4406{
4407        struct btrfs_key key;
4408        struct btrfs_path *path;
4409        struct btrfs_fs_info *info = root->fs_info;
4410        struct btrfs_root *extent_root = info->extent_root;
4411        struct extent_buffer *leaf;
4412        struct btrfs_extent_item *ei;
4413        struct btrfs_extent_inline_ref *iref;
4414        int ret;
4415        int is_data;
4416        int extent_slot = 0;
4417        int found_extent = 0;
4418        int num_to_del = 1;
4419        u32 item_size;
4420        u64 refs;
4421
4422        path = btrfs_alloc_path();
4423        if (!path)
4424                return -ENOMEM;
4425
4426        path->reada = 1;
4427        path->leave_spinning = 1;
4428
4429        is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
4430        BUG_ON(!is_data && refs_to_drop != 1);
4431
4432        ret = lookup_extent_backref(trans, extent_root, path, &iref,
4433                                    bytenr, num_bytes, parent,
4434                                    root_objectid, owner_objectid,
4435                                    owner_offset);
4436        if (ret == 0) {
4437                extent_slot = path->slots[0];
4438                while (extent_slot >= 0) {
4439                        btrfs_item_key_to_cpu(path->nodes[0], &key,
4440                                              extent_slot);
4441                        if (key.objectid != bytenr)
4442                                break;
4443                        if (key.type == BTRFS_EXTENT_ITEM_KEY &&
4444                            key.offset == num_bytes) {
4445                                found_extent = 1;
4446                                break;
4447                        }
4448                        if (path->slots[0] - extent_slot > 5)
4449                                break;
4450                        extent_slot--;
4451                }
4452#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4453                item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
4454                if (found_extent && item_size < sizeof(*ei))
4455                        found_extent = 0;
4456#endif
4457                if (!found_extent) {
4458                        BUG_ON(iref);
4459                        ret = remove_extent_backref(trans, extent_root, path,
4460                                                    NULL, refs_to_drop,
4461                                                    is_data);
4462                        BUG_ON(ret);
4463                        btrfs_release_path(extent_root, path);
4464                        path->leave_spinning = 1;
4465
4466                        key.objectid = bytenr;
4467                        key.type = BTRFS_EXTENT_ITEM_KEY;
4468                        key.offset = num_bytes;
4469
4470                        ret = btrfs_search_slot(trans, extent_root,
4471                                                &key, path, -1, 1);
4472                        if (ret) {
4473                                printk(KERN_ERR "umm, got %d back from search"
4474                                       ", was looking for %llu\n", ret,
4475                                       (unsigned long long)bytenr);
4476                                btrfs_print_leaf(extent_root, path->nodes[0]);
4477                        }
4478                        BUG_ON(ret);
4479                        extent_slot = path->slots[0];
4480                }
4481        } else {
4482                btrfs_print_leaf(extent_root, path->nodes[0]);
4483                WARN_ON(1);
4484                printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
4485                       "parent %llu root %llu  owner %llu offset %llu\n",
4486                       (unsigned long long)bytenr,
4487                       (unsigned long long)parent,
4488                       (unsigned long long)root_objectid,
4489                       (unsigned long long)owner_objectid,
4490                       (unsigned long long)owner_offset);
4491        }
4492
4493        leaf = path->nodes[0];
4494        item_size = btrfs_item_size_nr(leaf, extent_slot);
4495#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4496        if (item_size < sizeof(*ei)) {
4497                BUG_ON(found_extent || extent_slot != path->slots[0]);
4498                ret = convert_extent_item_v0(trans, extent_root, path,
4499                                             owner_objectid, 0);
4500                BUG_ON(ret < 0);
4501
4502                btrfs_release_path(extent_root, path);
4503                path->leave_spinning = 1;
4504
4505                key.objectid = bytenr;
4506                key.type = BTRFS_EXTENT_ITEM_KEY;
4507                key.offset = num_bytes;
4508
4509                ret = btrfs_search_slot(trans, extent_root, &key, path,
4510                                        -1, 1);
4511                if (ret) {
4512                        printk(KERN_ERR "umm, got %d back from search"
4513                               ", was looking for %llu\n", ret,
4514                               (unsigned long long)bytenr);
4515                        btrfs_print_leaf(extent_root, path->nodes[0]);
4516                }
4517                BUG_ON(ret);
4518                extent_slot = path->slots[0];
4519                leaf = path->nodes[0];
4520                item_size = btrfs_item_size_nr(leaf, extent_slot);
4521        }
4522#endif
4523        BUG_ON(item_size < sizeof(*ei));
4524        ei = btrfs_item_ptr(leaf, extent_slot,
4525                            struct btrfs_extent_item);
4526        if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4527                struct btrfs_tree_block_info *bi;
4528                BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
4529                bi = (struct btrfs_tree_block_info *)(ei + 1);
4530                WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
4531        }
4532
4533        refs = btrfs_extent_refs(leaf, ei);
4534        BUG_ON(refs < refs_to_drop);
4535        refs -= refs_to_drop;
4536
4537        if (refs > 0) {
4538                if (extent_op)
4539                        __run_delayed_extent_op(extent_op, leaf, ei);
4540                /*
4541                 * In the case of inline back ref, reference count will
4542                 * be updated by remove_extent_backref
4543                 */
4544                if (iref) {
4545                        BUG_ON(!found_extent);
4546                } else {
4547                        btrfs_set_extent_refs(leaf, ei, refs);
4548                        btrfs_mark_buffer_dirty(leaf);
4549                }
4550                if (found_extent) {
4551                        ret = remove_extent_backref(trans, extent_root, path,
4552                                                    iref, refs_to_drop,
4553                                                    is_data);
4554                        BUG_ON(ret);
4555                }
4556        } else {
4557                if (found_extent) {
4558                        BUG_ON(is_data && refs_to_drop !=
4559                               extent_data_ref_count(root, path, iref));
4560                        if (iref) {
4561                                BUG_ON(path->slots[0] != extent_slot);
4562                        } else {
4563                                BUG_ON(path->slots[0] != extent_slot + 1);
4564                                path->slots[0] = extent_slot;
4565                                num_to_del = 2;
4566                        }
4567                }
4568
4569                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
4570                                      num_to_del);
4571                BUG_ON(ret);
4572                btrfs_release_path(extent_root, path);
4573
4574                if (is_data) {
4575                        ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
4576                        BUG_ON(ret);
4577                } else {
4578                        invalidate_mapping_pages(info->btree_inode->i_mapping,
4579                             bytenr >> PAGE_CACHE_SHIFT,
4580                             (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
4581                }
4582
4583                ret = update_block_group(trans, root, bytenr, num_bytes, 0);
4584                BUG_ON(ret);
4585        }
4586        btrfs_free_path(path);
4587        return ret;
4588}
4589
4590/*
4591 * when we free an block, it is possible (and likely) that we free the last
4592 * delayed ref for that extent as well.  This searches the delayed ref tree for
4593 * a given extent, and if there are no other delayed refs to be processed, it
4594 * removes it from the tree.
4595 */
4596static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4597                                      struct btrfs_root *root, u64 bytenr)
4598{
4599        struct btrfs_delayed_ref_head *head;
4600        struct btrfs_delayed_ref_root *delayed_refs;
4601        struct btrfs_delayed_ref_node *ref;
4602        struct rb_node *node;
4603        int ret = 0;
4604
4605        delayed_refs = &trans->transaction->delayed_refs;
4606        spin_lock(&delayed_refs->lock);
4607        head = btrfs_find_delayed_ref_head(trans, bytenr);
4608        if (!head)
4609                goto out;
4610
4611        node = rb_prev(&head->node.rb_node);
4612        if (!node)
4613                goto out;
4614
4615        ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
4616
4617        /* there are still entries for this ref, we can't drop it */
4618        if (ref->bytenr == bytenr)
4619                goto out;
4620
4621        if (head->extent_op) {
4622                if (!head->must_insert_reserved)
4623                        goto out;
4624                kfree(head->extent_op);
4625                head->extent_op = NULL;
4626        }
4627
4628        /*
4629         * waiting for the lock here would deadlock.  If someone else has it
4630         * locked they are already in the process of dropping it anyway
4631         */
4632        if (!mutex_trylock(&head->mutex))
4633                goto out;
4634
4635        /*
4636         * at this point we have a head with no other entries.  Go
4637         * ahead and process it.
4638         */
4639        head->node.in_tree = 0;
4640        rb_erase(&head->node.rb_node, &delayed_refs->root);
4641
4642        delayed_refs->num_entries--;
4643
4644        /*
4645         * we don't take a ref on the node because we're removing it from the
4646         * tree, so we just steal the ref the tree was holding.
4647         */
4648        delayed_refs->num_heads--;
4649        if (list_empty(&head->cluster))
4650                delayed_refs->num_heads_ready--;
4651
4652        list_del_init(&head->cluster);
4653        spin_unlock(&delayed_refs->lock);
4654
4655        BUG_ON(head->extent_op);
4656        if (head->must_insert_reserved)
4657                ret = 1;
4658
4659        mutex_unlock(&head->mutex);
4660        btrfs_put_delayed_ref(&head->node);
4661        return ret;
4662out:
4663        spin_unlock(&delayed_refs->lock);
4664        return 0;
4665}
4666
4667void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4668                           struct btrfs_root *root,
4669                           struct extent_buffer *buf,
4670                           u64 parent, int last_ref)
4671{
4672        struct btrfs_block_rsv *block_rsv;
4673        struct btrfs_block_group_cache *cache = NULL;
4674        int ret;
4675
4676        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4677                ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
4678                                                parent, root->root_key.objectid,
4679                                                btrfs_header_level(buf),
4680                                                BTRFS_DROP_DELAYED_REF, NULL);
4681                BUG_ON(ret);
4682        }
4683
4684        if (!last_ref)
4685                return;
4686
4687        block_rsv = get_block_rsv(trans, root);
4688        cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4689        if (block_rsv->space_info != cache->space_info)
4690                goto out;
4691
4692        if (btrfs_header_generation(buf) == trans->transid) {
4693                if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4694                        ret = check_ref_cleanup(trans, root, buf->start);
4695                        if (!ret)
4696                                goto pin;
4697                }
4698
4699                if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4700                        pin_down_extent(root, cache, buf->start, buf->len, 1);
4701                        goto pin;
4702                }
4703
4704                WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4705
4706                btrfs_add_free_space(cache, buf->start, buf->len);
4707                ret = update_reserved_bytes(cache, buf->len, 0, 0);
4708                if (ret == -EAGAIN) {
4709                        /* block group became read-only */
4710                        update_reserved_bytes(cache, buf->len, 0, 1);
4711                        goto out;
4712                }
4713
4714                ret = 1;
4715                spin_lock(&block_rsv->lock);
4716                if (block_rsv->reserved < block_rsv->size) {
4717                        block_rsv->reserved += buf->len;
4718                        ret = 0;
4719                }
4720                spin_unlock(&block_rsv->lock);
4721
4722                if (ret) {
4723                        spin_lock(&cache->space_info->lock);
4724                        cache->space_info->bytes_reserved -= buf->len;
4725                        cache->space_info->reservation_progress++;
4726                        spin_unlock(&cache->space_info->lock);
4727                }
4728                goto out;
4729        }
4730pin:
4731        if (block_rsv->durable && !cache->ro) {
4732                ret = 0;
4733                spin_lock(&cache->lock);
4734                if (!cache->ro) {
4735                        cache->reserved_pinned += buf->len;
4736                        ret = 1;
4737                }
4738                spin_unlock(&cache->lock);
4739
4740                if (ret) {
4741                        spin_lock(&block_rsv->lock);
4742                        block_rsv->freed[trans->transid & 0x1] += buf->len;
4743                        spin_unlock(&block_rsv->lock);
4744                }
4745        }
4746out:
4747        btrfs_put_block_group(cache);
4748}
4749
4750int btrfs_free_extent(struct btrfs_trans_handle *trans,
4751                      struct btrfs_root *root,
4752                      u64 bytenr, u64 num_bytes, u64 parent,
4753                      u64 root_objectid, u64 owner, u64 offset)
4754{
4755        int ret;
4756
4757        /*
4758         * tree log blocks never actually go into the extent allocation
4759         * tree, just update pinning info and exit early.
4760         */
4761        if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
4762                WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
4763                /* unlocks the pinned mutex */
4764                btrfs_pin_extent(root, bytenr, num_bytes, 1);
4765                ret = 0;
4766        } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
4767                ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
4768                                        parent, root_objectid, (int)owner,
4769                                        BTRFS_DROP_DELAYED_REF, NULL);
4770                BUG_ON(ret);
4771        } else {
4772                ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4773                                        parent, root_objectid, owner,
4774                                        offset, BTRFS_DROP_DELAYED_REF, NULL);
4775                BUG_ON(ret);
4776        }
4777        return ret;
4778}
4779
4780static u64 stripe_align(struct btrfs_root *root, u64 val)
4781{
4782        u64 mask = ((u64)root->stripesize - 1);
4783        u64 ret = (val + mask) & ~mask;
4784        return ret;
4785}
4786
4787/*
4788 * when we wait for progress in the block group caching, its because
4789 * our allocation attempt failed at least once.  So, we must sleep
4790 * and let some progress happen before we try again.
4791 *
4792 * This function will sleep at least once waiting for new free space to
4793 * show up, and then it will check the block group free space numbers
4794 * for our min num_bytes.  Another option is to have it go ahead
4795 * and look in the rbtree for a free extent of a given size, but this
4796 * is a good start.
4797 */
4798static noinline int
4799wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
4800                                u64 num_bytes)
4801{
4802        struct btrfs_caching_control *caching_ctl;
4803        DEFINE_WAIT(wait);
4804
4805        caching_ctl = get_caching_control(cache);
4806        if (!caching_ctl)
4807                return 0;
4808
4809        wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
4810                   (cache->free_space >= num_bytes));
4811
4812        put_caching_control(caching_ctl);
4813        return 0;
4814}
4815
4816static noinline int
4817wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4818{
4819        struct btrfs_caching_control *caching_ctl;
4820        DEFINE_WAIT(wait);
4821
4822        caching_ctl = get_caching_control(cache);
4823        if (!caching_ctl)
4824                return 0;
4825
4826        wait_event(caching_ctl->wait, block_group_cache_done(cache));
4827
4828        put_caching_control(caching_ctl);
4829        return 0;
4830}
4831
4832static int get_block_group_index(struct btrfs_block_group_cache *cache)
4833{
4834        int index;
4835        if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
4836                index = 0;
4837        else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
4838                index = 1;
4839        else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
4840                index = 2;
4841        else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
4842                index = 3;
4843        else
4844                index = 4;
4845        return index;
4846}
4847
4848enum btrfs_loop_type {
4849        LOOP_FIND_IDEAL = 0,
4850        LOOP_CACHING_NOWAIT = 1,
4851        LOOP_CACHING_WAIT = 2,
4852        LOOP_ALLOC_CHUNK = 3,
4853        LOOP_NO_EMPTY_SIZE = 4,
4854};
4855
4856/*
4857 * walks the btree of allocated extents and find a hole of a given size.
4858 * The key ins is changed to record the hole:
4859 * ins->objectid == block start
4860 * ins->flags = BTRFS_EXTENT_ITEM_KEY
4861 * ins->offset == number of blocks
4862 * Any available blocks before search_start are skipped.
4863 */
4864static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4865                                     struct btrfs_root *orig_root,
4866                                     u64 num_bytes, u64 empty_size,
4867                                     u64 search_start, u64 search_end,
4868                                     u64 hint_byte, struct btrfs_key *ins,
4869                                     int data)
4870{
4871        int ret = 0;
4872        struct btrfs_root *root = orig_root->fs_info->extent_root;
4873        struct btrfs_free_cluster *last_ptr = NULL;
4874        struct btrfs_block_group_cache *block_group = NULL;
4875        int empty_cluster = 2 * 1024 * 1024;
4876        int allowed_chunk_alloc = 0;
4877        int done_chunk_alloc = 0;
4878        struct btrfs_space_info *space_info;
4879        int last_ptr_loop = 0;
4880        int loop = 0;
4881        int index = 0;
4882        bool found_uncached_bg = false;
4883        bool failed_cluster_refill = false;
4884        bool failed_alloc = false;
4885        bool use_cluster = true;
4886        u64 ideal_cache_percent = 0;
4887        u64 ideal_cache_offset = 0;
4888
4889        WARN_ON(num_bytes < root->sectorsize);
4890        btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
4891        ins->objectid = 0;
4892        ins->offset = 0;
4893
4894        space_info = __find_space_info(root->fs_info, data);
4895        if (!space_info) {
4896                printk(KERN_ERR "No space info for %d\n", data);
4897                return -ENOSPC;
4898        }
4899
4900        /*
4901         * If the space info is for both data and metadata it means we have a
4902         * small filesystem and we can't use the clustering stuff.
4903         */
4904        if (btrfs_mixed_space_info(space_info))
4905                use_cluster = false;
4906
4907        if (orig_root->ref_cows || empty_size)
4908                allowed_chunk_alloc = 1;
4909
4910        if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
4911                last_ptr = &root->fs_info->meta_alloc_cluster;
4912                if (!btrfs_test_opt(root, SSD))
4913                        empty_cluster = 64 * 1024;
4914        }
4915
4916        if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
4917            btrfs_test_opt(root, SSD)) {
4918                last_ptr = &root->fs_info->data_alloc_cluster;
4919        }
4920
4921        if (last_ptr) {
4922                spin_lock(&last_ptr->lock);
4923                if (last_ptr->block_group)
4924                        hint_byte = last_ptr->window_start;
4925                spin_unlock(&last_ptr->lock);
4926        }
4927
4928        search_start = max(search_start, first_logical_byte(root, 0));
4929        search_start = max(search_start, hint_byte);
4930
4931        if (!last_ptr)
4932                empty_cluster = 0;
4933
4934        if (search_start == hint_byte) {
4935ideal_cache:
4936                block_group = btrfs_lookup_block_group(root->fs_info,
4937                                                       search_start);
4938                /*
4939                 * we don't want to use the block group if it doesn't match our
4940                 * allocation bits, or if its not cached.
4941                 *
4942                 * However if we are re-searching with an ideal block group
4943                 * picked out then we don't care that the block group is cached.
4944                 */
4945                if (block_group && block_group_bits(block_group, data) &&
4946                    (block_group->cached != BTRFS_CACHE_NO ||
4947                     search_start == ideal_cache_offset)) {
4948                        down_read(&space_info->groups_sem);
4949                        if (list_empty(&block_group->list) ||
4950                            block_group->ro) {
4951                                /*
4952                                 * someone is removing this block group,
4953                                 * we can't jump into the have_block_group
4954                                 * target because our list pointers are not
4955                                 * valid
4956                                 */
4957                                btrfs_put_block_group(block_group);
4958                                up_read(&space_info->groups_sem);
4959                        } else {
4960                                index = get_block_group_index(block_group);
4961                                goto have_block_group;
4962                        }
4963                } else if (block_group) {
4964                        btrfs_put_block_group(block_group);
4965                }
4966        }
4967search:
4968        down_read(&space_info->groups_sem);
4969        list_for_each_entry(block_group, &space_info->block_groups[index],
4970                            list) {
4971                u64 offset;
4972                int cached;
4973
4974                btrfs_get_block_group(block_group);
4975                search_start = block_group->key.objectid;
4976
4977                /*
4978                 * this can happen if we end up cycling through all the
4979                 * raid types, but we want to make sure we only allocate
4980                 * for the proper type.
4981                 */
4982                if (!block_group_bits(block_group, data)) {
4983                    u64 extra = BTRFS_BLOCK_GROUP_DUP |
4984                                BTRFS_BLOCK_GROUP_RAID1 |
4985                                BTRFS_BLOCK_GROUP_RAID10;
4986
4987                        /*
4988                         * if they asked for extra copies and this block group
4989                         * doesn't provide them, bail.  This does allow us to
4990                         * fill raid0 from raid1.
4991                         */
4992                        if ((data & extra) && !(block_group->flags & extra))
4993                                goto loop;
4994                }
4995
4996have_block_group:
4997                if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4998                        u64 free_percent;
4999
5000                        ret = cache_block_group(block_group, trans,
5001                                                orig_root, 1);
5002                        if (block_group->cached == BTRFS_CACHE_FINISHED)
5003                                goto have_block_group;
5004
5005                        free_percent = btrfs_block_group_used(&block_group->item);
5006                        free_percent *= 100;
5007                        free_percent = div64_u64(free_percent,
5008                                                 block_group->key.offset);
5009                        free_percent = 100 - free_percent;
5010                        if (free_percent > ideal_cache_percent &&
5011                            likely(!block_group->ro)) {
5012                                ideal_cache_offset = block_group->key.objectid;
5013                                ideal_cache_percent = free_percent;
5014                        }
5015
5016                        /*
5017                         * We only want to start kthread caching if we are at
5018                         * the point where we will wait for caching to make
5019                         * progress, or if our ideal search is over and we've
5020                         * found somebody to start caching.
5021                         */
5022                        if (loop > LOOP_CACHING_NOWAIT ||
5023                            (loop > LOOP_FIND_IDEAL &&
5024                             atomic_read(&space_info->caching_threads) < 2)) {
5025                                ret = cache_block_group(block_group, trans,
5026                                                        orig_root, 0);
5027                                BUG_ON(ret);
5028                        }
5029                        found_uncached_bg = true;
5030
5031                        /*
5032                         * If loop is set for cached only, try the next block
5033                         * group.
5034                         */
5035                        if (loop == LOOP_FIND_IDEAL)
5036                                goto loop;
5037                }
5038
5039                cached = block_group_cache_done(block_group);
5040                if (unlikely(!cached))
5041                        found_uncached_bg = true;
5042
5043                if (unlikely(block_group->ro))
5044                        goto loop;
5045
5046                /*
5047                 * Ok we want to try and use the cluster allocator, so lets look
5048                 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
5049                 * have tried the cluster allocator plenty of times at this
5050                 * point and not have found anything, so we are likely way too
5051                 * fragmented for the clustering stuff to find anything, so lets
5052                 * just skip it and let the allocator find whatever block it can
5053                 * find
5054                 */
5055                if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
5056                        /*
5057                         * the refill lock keeps out other
5058                         * people trying to start a new cluster
5059                         */
5060                        spin_lock(&last_ptr->refill_lock);
5061                        if (last_ptr->block_group &&
5062                            (last_ptr->block_group->ro ||
5063                            !block_group_bits(last_ptr->block_group, data))) {
5064                                offset = 0;
5065                                goto refill_cluster;
5066                        }
5067
5068                        offset = btrfs_alloc_from_cluster(block_group, last_ptr,
5069                                                 num_bytes, search_start);
5070                        if (offset) {
5071                                /* we have a block, we're done */
5072                                spin_unlock(&last_ptr->refill_lock);
5073                                goto checks;
5074                        }
5075
5076                        spin_lock(&last_ptr->lock);
5077                        /*
5078                         * whoops, this cluster doesn't actually point to
5079                         * this block group.  Get a ref on the block
5080                         * group is does point to and try again
5081                         */
5082                        if (!last_ptr_loop && last_ptr->block_group &&
5083                            last_ptr->block_group != block_group) {
5084
5085                                btrfs_put_block_group(block_group);
5086                                block_group = last_ptr->block_group;
5087                                btrfs_get_block_group(block_group);
5088                                spin_unlock(&last_ptr->lock);
5089                                spin_unlock(&last_ptr->refill_lock);
5090
5091                                last_ptr_loop = 1;
5092                                search_start = block_group->key.objectid;
5093                                /*
5094                                 * we know this block group is properly
5095                                 * in the list because
5096                                 * btrfs_remove_block_group, drops the
5097                                 * cluster before it removes the block
5098                                 * group from the list
5099                                 */
5100                                goto have_block_group;
5101                        }
5102                        spin_unlock(&last_ptr->lock);
5103refill_cluster:
5104                        /*
5105                         * this cluster didn't work out, free it and
5106                         * start over
5107                         */
5108                        btrfs_return_cluster_to_free_space(NULL, last_ptr);
5109
5110                        last_ptr_loop = 0;
5111
5112                        /* allocate a cluster in this block group */
5113                        ret = btrfs_find_space_cluster(trans, root,
5114                                               block_group, last_ptr,
5115                                               offset, num_bytes,
5116                                               empty_cluster + empty_size);
5117                        if (ret == 0) {
5118                                /*
5119                                 * now pull our allocation out of this
5120                                 * cluster
5121                                 */
5122                                offset = btrfs_alloc_from_cluster(block_group,
5123                                                  last_ptr, num_bytes,
5124                                                  search_start);
5125                                if (offset) {
5126                                        /* we found one, proceed */
5127                                        spin_unlock(&last_ptr->refill_lock);
5128                                        goto checks;
5129                                }
5130                        } else if (!cached && loop > LOOP_CACHING_NOWAIT
5131                                   && !failed_cluster_refill) {
5132                                spin_unlock(&last_ptr->refill_lock);
5133
5134                                failed_cluster_refill = true;
5135                                wait_block_group_cache_progress(block_group,
5136                                       num_bytes + empty_cluster + empty_size);
5137                                goto have_block_group;
5138                        }
5139
5140                        /*
5141                         * at this point we either didn't find a cluster
5142                         * or we weren't able to allocate a block from our
5143                         * cluster.  Free the cluster we've been trying
5144                         * to use, and go to the next block group
5145                         */
5146                        btrfs_return_cluster_to_free_space(NULL, last_ptr);
5147                        spin_unlock(&last_ptr->refill_lock);
5148                        goto loop;
5149                }
5150
5151                offset = btrfs_find_space_for_alloc(block_group, search_start,
5152                                                    num_bytes, empty_size);
5153                /*
5154                 * If we didn't find a chunk, and we haven't failed on this
5155                 * block group before, and this block group is in the middle of
5156                 * caching and we are ok with waiting, then go ahead and wait
5157                 * for progress to be made, and set failed_alloc to true.
5158                 *
5159                 * If failed_alloc is true then we've already waited on this
5160                 * block group once and should move on to the next block group.
5161                 */
5162                if (!offset && !failed_alloc && !cached &&
5163                    loop > LOOP_CACHING_NOWAIT) {
5164                        wait_block_group_cache_progress(block_group,
5165                                                num_bytes + empty_size);
5166                        failed_alloc = true;
5167                        goto have_block_group;
5168                } else if (!offset) {
5169                        goto loop;
5170                }
5171checks:
5172                search_start = stripe_align(root, offset);
5173                /* move on to the next group */
5174                if (search_start + num_bytes >= search_end) {
5175                        btrfs_add_free_space(block_group, offset, num_bytes);
5176                        goto loop;
5177                }
5178
5179                /* move on to the next group */
5180                if (search_start + num_bytes >
5181                    block_group->key.objectid + block_group->key.offset) {
5182                        btrfs_add_free_space(block_group, offset, num_bytes);
5183                        goto loop;
5184                }
5185
5186                ins->objectid = search_start;
5187                ins->offset = num_bytes;
5188
5189                if (offset < search_start)
5190                        btrfs_add_free_space(block_group, offset,
5191                                             search_start - offset);
5192                BUG_ON(offset > search_start);
5193
5194                ret = update_reserved_bytes(block_group, num_bytes, 1,
5195                                            (data & BTRFS_BLOCK_GROUP_DATA));
5196                if (ret == -EAGAIN) {
5197                        btrfs_add_free_space(block_group, offset, num_bytes);
5198                        goto loop;
5199                }
5200
5201                /* we are all good, lets return */
5202                ins->objectid = search_start;
5203                ins->offset = num_bytes;
5204
5205                if (offset < search_start)
5206                        btrfs_add_free_space(block_group, offset,
5207                                             search_start - offset);
5208                BUG_ON(offset > search_start);
5209                break;
5210loop:
5211                failed_cluster_refill = false;
5212                failed_alloc = false;
5213                BUG_ON(index != get_block_group_index(block_group));
5214                btrfs_put_block_group(block_group);
5215        }
5216        up_read(&space_info->groups_sem);
5217
5218        if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5219                goto search;
5220
5221        /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
5222         *                      for them to make caching progress.  Also
5223         *                      determine the best possible bg to cache
5224         * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
5225         *                      caching kthreads as we move along
5226         * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
5227         * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
5228         * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
5229         *                      again
5230         */
5231        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
5232            (found_uncached_bg || empty_size || empty_cluster ||
5233             allowed_chunk_alloc)) {
5234                index = 0;
5235                if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
5236                        found_uncached_bg = false;
5237                        loop++;
5238                        if (!ideal_cache_percent &&
5239                            atomic_read(&space_info->caching_threads))
5240                                goto search;
5241
5242                        /*
5243                         * 1 of the following 2 things have happened so far
5244                         *
5245                         * 1) We found an ideal block group for caching that
5246                         * is mostly full and will cache quickly, so we might
5247                         * as well wait for it.
5248                         *
5249                         * 2) We searched for cached only and we didn't find
5250                         * anything, and we didn't start any caching kthreads
5251                         * either, so chances are we will loop through and
5252                         * start a couple caching kthreads, and then come back
5253                         * around and just wait for them.  This will be slower
5254                         * because we will have 2 caching kthreads reading at
5255                         * the same time when we could have just started one
5256                         * and waited for it to get far enough to give us an
5257                         * allocation, so go ahead and go to the wait caching
5258                         * loop.
5259                         */
5260                        loop = LOOP_CACHING_WAIT;
5261                        search_start = ideal_cache_offset;
5262                        ideal_cache_percent = 0;
5263                        goto ideal_cache;
5264                } else if (loop == LOOP_FIND_IDEAL) {
5265                        /*
5266                         * Didn't find a uncached bg, wait on anything we find
5267                         * next.
5268                         */
5269                        loop = LOOP_CACHING_WAIT;
5270                        goto search;
5271                }
5272
5273                if (loop < LOOP_CACHING_WAIT) {
5274                        loop++;
5275                        goto search;
5276                }
5277
5278                if (loop == LOOP_ALLOC_CHUNK) {
5279                        empty_size = 0;
5280                        empty_cluster = 0;
5281                }
5282
5283                if (allowed_chunk_alloc) {
5284                        ret = do_chunk_alloc(trans, root, num_bytes +
5285                                             2 * 1024 * 1024, data, 1);
5286                        allowed_chunk_alloc = 0;
5287                        done_chunk_alloc = 1;
5288                } else if (!done_chunk_alloc) {
5289                        space_info->force_alloc = 1;
5290                }
5291
5292                if (loop < LOOP_NO_EMPTY_SIZE) {
5293                        loop++;
5294                        goto search;
5295                }
5296                ret = -ENOSPC;
5297        } else if (!ins->objectid) {
5298                ret = -ENOSPC;
5299        }
5300
5301        /* we found what we needed */
5302        if (ins->objectid) {
5303                if (!(data & BTRFS_BLOCK_GROUP_DATA))
5304                        trans->block_group = block_group->key.objectid;
5305
5306                btrfs_put_block_group(block_group);
5307                ret = 0;
5308        }
5309
5310        return ret;
5311}
5312
5313static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5314                            int dump_block_groups)
5315{
5316        struct btrfs_block_group_cache *cache;
5317        int index = 0;
5318
5319        spin_lock(&info->lock);
5320        printk(KERN_INFO "space_info has %llu free, is %sfull\n",
5321               (unsigned long long)(info->total_bytes - info->bytes_used -
5322                                    info->bytes_pinned - info->bytes_reserved -
5323                                    info->bytes_readonly),
5324               (info->full) ? "" : "not ");
5325        printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
5326               "reserved=%llu, may_use=%llu, readonly=%llu\n",
5327               (unsigned long long)info->total_bytes,
5328               (unsigned long long)info->bytes_used,
5329               (unsigned long long)info->bytes_pinned,
5330               (unsigned long long)info->bytes_reserved,
5331               (unsigned long long)info->bytes_may_use,
5332               (unsigned long long)info->bytes_readonly);
5333        spin_unlock(&info->lock);
5334
5335        if (!dump_block_groups)
5336                return;
5337
5338        down_read(&info->groups_sem);
5339again:
5340        list_for_each_entry(cache, &info->block_groups[index], list) {
5341                spin_lock(&cache->lock);
5342                printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
5343                       "%llu pinned %llu reserved\n",
5344                       (unsigned long long)cache->key.objectid,
5345                       (unsigned long long)cache->key.offset,
5346                       (unsigned long long)btrfs_block_group_used(&cache->item),
5347                       (unsigned long long)cache->pinned,
5348                       (unsigned long long)cache->reserved);
5349                btrfs_dump_free_space(cache, bytes);
5350                spin_unlock(&cache->lock);
5351        }
5352        if (++index < BTRFS_NR_RAID_TYPES)
5353                goto again;
5354        up_read(&info->groups_sem);
5355}
5356
5357int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
5358                         struct btrfs_root *root,
5359                         u64 num_bytes, u64 min_alloc_size,
5360                         u64 empty_size, u64 hint_byte,
5361                         u64 search_end, struct btrfs_key *ins,
5362                         u64 data)
5363{
5364        int ret;
5365        u64 search_start = 0;
5366
5367        data = btrfs_get_alloc_profile(root, data);
5368again:
5369        /*
5370         * the only place that sets empty_size is btrfs_realloc_node, which
5371         * is not called recursively on allocations
5372         */
5373        if (empty_size || root->ref_cows)
5374                ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5375                                     num_bytes + 2 * 1024 * 1024, data, 0);
5376
5377        WARN_ON(num_bytes < root->sectorsize);
5378        ret = find_free_extent(trans, root, num_bytes, empty_size,
5379                               search_start, search_end, hint_byte,
5380                               ins, data);
5381
5382        if (ret == -ENOSPC && num_bytes > min_alloc_size) {
5383                num_bytes = num_bytes >> 1;
5384                num_bytes = num_bytes & ~(root->sectorsize - 1);
5385                num_bytes = max(num_bytes, min_alloc_size);
5386                do_chunk_alloc(trans, root->fs_info->extent_root,
5387                               num_bytes, data, 1);
5388                goto again;
5389        }
5390        if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
5391                struct btrfs_space_info *sinfo;
5392
5393                sinfo = __find_space_info(root->fs_info, data);
5394                printk(KERN_ERR "btrfs allocation failed flags %llu, "
5395                       "wanted %llu\n", (unsigned long long)data,
5396                       (unsigned long long)num_bytes);
5397                dump_space_info(sinfo, num_bytes, 1);
5398        }
5399
5400        return ret;
5401}
5402
5403int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5404{
5405        struct btrfs_block_group_cache *cache;
5406        int ret = 0;
5407
5408        cache = btrfs_lookup_block_group(root->fs_info, start);
5409        if (!cache) {
5410                printk(KERN_ERR "Unable to find block group for %llu\n",
5411                       (unsigned long long)start);
5412                return -ENOSPC;
5413        }
5414
5415        ret = btrfs_discard_extent(root, start, len);
5416
5417        btrfs_add_free_space(cache, start, len);
5418        update_reserved_bytes(cache, len, 0, 1);
5419        btrfs_put_block_group(cache);
5420
5421        return ret;
5422}
5423
5424static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5425                                      struct btrfs_root *root,
5426                                      u64 parent, u64 root_objectid,
5427                                      u64 flags, u64 owner, u64 offset,
5428                                      struct btrfs_key *ins, int ref_mod)
5429{
5430        int ret;
5431        struct btrfs_fs_info *fs_info = root->fs_info;
5432        struct btrfs_extent_item *extent_item;
5433        struct btrfs_extent_inline_ref *iref;
5434        struct btrfs_path *path;
5435        struct extent_buffer *leaf;
5436        int type;
5437        u32 size;
5438
5439        if (parent > 0)
5440                type = BTRFS_SHARED_DATA_REF_KEY;
5441        else
5442                type = BTRFS_EXTENT_DATA_REF_KEY;
5443
5444        size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
5445
5446        path = btrfs_alloc_path();
5447        BUG_ON(!path);
5448
5449        path->leave_spinning = 1;
5450        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
5451                                      ins, size);
5452        BUG_ON(ret);
5453
5454        leaf = path->nodes[0];
5455        extent_item = btrfs_item_ptr(leaf, path->slots[0],
5456                                     struct btrfs_extent_item);
5457        btrfs_set_extent_refs(leaf, extent_item, ref_mod);
5458        btrfs_set_extent_generation(leaf, extent_item, trans->transid);
5459        btrfs_set_extent_flags(leaf, extent_item,
5460                               flags | BTRFS_EXTENT_FLAG_DATA);
5461
5462        iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
5463        btrfs_set_extent_inline_ref_type(leaf, iref, type);
5464        if (parent > 0) {
5465                struct btrfs_shared_data_ref *ref;
5466                ref = (struct btrfs_shared_data_ref *)(iref + 1);
5467                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
5468                btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
5469        } else {
5470                struct btrfs_extent_data_ref *ref;
5471                ref = (struct btrfs_extent_data_ref *)(&iref->offset);
5472                btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
5473                btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
5474                btrfs_set_extent_data_ref_offset(leaf, ref, offset);
5475                btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
5476        }
5477
5478        btrfs_mark_buffer_dirty(path->nodes[0]);
5479        btrfs_free_path(path);
5480
5481        ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
5482        if (ret) {
5483                printk(KERN_ERR "btrfs update block group failed for %llu "
5484                       "%llu\n", (unsigned long long)ins->objectid,
5485                       (unsigned long long)ins->offset);
5486                BUG();
5487        }
5488        return ret;
5489}
5490
5491static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
5492                                     struct btrfs_root *root,
5493                                     u64 parent, u64 root_objectid,
5494                                     u64 flags, struct btrfs_disk_key *key,
5495                                     int level, struct btrfs_key *ins)
5496{
5497        int ret;
5498        struct btrfs_fs_info *fs_info = root->fs_info;
5499        struct btrfs_extent_item *extent_item;
5500        struct btrfs_tree_block_info *block_info;
5501        struct btrfs_extent_inline_ref *iref;
5502        struct btrfs_path *path;
5503        struct extent_buffer *leaf;
5504        u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
5505
5506        path = btrfs_alloc_path();
5507        BUG_ON(!path);
5508
5509        path->leave_spinning = 1;
5510        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
5511                                      ins, size);
5512        BUG_ON(ret);
5513
5514        leaf = path->nodes[0];
5515        extent_item = btrfs_item_ptr(leaf, path->slots[0],
5516                                     struct btrfs_extent_item);
5517        btrfs_set_extent_refs(leaf, extent_item, 1);
5518        btrfs_set_extent_generation(leaf, extent_item, trans->transid);
5519        btrfs_set_extent_flags(leaf, extent_item,
5520                               flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
5521        block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
5522
5523        btrfs_set_tree_block_key(leaf, block_info, key);
5524        btrfs_set_tree_block_level(leaf, block_info, level);
5525
5526        iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
5527        if (parent > 0) {
5528                BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
5529                btrfs_set_extent_inline_ref_type(leaf, iref,
5530                                                 BTRFS_SHARED_BLOCK_REF_KEY);
5531                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
5532        } else {
5533                btrfs_set_extent_inline_ref_type(leaf, iref,
5534                                                 BTRFS_TREE_BLOCK_REF_KEY);
5535                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
5536        }
5537
5538        btrfs_mark_buffer_dirty(leaf);
5539        btrfs_free_path(path);
5540
5541        ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
5542        if (ret) {
5543                printk(KERN_ERR "btrfs update block group failed for %llu "
5544                       "%llu\n", (unsigned long long)ins->objectid,
5545                       (unsigned long long)ins->offset);
5546                BUG();
5547        }
5548        return ret;
5549}
5550
5551int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5552                                     struct btrfs_root *root,
5553                                     u64 root_objectid, u64 owner,
5554                                     u64 offset, struct btrfs_key *ins)
5555{
5556        int ret;
5557
5558        BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
5559
5560        ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
5561                                         0, root_objectid, owner, offset,
5562                                         BTRFS_ADD_DELAYED_EXTENT, NULL);
5563        return ret;
5564}
5565
5566/*
5567 * this is used by the tree logging recovery code.  It records that
5568 * an extent has been allocated and makes sure to clear the free
5569 * space cache bits as well
5570 */
5571int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5572                                   struct btrfs_root *root,
5573                                   u64 root_objectid, u64 owner, u64 offset,
5574                                   struct btrfs_key *ins)
5575{
5576        int ret;
5577        struct btrfs_block_group_cache *block_group;
5578        struct btrfs_caching_control *caching_ctl;
5579        u64 start = ins->objectid;
5580        u64 num_bytes = ins->offset;
5581
5582        block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
5583        cache_block_group(block_group, trans, NULL, 0);
5584        caching_ctl = get_caching_control(block_group);
5585
5586        if (!caching_ctl) {
5587                BUG_ON(!block_group_cache_done(block_group));
5588                ret = btrfs_remove_free_space(block_group, start, num_bytes);
5589                BUG_ON(ret);
5590        } else {
5591                mutex_lock(&caching_ctl->mutex);
5592
5593                if (start >= caching_ctl->progress) {
5594                        ret = add_excluded_extent(root, start, num_bytes);
5595                        BUG_ON(ret);
5596                } else if (start + num_bytes <= caching_ctl->progress) {
5597                        ret = btrfs_remove_free_space(block_group,
5598                                                      start, num_bytes);
5599                        BUG_ON(ret);
5600                } else {
5601                        num_bytes = caching_ctl->progress - start;
5602                        ret = btrfs_remove_free_space(block_group,
5603                                                      start, num_bytes);
5604                        BUG_ON(ret);
5605
5606                        start = caching_ctl->progress;
5607                        num_bytes = ins->objectid + ins->offset -
5608                                    caching_ctl->progress;
5609                        ret = add_excluded_extent(root, start, num_bytes);
5610                        BUG_ON(ret);
5611                }
5612
5613                mutex_unlock(&caching_ctl->mutex);
5614                put_caching_control(caching_ctl);
5615        }
5616
5617        ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
5618        BUG_ON(ret);
5619        btrfs_put_block_group(block_group);
5620        ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
5621                                         0, owner, offset, ins, 1);
5622        return ret;
5623}
5624
5625struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
5626                                            struct btrfs_root *root,
5627                                            u64 bytenr, u32 blocksize,
5628                                            int level)
5629{
5630        struct extent_buffer *buf;
5631
5632        buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
5633        if (!buf)
5634                return ERR_PTR(-ENOMEM);
5635        btrfs_set_header_generation(buf, trans->transid);
5636        btrfs_set_buffer_lockdep_class(buf, level);
5637        btrfs_tree_lock(buf);
5638        clean_tree_block(trans, root, buf);
5639
5640        btrfs_set_lock_blocking(buf);
5641        btrfs_set_buffer_uptodate(buf);
5642
5643        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
5644                /*
5645                 * we allow two log transactions at a time, use different
5646                 * EXENT bit to differentiate dirty pages.
5647                 */
5648                if (root->log_transid % 2 == 0)
5649                        set_extent_dirty(&root->dirty_log_pages, buf->start,
5650                                        buf->start + buf->len - 1, GFP_NOFS);
5651                else
5652                        set_extent_new(&root->dirty_log_pages, buf->start,
5653                                        buf->start + buf->len - 1, GFP_NOFS);
5654        } else {
5655                set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
5656                         buf->start + buf->len - 1, GFP_NOFS);
5657        }
5658        trans->blocks_used++;
5659        /* this returns a buffer locked for blocking */
5660        return buf;
5661}
5662
5663static struct btrfs_block_rsv *
5664use_block_rsv(struct btrfs_trans_handle *trans,
5665              struct btrfs_root *root, u32 blocksize)
5666{
5667        struct btrfs_block_rsv *block_rsv;
5668        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5669        int ret;
5670
5671        block_rsv = get_block_rsv(trans, root);
5672
5673        if (block_rsv->size == 0) {
5674                ret = reserve_metadata_bytes(trans, root, block_rsv,
5675                                             blocksize, 0);
5676                /*
5677                 * If we couldn't reserve metadata bytes try and use some from
5678                 * the global reserve.
5679                 */
5680                if (ret && block_rsv != global_rsv) {
5681                        ret = block_rsv_use_bytes(global_rsv, blocksize);
5682                        if (!ret)
5683                                return global_rsv;
5684                        return ERR_PTR(ret);
5685                } else if (ret) {
5686                        return ERR_PTR(ret);
5687                }
5688                return block_rsv;
5689        }
5690
5691        ret = block_rsv_use_bytes(block_rsv, blocksize);
5692        if (!ret)
5693                return block_rsv;
5694        if (ret) {
5695                WARN_ON(1);
5696                ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize,
5697                                             0);
5698                if (!ret) {
5699                        spin_lock(&block_rsv->lock);
5700                        block_rsv->size += blocksize;
5701                        spin_unlock(&block_rsv->lock);
5702                        return block_rsv;
5703                } else if (ret && block_rsv != global_rsv) {
5704                        ret = block_rsv_use_bytes(global_rsv, blocksize);
5705                        if (!ret)
5706                                return global_rsv;
5707                }
5708        }
5709
5710        return ERR_PTR(-ENOSPC);
5711}
5712
5713static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
5714{
5715        block_rsv_add_bytes(block_rsv, blocksize, 0);
5716        block_rsv_release_bytes(block_rsv, NULL, 0);
5717}
5718
5719/*
5720 * finds a free extent and does all the dirty work required for allocation
5721 * returns the key for the extent through ins, and a tree buffer for
5722 * the first block of the extent through buf.
5723 *
5724 * returns the tree buffer or NULL.
5725 */
5726struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
5727                                        struct btrfs_root *root, u32 blocksize,
5728                                        u64 parent, u64 root_objectid,
5729                                        struct btrfs_disk_key *key, int level,
5730                                        u64 hint, u64 empty_size)
5731{
5732        struct btrfs_key ins;
5733        struct btrfs_block_rsv *block_rsv;
5734        struct extent_buffer *buf;
5735        u64 flags = 0;
5736        int ret;
5737
5738
5739        block_rsv = use_block_rsv(trans, root, blocksize);
5740        if (IS_ERR(block_rsv))
5741                return ERR_CAST(block_rsv);
5742
5743        ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
5744                                   empty_size, hint, (u64)-1, &ins, 0);
5745        if (ret) {
5746                unuse_block_rsv(block_rsv, blocksize);
5747                return ERR_PTR(ret);
5748        }
5749
5750        buf = btrfs_init_new_buffer(trans, root, ins.objectid,
5751                                    blocksize, level);
5752        BUG_ON(IS_ERR(buf));
5753
5754        if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5755                if (parent == 0)
5756                        parent = ins.objectid;
5757                flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5758        } else
5759                BUG_ON(parent > 0);
5760
5761        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5762                struct btrfs_delayed_extent_op *extent_op;
5763                extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
5764                BUG_ON(!extent_op);
5765                if (key)
5766                        memcpy(&extent_op->key, key, sizeof(extent_op->key));
5767                else
5768                        memset(&extent_op->key, 0, sizeof(extent_op->key));
5769                extent_op->flags_to_set = flags;
5770                extent_op->update_key = 1;
5771                extent_op->update_flags = 1;
5772                extent_op->is_data = 0;
5773
5774                ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
5775                                        ins.offset, parent, root_objectid,
5776                                        level, BTRFS_ADD_DELAYED_EXTENT,
5777                                        extent_op);
5778                BUG_ON(ret);
5779        }
5780        return buf;
5781}
5782
5783struct walk_control {
5784        u64 refs[BTRFS_MAX_LEVEL];
5785        u64 flags[BTRFS_MAX_LEVEL];
5786        struct btrfs_key update_progress;
5787        int stage;
5788        int level;
5789        int shared_level;
5790        int update_ref;
5791        int keep_locks;
5792        int reada_slot;
5793        int reada_count;
5794};
5795
5796#define DROP_REFERENCE  1
5797#define UPDATE_BACKREF  2
5798
5799static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5800                                     struct btrfs_root *root,
5801                                     struct walk_control *wc,
5802                                     struct btrfs_path *path)
5803{
5804        u64 bytenr;
5805        u64 generation;
5806        u64 refs;
5807        u64 flags;
5808        u32 nritems;
5809        u32 blocksize;
5810        struct btrfs_key key;
5811        struct extent_buffer *eb;
5812        int ret;
5813        int slot;
5814        int nread = 0;
5815
5816        if (path->slots[wc->level] < wc->reada_slot) {
5817                wc->reada_count = wc->reada_count * 2 / 3;
5818                wc->reada_count = max(wc->reada_count, 2);
5819        } else {
5820                wc->reada_count = wc->reada_count * 3 / 2;
5821                wc->reada_count = min_t(int, wc->reada_count,
5822                                        BTRFS_NODEPTRS_PER_BLOCK(root));
5823        }
5824
5825        eb = path->nodes[wc->level];
5826        nritems = btrfs_header_nritems(eb);
5827        blocksize = btrfs_level_size(root, wc->level - 1);
5828
5829        for (slot = path->slots[wc->level]; slot < nritems; slot++) {
5830                if (nread >= wc->reada_count)
5831                        break;
5832
5833                cond_resched();
5834                bytenr = btrfs_node_blockptr(eb, slot);
5835                generation = btrfs_node_ptr_generation(eb, slot);
5836
5837                if (slot == path->slots[wc->level])
5838                        goto reada;
5839
5840                if (wc->stage == UPDATE_BACKREF &&
5841                    generation <= root->root_key.offset)
5842                        continue;
5843
5844                /* We don't lock the tree block, it's OK to be racy here */
5845                ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
5846                                               &refs, &flags);
5847                BUG_ON(ret);
5848                BUG_ON(refs == 0);
5849
5850                if (wc->stage == DROP_REFERENCE) {
5851                        if (refs == 1)
5852                                goto reada;
5853
5854                        if (wc->level == 1 &&
5855                            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5856                                continue;
5857                        if (!wc->update_ref ||
5858                            generation <= root->root_key.offset)
5859                                continue;
5860                        btrfs_node_key_to_cpu(eb, &key, slot);
5861                        ret = btrfs_comp_cpu_keys(&key,
5862                                                  &wc->update_progress);
5863                        if (ret < 0)
5864                                continue;
5865                } else {
5866                        if (wc->level == 1 &&
5867                            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
5868                                continue;
5869                }
5870reada:
5871                ret = readahead_tree_block(root, bytenr, blocksize,
5872                                           generation);
5873                if (ret)
5874                        break;
5875                nread++;
5876        }
5877        wc->reada_slot = slot;
5878}
5879
5880/*
5881 * hepler to process tree block while walking down the tree.
5882 *
5883 * when wc->stage == UPDATE_BACKREF, this function updates
5884 * back refs for pointers in the block.
5885 *
5886 * NOTE: return value 1 means we should stop walking down.
5887 */
5888static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5889                                   struct btrfs_root *root,
5890                                   struct btrfs_path *path,
5891                                   struct walk_control *wc, int lookup_info)
5892{
5893        int level = wc->level;
5894        struct extent_buffer *eb = path->nodes[level];
5895        u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5896        int ret;
5897
5898        if (wc->stage == UPDATE_BACKREF &&
5899            btrfs_header_owner(eb) != root->root_key.objectid)
5900                return 1;
5901
5902        /*
5903         * when reference count of tree block is 1, it won't increase
5904         * again. once full backref flag is set, we never clear it.
5905         */
5906        if (lookup_info &&
5907            ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
5908             (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
5909                BUG_ON(!path->locks[level]);
5910                ret = btrfs_lookup_extent_info(trans, root,
5911                                               eb->start, eb->len,
5912                                               &wc->refs[level],
5913                                               &wc->flags[level]);
5914                BUG_ON(ret);
5915                BUG_ON(wc->refs[level] == 0);
5916        }
5917
5918        if (wc->stage == DROP_REFERENCE) {
5919                if (wc->refs[level] > 1)
5920                        return 1;
5921
5922                if (path->locks[level] && !wc->keep_locks) {
5923                        btrfs_tree_unlock(eb);
5924                        path->locks[level] = 0;
5925                }
5926                return 0;
5927        }
5928
5929        /* wc->stage == UPDATE_BACKREF */
5930        if (!(wc->flags[level] & flag)) {
5931                BUG_ON(!path->locks[level]);
5932                ret = btrfs_inc_ref(trans, root, eb, 1);
5933                BUG_ON(ret);
5934                ret = btrfs_dec_ref(trans, root, eb, 0);
5935                BUG_ON(ret);
5936                ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
5937                                                  eb->len, flag, 0);
5938                BUG_ON(ret);
5939                wc->flags[level] |= flag;
5940        }
5941
5942        /*
5943         * the block is shared by multiple trees, so it's not good to
5944         * keep the tree lock
5945         */
5946        if (path->locks[level] && level > 0) {
5947                btrfs_tree_unlock(eb);
5948                path->locks[level] = 0;
5949        }
5950        return 0;
5951}
5952
5953/*
5954 * hepler to process tree block pointer.
5955 *
5956 * when wc->stage == DROP_REFERENCE, this function checks
5957 * reference count of the block pointed to. if the block
5958 * is shared and we need update back refs for the subtree
5959 * rooted at the block, this function changes wc->stage to
5960 * UPDATE_BACKREF. if the block is shared and there is no
5961 * need to update back, this function drops the reference
5962 * to the block.
5963 *
5964 * NOTE: return value 1 means we should stop walking down.
5965 */
5966static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5967                                 struct btrfs_root *root,
5968                                 struct btrfs_path *path,
5969                                 struct walk_control *wc, int *lookup_info)
5970{
5971        u64 bytenr;
5972        u64 generation;
5973        u64 parent;
5974        u32 blocksize;
5975        struct btrfs_key key;
5976        struct extent_buffer *next;
5977        int level = wc->level;
5978        int reada = 0;
5979        int ret = 0;
5980
5981        generation = btrfs_node_ptr_generation(path->nodes[level],
5982                                               path->slots[level]);
5983        /*
5984         * if the lower level block was created before the snapshot
5985         * was created, we know there is no need to update back refs
5986         * for the subtree
5987         */
5988        if (wc->stage == UPDATE_BACKREF &&
5989            generation <= root->root_key.offset) {
5990                *lookup_info = 1;
5991                return 1;
5992        }
5993
5994        bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
5995        blocksize = btrfs_level_size(root, level - 1);
5996
5997        next = btrfs_find_tree_block(root, bytenr, blocksize);
5998        if (!next) {
5999                next = btrfs_find_create_tree_block(root, bytenr, blocksize);
6000                if (!next)
6001                        return -ENOMEM;
6002                reada = 1;
6003        }
6004        btrfs_tree_lock(next);
6005        btrfs_set_lock_blocking(next);
6006
6007        ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6008                                       &wc->refs[level - 1],
6009                                       &wc->flags[level - 1]);
6010        BUG_ON(ret);
6011        BUG_ON(wc->refs[level - 1] == 0);
6012        *lookup_info = 0;
6013
6014        if (wc->stage == DROP_REFERENCE) {
6015                if (wc->refs[level - 1] > 1) {
6016                        if (level == 1 &&
6017                            (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6018                                goto skip;
6019
6020                        if (!wc->update_ref ||
6021                            generation <= root->root_key.offset)
6022                                goto skip;
6023
6024                        btrfs_node_key_to_cpu(path->nodes[level], &key,
6025                                              path->slots[level]);
6026                        ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
6027                        if (ret < 0)
6028                                goto skip;
6029
6030                        wc->stage = UPDATE_BACKREF;
6031                        wc->shared_level = level - 1;
6032                }
6033        } else {
6034                if (level == 1 &&
6035                    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6036                        goto skip;
6037        }
6038
6039        if (!btrfs_buffer_uptodate(next, generation)) {
6040                btrfs_tree_unlock(next);
6041                free_extent_buffer(next);
6042                next = NULL;
6043                *lookup_info = 1;
6044        }
6045
6046        if (!next) {
6047                if (reada && level == 1)
6048                        reada_walk_down(trans, root, wc, path);
6049                next = read_tree_block(root, bytenr, blocksize, generation);
6050                btrfs_tree_lock(next);
6051                btrfs_set_lock_blocking(next);
6052        }
6053
6054        level--;
6055        BUG_ON(level != btrfs_header_level(next));
6056        path->nodes[level] = next;
6057        path->slots[level] = 0;
6058        path->locks[level] = 1;
6059        wc->level = level;
6060        if (wc->level == 1)
6061                wc->reada_slot = 0;
6062        return 0;
6063skip:
6064        wc->refs[level - 1] = 0;
6065        wc->flags[level - 1] = 0;
6066        if (wc->stage == DROP_REFERENCE) {
6067                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
6068                        parent = path->nodes[level]->start;
6069                } else {
6070                        BUG_ON(root->root_key.objectid !=
6071                               btrfs_header_owner(path->nodes[level]));
6072                        parent = 0;
6073                }
6074
6075                ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
6076                                        root->root_key.objectid, level - 1, 0);
6077                BUG_ON(ret);
6078        }
6079        btrfs_tree_unlock(next);
6080        free_extent_buffer(next);
6081        *lookup_info = 1;
6082        return 1;
6083}
6084
6085/*
6086 * hepler to process tree block while walking up the tree.
6087 *
6088 * when wc->stage == DROP_REFERENCE, this function drops
6089 * reference count on the block.
6090 *
6091 * when wc->stage == UPDATE_BACKREF, this function changes
6092 * wc->stage back to DROP_REFERENCE if we changed wc->stage
6093 * to UPDATE_BACKREF previously while processing the block.
6094 *
6095 * NOTE: return value 1 means we should stop walking up.
6096 */
6097static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6098                                 struct btrfs_root *root,
6099                                 struct btrfs_path *path,
6100                                 struct walk_control *wc)
6101{
6102        int ret;
6103        int level = wc->level;
6104        struct extent_buffer *eb = path->nodes[level];
6105        u64 parent = 0;
6106
6107        if (wc->stage == UPDATE_BACKREF) {
6108                BUG_ON(wc->shared_level < level);
6109                if (level < wc->shared_level)
6110                        goto out;
6111
6112                ret = find_next_key(path, level + 1, &wc->update_progress);
6113                if (ret > 0)
6114                        wc->update_ref = 0;
6115
6116                wc->stage = DROP_REFERENCE;
6117                wc->shared_level = -1;
6118                path->slots[level] = 0;
6119
6120                /*
6121                 * check reference count again if the block isn't locked.
6122                 * we should start walking down the tree again if reference
6123                 * count is one.
6124                 */
6125                if (!path->locks[level]) {
6126                        BUG_ON(level == 0);
6127                        btrfs_tree_lock(eb);
6128                        btrfs_set_lock_blocking(eb);
6129                        path->locks[level] = 1;
6130
6131                        ret = btrfs_lookup_extent_info(trans, root,
6132                                                       eb->start, eb->len,
6133                                                       &wc->refs[level],
6134                                                       &wc->flags[level]);
6135                        BUG_ON(ret);
6136                        BUG_ON(wc->refs[level] == 0);
6137                        if (wc->refs[level] == 1) {
6138                                btrfs_tree_unlock(eb);
6139                                path->locks[level] = 0;
6140                                return 1;
6141                        }
6142                }
6143        }
6144
6145        /* wc->stage == DROP_REFERENCE */
6146        BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
6147
6148        if (wc->refs[level] == 1) {
6149                if (level == 0) {
6150                        if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6151                                ret = btrfs_dec_ref(trans, root, eb, 1);
6152                        else
6153                                ret = btrfs_dec_ref(trans, root, eb, 0);
6154                        BUG_ON(ret);
6155                }
6156                /* make block locked assertion in clean_tree_block happy */
6157                if (!path->locks[level] &&
6158                    btrfs_header_generation(eb) == trans->transid) {
6159                        btrfs_tree_lock(eb);
6160                        btrfs_set_lock_blocking(eb);
6161                        path->locks[level] = 1;
6162                }
6163                clean_tree_block(trans, root, eb);
6164        }
6165
6166        if (eb == root->node) {
6167                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6168                        parent = eb->start;
6169                else
6170                        BUG_ON(root->root_key.objectid !=
6171                               btrfs_header_owner(eb));
6172        } else {
6173                if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6174                        parent = path->nodes[level + 1]->start;
6175                else
6176                        BUG_ON(root->root_key.objectid !=
6177                               btrfs_header_owner(path->nodes[level + 1]));
6178        }
6179
6180        btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
6181out:
6182        wc->refs[level] = 0;
6183        wc->flags[level] = 0;
6184        return 0;
6185}
6186
6187static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
6188                                   struct btrfs_root *root,
6189                                   struct btrfs_path *path,
6190                                   struct walk_control *wc)
6191{
6192        int level = wc->level;
6193        int lookup_info = 1;
6194        int ret;
6195
6196        while (level >= 0) {
6197                ret = walk_down_proc(trans, root, path, wc, lookup_info);
6198                if (ret > 0)
6199                        break;
6200
6201                if (level == 0)
6202                        break;
6203
6204                if (path->slots[level] >=
6205                    btrfs_header_nritems(path->nodes[level]))
6206                        break;
6207
6208                ret = do_walk_down(trans, root, path, wc, &lookup_info);
6209                if (ret > 0) {
6210                        path->slots[level]++;
6211                        continue;
6212                } else if (ret < 0)
6213                        return ret;
6214                level = wc->level;
6215        }
6216        return 0;
6217}
6218
6219static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6220                                 struct btrfs_root *root,
6221                                 struct btrfs_path *path,
6222                                 struct walk_control *wc, int max_level)
6223{
6224        int level = wc->level;
6225        int ret;
6226
6227        path->slots[level] = btrfs_header_nritems(path->nodes[level]);
6228        while (level < max_level && path->nodes[level]) {
6229                wc->level = level;
6230                if (path->slots[level] + 1 <
6231                    btrfs_header_nritems(path->nodes[level])) {
6232                        path->slots[level]++;
6233                        return 0;
6234                } else {
6235                        ret = walk_up_proc(trans, root, path, wc);
6236                        if (ret > 0)
6237                                return 0;
6238
6239                        if (path->locks[level]) {
6240                                btrfs_tree_unlock(path->nodes[level]);
6241                                path->locks[level] = 0;
6242                        }
6243                        free_extent_buffer(path->nodes[level]);
6244                        path->nodes[level] = NULL;
6245                        level++;
6246                }
6247        }
6248        return 1;
6249}
6250
6251/*
6252 * drop a subvolume tree.
6253 *
6254 * this function traverses the tree freeing any blocks that only
6255 * referenced by the tree.
6256 *
6257 * when a shared tree block is found. this function decreases its
6258 * reference count by one. if update_ref is true, this function
6259 * also make sure backrefs for the shared block and all lower level
6260 * blocks are properly updated.
6261 */
6262int btrfs_drop_snapshot(struct btrfs_root *root,
6263                        struct btrfs_block_rsv *block_rsv, int update_ref)
6264{
6265        struct btrfs_path *path;
6266        struct btrfs_trans_handle *trans;
6267        struct btrfs_root *tree_root = root->fs_info->tree_root;
6268        struct btrfs_root_item *root_item = &root->root_item;
6269        struct walk_control *wc;
6270        struct btrfs_key key;
6271        int err = 0;
6272        int ret;
6273        int level;
6274
6275        path = btrfs_alloc_path();
6276        BUG_ON(!path);
6277
6278        wc = kzalloc(sizeof(*wc), GFP_NOFS);
6279        BUG_ON(!wc);
6280
6281        trans = btrfs_start_transaction(tree_root, 0);
6282        BUG_ON(IS_ERR(trans));
6283
6284        if (block_rsv)
6285                trans->block_rsv = block_rsv;
6286
6287        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
6288                level = btrfs_header_level(root->node);
6289                path->nodes[level] = btrfs_lock_root_node(root);
6290                btrfs_set_lock_blocking(path->nodes[level]);
6291                path->slots[level] = 0;
6292                path->locks[level] = 1;
6293                memset(&wc->update_progress, 0,
6294                       sizeof(wc->update_progress));
6295        } else {
6296                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
6297                memcpy(&wc->update_progress, &key,
6298                       sizeof(wc->update_progress));
6299
6300                level = root_item->drop_level;
6301                BUG_ON(level == 0);
6302                path->lowest_level = level;
6303                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6304                path->lowest_level = 0;
6305                if (ret < 0) {
6306                        err = ret;
6307                        goto out;
6308                }
6309                WARN_ON(ret > 0);
6310
6311                /*
6312                 * unlock our path, this is safe because only this
6313                 * function is allowed to delete this snapshot
6314                 */
6315                btrfs_unlock_up_safe(path, 0);
6316
6317                level = btrfs_header_level(root->node);
6318                while (1) {
6319                        btrfs_tree_lock(path->nodes[level]);
6320                        btrfs_set_lock_blocking(path->nodes[level]);
6321
6322                        ret = btrfs_lookup_extent_info(trans, root,
6323                                                path->nodes[level]->start,
6324                                                path->nodes[level]->len,
6325                                                &wc->refs[level],
6326                                                &wc->flags[level]);
6327                        BUG_ON(ret);
6328                        BUG_ON(wc->refs[level] == 0);
6329
6330                        if (level == root_item->drop_level)
6331                                break;
6332
6333                        btrfs_tree_unlock(path->nodes[level]);
6334                        WARN_ON(wc->refs[level] != 1);
6335                        level--;
6336                }
6337        }
6338
6339        wc->level = level;
6340        wc->shared_level = -1;
6341        wc->stage = DROP_REFERENCE;
6342        wc->update_ref = update_ref;
6343        wc->keep_locks = 0;
6344        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6345
6346        while (1) {
6347                ret = walk_down_tree(trans, root, path, wc);
6348                if (ret < 0) {
6349                        err = ret;
6350                        break;
6351                }
6352
6353                ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
6354                if (ret < 0) {
6355                        err = ret;
6356                        break;
6357                }
6358
6359                if (ret > 0) {
6360                        BUG_ON(wc->stage != DROP_REFERENCE);
6361                        break;
6362                }
6363
6364                if (wc->stage == DROP_REFERENCE) {
6365                        level = wc->level;
6366                        btrfs_node_key(path->nodes[level],
6367                                       &root_item->drop_progress,
6368                                       path->slots[level]);
6369                        root_item->drop_level = level;
6370                }
6371
6372                BUG_ON(wc->level == 0);
6373                if (btrfs_should_end_transaction(trans, tree_root)) {
6374                        ret = btrfs_update_root(trans, tree_root,
6375                                                &root->root_key,
6376                                                root_item);
6377                        BUG_ON(ret);
6378
6379                        btrfs_end_transaction_throttle(trans, tree_root);
6380                        trans = btrfs_start_transaction(tree_root, 0);
6381                        BUG_ON(IS_ERR(trans));
6382                        if (block_rsv)
6383                                trans->block_rsv = block_rsv;
6384                }
6385        }
6386        btrfs_release_path(root, path);
6387        BUG_ON(err);
6388
6389        ret = btrfs_del_root(trans, tree_root, &root->root_key);
6390        BUG_ON(ret);
6391
6392        if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
6393                ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
6394                                           NULL, NULL);
6395                BUG_ON(ret < 0);
6396                if (ret > 0) {
6397                        /* if we fail to delete the orphan item this time
6398                         * around, it'll get picked up the next time.
6399                         *
6400                         * The most common failure here is just -ENOENT.
6401                         */
6402                        btrfs_del_orphan_item(trans, tree_root,
6403                                              root->root_key.objectid);
6404                }
6405        }
6406
6407        if (root->in_radix) {
6408                btrfs_free_fs_root(tree_root->fs_info, root);
6409        } else {
6410                free_extent_buffer(root->node);
6411                free_extent_buffer(root->commit_root);
6412                kfree(root);
6413        }
6414out:
6415        btrfs_end_transaction_throttle(trans, tree_root);
6416        kfree(wc);
6417        btrfs_free_path(path);
6418        return err;
6419}
6420
6421/*
6422 * drop subtree rooted at tree block 'node'.
6423 *
6424 * NOTE: this function will unlock and release tree block 'node'
6425 */
6426int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6427                        struct btrfs_root *root,
6428                        struct extent_buffer *node,
6429                        struct extent_buffer *parent)
6430{
6431        struct btrfs_path *path;
6432        struct walk_control *wc;
6433        int level;
6434        int parent_level;
6435        int ret = 0;
6436        int wret;
6437
6438        BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6439
6440        path = btrfs_alloc_path();
6441        BUG_ON(!path);
6442
6443        wc = kzalloc(sizeof(*wc), GFP_NOFS);
6444        BUG_ON(!wc);
6445
6446        btrfs_assert_tree_locked(parent);
6447        parent_level = btrfs_header_level(parent);
6448        extent_buffer_get(parent);
6449        path->nodes[parent_level] = parent;
6450        path->slots[parent_level] = btrfs_header_nritems(parent);
6451
6452        btrfs_assert_tree_locked(node);
6453        level = btrfs_header_level(node);
6454        path->nodes[level] = node;
6455        path->slots[level] = 0;
6456        path->locks[level] = 1;
6457
6458        wc->refs[parent_level] = 1;
6459        wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6460        wc->level = level;
6461        wc->shared_level = -1;
6462        wc->stage = DROP_REFERENCE;
6463        wc->update_ref = 0;
6464        wc->keep_locks = 1;
6465        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6466
6467        while (1) {
6468                wret = walk_down_tree(trans, root, path, wc);
6469                if (wret < 0) {
6470                        ret = wret;
6471                        break;
6472                }
6473
6474                wret = walk_up_tree(trans, root, path, wc, parent_level);
6475                if (wret < 0)
6476                        ret = wret;
6477                if (wret != 0)
6478                        break;
6479        }
6480
6481        kfree(wc);
6482        btrfs_free_path(path);
6483        return ret;
6484}
6485
6486#if 0
6487static unsigned long calc_ra(unsigned long start, unsigned long last,
6488                             unsigned long nr)
6489{
6490        return min(last, start + nr - 1);
6491}
6492
6493static noinline int relocate_inode_pages(struct inode *inode, u64 start,
6494                                         u64 len)
6495{
6496        u64 page_start;
6497        u64 page_end;
6498        unsigned long first_index;
6499        unsigned long last_index;
6500        unsigned long i;
6501        struct page *page;
6502        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6503        struct file_ra_state *ra;
6504        struct btrfs_ordered_extent *ordered;
6505        unsigned int total_read = 0;
6506        unsigned int total_dirty = 0;
6507        int ret = 0;
6508
6509        ra = kzalloc(sizeof(*ra), GFP_NOFS);
6510        if (!ra)
6511                return -ENOMEM;
6512
6513        mutex_lock(&inode->i_mutex);
6514        first_index = start >> PAGE_CACHE_SHIFT;
6515        last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
6516
6517        /* make sure the dirty trick played by the caller work */
6518        ret = invalidate_inode_pages2_range(inode->i_mapping,
6519                                            first_index, last_index);
6520        if (ret)
6521                goto out_unlock;
6522
6523        file_ra_state_init(ra, inode->i_mapping);
6524
6525        for (i = first_index ; i <= last_index; i++) {
6526                if (total_read % ra->ra_pages == 0) {
6527                        btrfs_force_ra(inode->i_mapping, ra, NULL, i,
6528                                       calc_ra(i, last_index, ra->ra_pages));
6529                }
6530                total_read++;
6531again:
6532                if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
6533                        BUG_ON(1);
6534                page = grab_cache_page(inode->i_mapping, i);
6535                if (!page) {
6536                        ret = -ENOMEM;
6537                        goto out_unlock;
6538                }
6539                if (!PageUptodate(page)) {
6540                        btrfs_readpage(NULL, page);
6541                        lock_page(page);
6542                        if (!PageUptodate(page)) {
6543                                unlock_page(page);
6544                                page_cache_release(page);
6545                                ret = -EIO;
6546                                goto out_unlock;
6547                        }
6548                }
6549                wait_on_page_writeback(page);
6550
6551                page_start = (u64)page->index << PAGE_CACHE_SHIFT;
6552                page_end = page_start + PAGE_CACHE_SIZE - 1;
6553                lock_extent(io_tree, page_start, page_end, GFP_NOFS);
6554
6555                ordered = btrfs_lookup_ordered_extent(inode, page_start);
6556                if (ordered) {
6557                        unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
6558                        unlock_page(page);
6559                        page_cache_release(page);
6560                        btrfs_start_ordered_extent(inode, ordered, 1);
6561                        btrfs_put_ordered_extent(ordered);
6562                        goto again;
6563                }
6564                set_page_extent_mapped(page);
6565
6566                if (i == first_index)
6567                        set_extent_bits(io_tree, page_start, page_end,
6568                                        EXTENT_BOUNDARY, GFP_NOFS);
6569                btrfs_set_extent_delalloc(inode, page_start, page_end);
6570
6571                set_page_dirty(page);
6572                total_dirty++;
6573
6574                unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
6575                unlock_page(page);
6576                page_cache_release(page);
6577        }
6578
6579out_unlock:
6580        kfree(ra);
6581        mutex_unlock(&inode->i_mutex);
6582        balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
6583        return ret;
6584}
6585
6586static noinline int relocate_data_extent(struct inode *reloc_inode,
6587                                         struct btrfs_key *extent_key,
6588                                         u64 offset)
6589{
6590        struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
6591        struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
6592        struct extent_map *em;
6593        u64 start = extent_key->objectid - offset;
6594        u64 end = start + extent_key->offset - 1;
6595
6596        em = alloc_extent_map(GFP_NOFS);
6597        BUG_ON(!em);
6598
6599        em->start = start;
6600        em->len = extent_key->offset;
6601        em->block_len = extent_key->offset;
6602        em->block_start = extent_key->objectid;
6603        em->bdev = root->fs_info->fs_devices->latest_bdev;
6604        set_bit(EXTENT_FLAG_PINNED, &em->flags);
6605
6606        /* setup extent map to cheat btrfs_readpage */
6607        lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
6608        while (1) {
6609                int ret;
6610                write_lock(&em_tree->lock);
6611                ret = add_extent_mapping(em_tree, em);
6612                write_unlock(&em_tree->lock);
6613                if (ret != -EEXIST) {
6614                        free_extent_map(em);
6615                        break;
6616                }
6617                btrfs_drop_extent_cache(reloc_inode, start, end, 0);
6618        }
6619        unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
6620
6621        return relocate_inode_pages(reloc_inode, start, extent_key->offset);
6622}
6623
6624struct btrfs_ref_path {
6625        u64 extent_start;
6626        u64 nodes[BTRFS_MAX_LEVEL];
6627        u64 root_objectid;
6628        u64 root_generation;
6629        u64 owner_objectid;
6630        u32 num_refs;
6631        int lowest_level;
6632        int current_level;
6633        int shared_level;
6634
6635        struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
6636        u64 new_nodes[BTRFS_MAX_LEVEL];
6637};
6638
6639struct disk_extent {
6640        u64 ram_bytes;
6641        u64 disk_bytenr;
6642        u64 disk_num_bytes;
6643        u64 offset;
6644        u64 num_bytes;
6645        u8 compression;
6646        u8 encryption;
6647        u16 other_encoding;
6648};
6649
6650static int is_cowonly_root(u64 root_objectid)
6651{
6652        if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
6653            root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
6654            root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
6655            root_objectid == BTRFS_DEV_TREE_OBJECTID ||
6656            root_objectid == BTRFS_TREE_LOG_OBJECTID ||
6657            root_objectid == BTRFS_CSUM_TREE_OBJECTID)
6658                return 1;
6659        return 0;
6660}
6661
6662static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
6663                                    struct btrfs_root *extent_root,
6664                                    struct btrfs_ref_path *ref_path,
6665                                    int first_time)
6666{
6667        struct extent_buffer *leaf;
6668        struct btrfs_path *path;
6669        struct btrfs_extent_ref *ref;
6670        struct btrfs_key key;
6671        struct btrfs_key found_key;
6672        u64 bytenr;
6673        u32 nritems;
6674        int level;
6675        int ret = 1;
6676
6677        path = btrfs_alloc_path();
6678        if (!path)
6679                return -ENOMEM;
6680
6681        if (first_time) {
6682                ref_path->lowest_level = -1;
6683                ref_path->current_level = -1;
6684                ref_path->shared_level = -1;
6685                goto walk_up;
6686        }
6687walk_down:
6688        level = ref_path->current_level - 1;
6689        while (level >= -1) {
6690                u64 parent;
6691                if (level < ref_path->lowest_level)
6692                        break;
6693
6694                if (level >= 0)
6695                        bytenr = ref_path->nodes[level];
6696                else
6697                        bytenr = ref_path->extent_start;
6698                BUG_ON(bytenr == 0);
6699
6700                parent = ref_path->nodes[level + 1];
6701                ref_path->nodes[level + 1] = 0;
6702                ref_path->current_level = level;
6703                BUG_ON(parent == 0);
6704
6705                key.objectid = bytenr;
6706                key.offset = parent + 1;
6707                key.type = BTRFS_EXTENT_REF_KEY;
6708
6709                ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
6710                if (ret < 0)
6711                        goto out;
6712                BUG_ON(ret == 0);
6713
6714                leaf = path->nodes[0];
6715                nritems = btrfs_header_nritems(leaf);
6716                if (path->slots[0] >= nritems) {
6717                        ret = btrfs_next_leaf(extent_root, path);
6718                        if (ret < 0)
6719                                goto out;
6720                        if (ret > 0)
6721                                goto next;
6722                        leaf = path->nodes[0];
6723                }
6724
6725                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6726                if (found_key.objectid == bytenr &&
6727                    found_key.type == BTRFS_EXTENT_REF_KEY) {
6728                        if (level < ref_path->shared_level)
6729                                ref_path->shared_level = level;
6730                        goto found;
6731                }
6732next:
6733                level--;
6734                btrfs_release_path(extent_root, path);
6735                cond_resched();
6736        }
6737        /* reached lowest level */
6738        ret = 1;
6739        goto out;
6740walk_up:
6741        level = ref_path->current_level;
6742        while (level < BTRFS_MAX_LEVEL - 1) {
6743                u64 ref_objectid;
6744
6745                if (level >= 0)
6746                        bytenr = ref_path->nodes[level];
6747                else
6748                        bytenr = ref_path->extent_start;
6749
6750                BUG_ON(bytenr == 0);
6751
6752                key.objectid = bytenr;
6753                key.offset = 0;
6754                key.type = BTRFS_EXTENT_REF_KEY;
6755
6756                ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
6757                if (ret < 0)
6758                        goto out;
6759
6760                leaf = path->nodes[0];
6761                nritems = btrfs_header_nritems(leaf);
6762                if (path->slots[0] >= nritems) {
6763                        ret = btrfs_next_leaf(extent_root, path);
6764                        if (ret < 0)
6765                                goto out;
6766                        if (ret > 0) {
6767                                /* the extent was freed by someone */
6768                                if (ref_path->lowest_level == level)
6769                                        goto out;
6770                                btrfs_release_path(extent_root, path);
6771                                goto walk_down;
6772                        }
6773                        leaf = path->nodes[0];
6774                }
6775
6776                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6777                if (found_key.objectid != bytenr ||
6778                                found_key.type != BTRFS_EXTENT_REF_KEY) {
6779                        /* the extent was freed by someone */
6780                        if (ref_path->lowest_level == level) {
6781                                ret = 1;
6782                                goto out;
6783                        }
6784                        btrfs_release_path(extent_root, path);
6785                        goto walk_down;
6786                }
6787found:
6788                ref = btrfs_item_ptr(leaf, path->slots[0],
6789                                struct btrfs_extent_ref);
6790                ref_objectid = btrfs_ref_objectid(leaf, ref);
6791                if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
6792                        if (first_time) {
6793                                level = (int)ref_objectid;
6794                                BUG_ON(level >= BTRFS_MAX_LEVEL);
6795                                ref_path->lowest_level = level;
6796                                ref_path->current_level = level;
6797                                ref_path->nodes[level] = bytenr;
6798                        } else {
6799                                WARN_ON(ref_objectid != level);
6800                        }
6801                } else {
6802                        WARN_ON(level != -1);
6803                }
6804                first_time = 0;
6805
6806                if (ref_path->lowest_level == level) {
6807                        ref_path->owner_objectid = ref_objectid;
6808                        ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
6809                }
6810
6811                /*
6812                 * the block is tree root or the block isn't in reference
6813                 * counted tree.
6814                 */
6815                if (found_key.objectid == found_key.offset ||
6816                    is_cowonly_root(btrfs_ref_root(leaf, ref))) {
6817                        ref_path->root_objectid = btrfs_ref_root(leaf, ref);
6818                        ref_path->root_generation =
6819                                btrfs_ref_generation(leaf, ref);
6820                        if (level < 0) {
6821                                /* special reference from the tree log */
6822                                ref_path->nodes[0] = found_key.offset;
6823                                ref_path->current_level = 0;
6824                        }
6825                        ret = 0;
6826                        goto out;
6827                }
6828
6829                level++;
6830                BUG_ON(ref_path->nodes[level] != 0);
6831                ref_path->nodes[level] = found_key.offset;
6832                ref_path->current_level = level;
6833
6834                /*
6835                 * the reference was created in the running transaction,
6836                 * no need to continue walking up.
6837                 */
6838                if (btrfs_ref_generation(leaf, ref) == trans->transid) {
6839                        ref_path->root_objectid = btrfs_ref_root(leaf, ref);
6840                        ref_path->root_generation =
6841                                btrfs_ref_generation(leaf, ref);
6842                        ret = 0;
6843                        goto out;
6844                }
6845
6846                btrfs_release_path(extent_root, path);
6847                cond_resched();
6848        }
6849        /* reached max tree level, but no tree root found. */
6850        BUG();
6851out:
6852        btrfs_free_path(path);
6853        return ret;
6854}
6855
6856static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
6857                                struct btrfs_root *extent_root,
6858                                struct btrfs_ref_path *ref_path,
6859                                u64 extent_start)
6860{
6861        memset(ref_path, 0, sizeof(*ref_path));
6862        ref_path->extent_start = extent_start;
6863
6864        return __next_ref_path(trans, extent_root, ref_path, 1);
6865}
6866
6867static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
6868                               struct btrfs_root *extent_root,
6869                               struct btrfs_ref_path *ref_path)
6870{
6871        return __next_ref_path(trans, extent_root, ref_path, 0);
6872}
6873
6874static noinline int get_new_locations(struct inode *reloc_inode,
6875                                      struct btrfs_key *extent_key,
6876                                      u64 offset, int no_fragment,
6877                                      struct disk_extent **extents,
6878                                      int *nr_extents)
6879{
6880        struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
6881        struct btrfs_path *path;
6882        struct btrfs_file_extent_item *fi;
6883        struct extent_buffer *leaf;
6884        struct disk_extent *exts = *extents;
6885        struct btrfs_key found_key;
6886        u64 cur_pos;
6887        u64 last_byte;
6888        u32 nritems;
6889        int nr = 0;
6890        int max = *nr_extents;
6891        int ret;
6892
6893        WARN_ON(!no_fragment && *extents);
6894        if (!exts) {
6895                max = 1;
6896                exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
6897                if (!exts)
6898                        return -ENOMEM;
6899        }
6900
6901        path = btrfs_alloc_path();
6902        BUG_ON(!path);
6903
6904        cur_pos = extent_key->objectid - offset;
6905        last_byte = extent_key->objectid + extent_key->offset;
6906        ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
6907                                       cur_pos, 0);
6908        if (ret < 0)
6909                goto out;
6910        if (ret > 0) {
6911                ret = -ENOENT;
6912                goto out;
6913        }
6914
6915        while (1) {
6916                leaf = path->nodes[0];
6917                nritems = btrfs_header_nritems(leaf);
6918                if (path->slots[0] >= nritems) {
6919                        ret = btrfs_next_leaf(root, path);
6920                        if (ret < 0)
6921                                goto out;
6922                        if (ret > 0)
6923                                break;
6924                        leaf = path->nodes[0];
6925                }
6926
6927                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6928                if (found_key.offset != cur_pos ||
6929                    found_key.type != BTRFS_EXTENT_DATA_KEY ||
6930                    found_key.objectid != reloc_inode->i_ino)
6931                        break;
6932
6933                fi = btrfs_item_ptr(leaf, path->slots[0],
6934                                    struct btrfs_file_extent_item);
6935                if (btrfs_file_extent_type(leaf, fi) !=
6936                    BTRFS_FILE_EXTENT_REG ||
6937                    btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
6938                        break;
6939
6940                if (nr == max) {
6941                        struct disk_extent *old = exts;
6942                        max *= 2;
6943                        exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
6944                        memcpy(exts, old, sizeof(*exts) * nr);
6945                        if (old != *extents)
6946                                kfree(old);
6947                }
6948
6949                exts[nr].disk_bytenr =
6950                        btrfs_file_extent_disk_bytenr(leaf, fi);
6951                exts[nr].disk_num_bytes =
6952                        btrfs_file_extent_disk_num_bytes(leaf, fi);
6953                exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
6954                exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6955                exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6956                exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
6957                exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
6958                exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
6959                                                                           fi);
6960                BUG_ON(exts[nr].offset > 0);
6961                BUG_ON(exts[nr].compression || exts[nr].encryption);
6962                BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
6963
6964                cur_pos += exts[nr].num_bytes;
6965                nr++;
6966
6967                if (cur_pos + offset >= last_byte)
6968                        break;
6969
6970                if (no_fragment) {
6971                        ret = 1;
6972                        goto out;
6973                }
6974                path->slots[0]++;
6975        }
6976
6977        BUG_ON(cur_pos + offset > last_byte);
6978        if (cur_pos + offset < last_byte) {
6979                ret = -ENOENT;
6980                goto out;
6981        }
6982        ret = 0;
6983out:
6984        btrfs_free_path(path);
6985        if (ret) {
6986                if (exts != *extents)
6987                        kfree(exts);
6988        } else {
6989                *extents = exts;
6990                *nr_extents = nr;
6991        }
6992        return ret;
6993}
6994
6995static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
6996                                        struct btrfs_root *root,
6997                                        struct btrfs_path *path,
6998                                        struct btrfs_key *extent_key,
6999                                        struct btrfs_key *leaf_key,
7000                                        struct btrfs_ref_path *ref_path,
7001                                        struct disk_extent *new_extents,
7002                                        int nr_extents)
7003{
7004        struct extent_buffer *leaf;
7005        struct btrfs_file_extent_item *fi;
7006        struct inode *inode = NULL;
7007        struct btrfs_key key;
7008        u64 lock_start = 0;
7009        u64 lock_end = 0;
7010        u64 num_bytes;
7011        u64 ext_offset;
7012        u64 search_end = (u64)-1;
7013        u32 nritems;
7014        int nr_scaned = 0;
7015        int extent_locked = 0;
7016        int extent_type;
7017        int ret;
7018
7019        memcpy(&key, leaf_key, sizeof(key));
7020        if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
7021                if (key.objectid < ref_path->owner_objectid ||
7022                    (key.objectid == ref_path->owner_objectid &&
7023                     key.type < BTRFS_EXTENT_DATA_KEY)) {
7024                        key.objectid = ref_path->owner_objectid;
7025                        key.type = BTRFS_EXTENT_DATA_KEY;
7026                        key.offset = 0;
7027                }
7028        }
7029
7030        while (1) {
7031                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
7032                if (ret < 0)
7033                        goto out;
7034
7035                leaf = path->nodes[0];
7036                nritems = btrfs_header_nritems(leaf);
7037next:
7038                if (extent_locked && ret > 0) {
7039                        /*
7040                         * the file extent item was modified by someone
7041                         * before the extent got locked.
7042                         */
7043                        unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
7044                                      lock_end, GFP_NOFS);
7045                        extent_locked = 0;
7046                }
7047
7048                if (path->slots[0] >= nritems) {
7049                        if (++nr_scaned > 2)
7050                                break;
7051
7052                        BUG_ON(extent_locked);
7053                        ret = btrfs_next_leaf(root, path);
7054                        if (ret < 0)
7055                                goto out;
7056                        if (ret > 0)
7057                                break;
7058                        leaf = path->nodes[0];
7059                        nritems = btrfs_header_nritems(leaf);
7060                }
7061
7062                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7063
7064                if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
7065                        if ((key.objectid > ref_path->owner_objectid) ||
7066                            (key.objectid == ref_path->owner_objectid &&
7067                             key.type > BTRFS_EXTENT_DATA_KEY) ||
7068                            key.offset >= search_end)
7069                                break;
7070                }
7071
7072                if (inode && key.objectid != inode->i_ino) {
7073                        BUG_ON(extent_locked);
7074                        btrfs_release_path(root, path);
7075                        mutex_unlock(&inode->i_mutex);
7076                        iput(inode);
7077                        inode = NULL;
7078                        continue;
7079                }
7080
7081                if (key.type != BTRFS_EXTENT_DATA_KEY) {
7082                        path->slots[0]++;
7083                        ret = 1;
7084                        goto next;
7085                }
7086                fi = btrfs_item_ptr(leaf, path->slots[0],
7087                                    struct btrfs_file_extent_item);
7088                extent_type = btrfs_file_extent_type(leaf, fi);
7089                if ((extent_type != BTRFS_FILE_EXTENT_REG &&
7090                     extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
7091                    (btrfs_file_extent_disk_bytenr(leaf, fi) !=
7092                     extent_key->objectid)) {
7093                        path->slots[0]++;
7094                        ret = 1;
7095                        goto next;
7096                }
7097
7098                num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
7099                ext_offset = btrfs_file_extent_offset(leaf, fi);
7100
7101                if (search_end == (u64)-1) {
7102                        search_end = key.offset - ext_offset +
7103                                btrfs_file_extent_ram_bytes(leaf, fi);
7104                }
7105
7106                if (!extent_locked) {
7107                        lock_start = key.offset;
7108                        lock_end = lock_start + num_bytes - 1;
7109                } else {
7110                        if (lock_start > key.offset ||
7111                            lock_end + 1 < key.offset + num_bytes) {
7112                                unlock_extent(&BTRFS_I(inode)->io_tree,
7113                                              lock_start, lock_end, GFP_NOFS);
7114                                extent_locked = 0;
7115                        }
7116                }
7117
7118                if (!inode) {
7119                        btrfs_release_path(root, path);
7120
7121                        inode = btrfs_iget_locked(root->fs_info->sb,
7122                                                  key.objectid, root);
7123                        if (inode->i_state & I_NEW) {
7124                                BTRFS_I(inode)->root = root;
7125                                BTRFS_I(inode)->location.objectid =
7126                                        key.objectid;
7127                                BTRFS_I(inode)->location.type =
7128                                        BTRFS_INODE_ITEM_KEY;
7129                                BTRFS_I(inode)->location.offset = 0;
7130                                btrfs_read_locked_inode(inode);
7131                                unlock_new_inode(inode);
7132                        }
7133                        /*
7134                         * some code call btrfs_commit_transaction while
7135                         * holding the i_mutex, so we can't use mutex_lock
7136                         * here.
7137                         */
7138                        if (is_bad_inode(inode) ||
7139                            !mutex_trylock(&inode->i_mutex)) {
7140                                iput(inode);
7141                                inode = NULL;
7142                                key.offset = (u64)-1;
7143                                goto skip;
7144                        }
7145                }
7146
7147                if (!extent_locked) {
7148                        struct btrfs_ordered_extent *ordered;
7149
7150                        btrfs_release_path(root, path);
7151
7152                        lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
7153                                    lock_end, GFP_NOFS);
7154                        ordered = btrfs_lookup_first_ordered_extent(inode,
7155                                                                    lock_end);
7156                        if (ordered &&
7157                            ordered->file_offset <= lock_end &&
7158                            ordered->file_offset + ordered->len > lock_start) {
7159                                unlock_extent(&BTRFS_I(inode)->io_tree,
7160                                              lock_start, lock_end, GFP_NOFS);
7161                                btrfs_start_ordered_extent(inode, ordered, 1);
7162                                btrfs_put_ordered_extent(ordered);
7163                                key.offset += num_bytes;
7164                                goto skip;
7165                        }
7166                        if (ordered)
7167                                btrfs_put_ordered_extent(ordered);
7168
7169                        extent_locked = 1;
7170                        continue;
7171                }
7172
7173                if (nr_extents == 1) {
7174                        /* update extent pointer in place */
7175                        btrfs_set_file_extent_disk_bytenr(leaf, fi,
7176                                                new_extents[0].disk_bytenr);
7177                        btrfs_set_file_extent_disk_num_bytes(leaf, fi,
7178                                                new_extents[0].disk_num_bytes);
7179                        btrfs_mark_buffer_dirty(leaf);
7180
7181                        btrfs_drop_extent_cache(inode, key.offset,
7182                                                key.offset + num_bytes - 1, 0);
7183
7184                        ret = btrfs_inc_extent_ref(trans, root,
7185                                                new_extents[0].disk_bytenr,
7186                                                new_extents[0].disk_num_bytes,
7187                                                leaf->start,
7188                                                root->root_key.objectid,
7189                                                trans->transid,
7190                                                key.objectid);
7191                        BUG_ON(ret);
7192
7193                        ret = btrfs_free_extent(trans, root,
7194                                                extent_key->objectid,
7195                                                extent_key->offset,
7196                                                leaf->start,
7197                                                btrfs_header_owner(leaf),
7198                                                btrfs_header_generation(leaf),
7199                                                key.objectid, 0);
7200                        BUG_ON(ret);
7201
7202                        btrfs_release_path(root, path);
7203                        key.offset += num_bytes;
7204                } else {
7205                        BUG_ON(1);
7206#if 0
7207                        u64 alloc_hint;
7208                        u64 extent_len;
7209                        int i;
7210                        /*
7211                         * drop old extent pointer at first, then insert the
7212                         * new pointers one bye one
7213                         */
7214                        btrfs_release_path(root, path);
7215                        ret = btrfs_drop_extents(trans, root, inode, key.offset,
7216                                                 key.offset + num_bytes,
7217                                                 key.offset, &alloc_hint);
7218                        BUG_ON(ret);
7219
7220                        for (i = 0; i < nr_extents; i++) {
7221                                if (ext_offset >= new_extents[i].num_bytes) {
7222                                        ext_offset -= new_extents[i].num_bytes;
7223                                        continue;
7224                                }
7225                                extent_len = min(new_extents[i].num_bytes -
7226                                                 ext_offset, num_bytes);
7227
7228                                ret = btrfs_insert_empty_item(trans, root,
7229                                                              path, &key,
7230                                                              sizeof(*fi));
7231                                BUG_ON(ret);
7232
7233                                leaf = path->nodes[0];
7234                                fi = btrfs_item_ptr(leaf, path->slots[0],
7235                                                struct btrfs_file_extent_item);
7236                                btrfs_set_file_extent_generation(leaf, fi,
7237                                                        trans->transid);
7238                                btrfs_set_file_extent_type(leaf, fi,
7239                                                        BTRFS_FILE_EXTENT_REG);
7240                                btrfs_set_file_extent_disk_bytenr(leaf, fi,
7241                                                new_extents[i].disk_bytenr);
7242                                btrfs_set_file_extent_disk_num_bytes(leaf, fi,
7243                                                new_extents[i].disk_num_bytes);
7244                                btrfs_set_file_extent_ram_bytes(leaf, fi,
7245                                                new_extents[i].ram_bytes);
7246
7247                                btrfs_set_file_extent_compression(leaf, fi,
7248                                                new_extents[i].compression);
7249                                btrfs_set_file_extent_encryption(leaf, fi,
7250                                                new_extents[i].encryption);
7251                                btrfs_set_file_extent_other_encoding(leaf, fi,
7252                                                new_extents[i].other_encoding);
7253
7254                                btrfs_set_file_extent_num_bytes(leaf, fi,
7255                                                        extent_len);
7256                                ext_offset += new_extents[i].offset;
7257                                btrfs_set_file_extent_offset(leaf, fi,
7258                                                        ext_offset);
7259                                btrfs_mark_buffer_dirty(leaf);
7260
7261                                btrfs_drop_extent_cache(inode, key.offset,
7262                                                key.offset + extent_len - 1, 0);
7263
7264                                ret = btrfs_inc_extent_ref(trans, root,
7265                                                new_extents[i].disk_bytenr,
7266                                                new_extents[i].disk_num_bytes,
7267                                                leaf->start,
7268                                                root->root_key.objectid,
7269                                                trans->transid, key.objectid);
7270                                BUG_ON(ret);
7271                                btrfs_release_path(root, path);
7272
7273                                inode_add_bytes(inode, extent_len);
7274
7275                                ext_offset = 0;
7276                                num_bytes -= extent_len;
7277                                key.offset += extent_len;
7278
7279                                if (num_bytes == 0)
7280                                        break;
7281                        }
7282                        BUG_ON(i >= nr_extents);
7283#endif
7284                }
7285
7286                if (extent_locked) {
7287                        unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
7288                                      lock_end, GFP_NOFS);
7289                        extent_locked = 0;
7290                }
7291skip:
7292                if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
7293                    key.offset >= search_end)
7294                        break;
7295
7296                cond_resched();
7297        }
7298        ret = 0;
7299out:
7300        btrfs_release_path(root, path);
7301        if (inode) {
7302                mutex_unlock(&inode->i_mutex);
7303                if (extent_locked) {
7304                        unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
7305                                      lock_end, GFP_NOFS);
7306                }
7307                iput(inode);
7308        }
7309        return ret;
7310}
7311
7312int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
7313                               struct btrfs_root *root,
7314                               struct extent_buffer *buf, u64 orig_start)
7315{
7316        int level;
7317        int ret;
7318
7319        BUG_ON(btrfs_header_generation(buf) != trans->transid);
7320        BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
7321
7322        level = btrfs_header_level(buf);
7323        if (level == 0) {
7324                struct btrfs_leaf_ref *ref;
7325                struct btrfs_leaf_ref *orig_ref;
7326
7327                orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
7328                if (!orig_ref)
7329                        return -ENOENT;
7330
7331                ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
7332                if (!ref) {
7333                        btrfs_free_leaf_ref(root, orig_ref);
7334                        return -ENOMEM;
7335                }
7336
7337                ref->nritems = orig_ref->nritems;
7338                memcpy(ref->extents, orig_ref->extents,
7339                        sizeof(ref->extents[0]) * ref->nritems);
7340
7341                btrfs_free_leaf_ref(root, orig_ref);
7342
7343                ref->root_gen = trans->transid;
7344                ref->bytenr = buf->start;
7345                ref->owner = btrfs_header_owner(buf);
7346                ref->generation = btrfs_header_generation(buf);
7347
7348                ret = btrfs_add_leaf_ref(root, ref, 0);
7349                WARN_ON(ret);
7350                btrfs_free_leaf_ref(root, ref);
7351        }
7352        return 0;
7353}
7354
7355static noinline int invalidate_extent_cache(struct btrfs_root *root,
7356                                        struct extent_buffer *leaf,
7357                                        struct btrfs_block_group_cache *group,
7358                                        struct btrfs_root *target_root)
7359{
7360        struct btrfs_key key;
7361        struct inode *inode = NULL;
7362        struct btrfs_file_extent_item *fi;
7363        struct extent_state *cached_state = NULL;
7364        u64 num_bytes;
7365        u64 skip_objectid = 0;
7366        u32 nritems;
7367        u32 i;
7368
7369        nritems = btrfs_header_nritems(leaf);
7370        for (i = 0; i < nritems; i++) {
7371                btrfs_item_key_to_cpu(leaf, &key, i);
7372                if (key.objectid == skip_objectid ||
7373                    key.type != BTRFS_EXTENT_DATA_KEY)
7374                        continue;
7375                fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
7376                if (btrfs_file_extent_type(leaf, fi) ==
7377                    BTRFS_FILE_EXTENT_INLINE)
7378                        continue;
7379                if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
7380                        continue;
7381                if (!inode || inode->i_ino != key.objectid) {
7382                        iput(inode);
7383                        inode = btrfs_ilookup(target_root->fs_info->sb,
7384                                              key.objectid, target_root, 1);
7385                }
7386                if (!inode) {
7387                        skip_objectid = key.objectid;
7388                        continue;
7389                }
7390                num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
7391
7392                lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
7393                                 key.offset + num_bytes - 1, 0, &cached_state,
7394                                 GFP_NOFS);
7395                btrfs_drop_extent_cache(inode, key.offset,
7396                                        key.offset + num_bytes - 1, 1);
7397                unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
7398                                     key.offset + num_bytes - 1, &cached_state,
7399                                     GFP_NOFS);
7400                cond_resched();
7401        }
7402        iput(inode);
7403        return 0;
7404}
7405
7406static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
7407                                        struct btrfs_root *root,
7408                                        struct extent_buffer *leaf,
7409                                        struct btrfs_block_group_cache *group,
7410                                        struct inode *reloc_inode)
7411{
7412        struct btrfs_key key;
7413        struct btrfs_key extent_key;
7414        struct btrfs_file_extent_item *fi;
7415        struct btrfs_leaf_ref *ref;
7416        struct disk_extent *new_extent;
7417        u64 bytenr;
7418        u64 num_bytes;
7419        u32 nritems;
7420        u32 i;
7421        int ext_index;
7422        int nr_extent;
7423        int ret;
7424
7425        new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
7426        BUG_ON(!new_extent);
7427
7428        ref = btrfs_lookup_leaf_ref(root, leaf->start);
7429        BUG_ON(!ref);
7430
7431        ext_index = -1;
7432        nritems = btrfs_header_nritems(leaf);
7433        for (i = 0; i < nritems; i++) {
7434                btrfs_item_key_to_cpu(leaf, &key, i);
7435                if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
7436                        continue;
7437                fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
7438                if (btrfs_file_extent_type(leaf, fi) ==
7439                    BTRFS_FILE_EXTENT_INLINE)
7440                        continue;
7441                bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
7442                num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
7443                if (bytenr == 0)
7444                        continue;
7445
7446                ext_index++;
7447                if (bytenr >= group->key.objectid + group->key.offset ||
7448                    bytenr + num_bytes <= group->key.objectid)
7449                        continue;
7450
7451                extent_key.objectid = bytenr;
7452                extent_key.offset = num_bytes;
7453                extent_key.type = BTRFS_EXTENT_ITEM_KEY;
7454                nr_extent = 1;
7455                ret = get_new_locations(reloc_inode, &extent_key,
7456                                        group->key.objectid, 1,
7457                                        &new_extent, &nr_extent);
7458                if (ret > 0)
7459                        continue;
7460                BUG_ON(ret < 0);
7461
7462                BUG_ON(ref->extents[ext_index].bytenr != bytenr);
7463                BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
7464                ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
7465                ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
7466
7467                btrfs_set_file_extent_disk_bytenr(leaf, fi,
7468                                                new_extent->disk_bytenr);
7469                btrfs_set_file_extent_disk_num_bytes(leaf, fi,
7470                                                new_extent->disk_num_bytes);
7471                btrfs_mark_buffer_dirty(leaf);
7472
7473                ret = btrfs_inc_extent_ref(trans, root,
7474                                        new_extent->disk_bytenr,
7475                                        new_extent->disk_num_bytes,
7476                                        leaf->start,
7477                                        root->root_key.objectid,
7478                                        trans->transid, key.objectid);
7479                BUG_ON(ret);
7480
7481                ret = btrfs_free_extent(trans, root,
7482                                        bytenr, num_bytes, leaf->start,
7483                                        btrfs_header_owner(leaf),
7484                                        btrfs_header_generation(leaf),
7485                                        key.objectid, 0);
7486                BUG_ON(ret);
7487                cond_resched();
7488        }
7489        kfree(new_extent);
7490        BUG_ON(ext_index + 1 != ref->nritems);
7491        btrfs_free_leaf_ref(root, ref);
7492        return 0;
7493}
7494
7495int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
7496                          struct btrfs_root *root)
7497{
7498        struct btrfs_root *reloc_root;
7499        int ret;
7500
7501        if (root->reloc_root) {
7502                reloc_root = root->reloc_root;
7503                root->reloc_root = NULL;
7504                list_add(&reloc_root->dead_list,
7505                         &root->fs_info->dead_reloc_roots);
7506
7507                btrfs_set_root_bytenr(&reloc_root->root_item,
7508                                      reloc_root->node->start);
7509                btrfs_set_root_level(&root->root_item,
7510                                     btrfs_header_level(reloc_root->node));
7511                memset(&reloc_root->root_item.drop_progress, 0,
7512                        sizeof(struct btrfs_disk_key));
7513                reloc_root->root_item.drop_level = 0;
7514
7515                ret = btrfs_update_root(trans, root->fs_info->tree_root,
7516                                        &reloc_root->root_key,
7517                                        &reloc_root->root_item);
7518                BUG_ON(ret);
7519        }
7520        return 0;
7521}
7522
7523int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
7524{
7525        struct btrfs_trans_handle *trans;
7526        struct btrfs_root *reloc_root;
7527        struct btrfs_root *prev_root = NULL;
7528        struct list_head dead_roots;
7529        int ret;
7530        unsigned long nr;
7531
7532        INIT_LIST_HEAD(&dead_roots);
7533        list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
7534
7535        while (!list_empty(&dead_roots)) {
7536                reloc_root = list_entry(dead_roots.prev,
7537                                        struct btrfs_root, dead_list);
7538                list_del_init(&reloc_root->dead_list);
7539
7540                BUG_ON(reloc_root->commit_root != NULL);
7541                while (1) {
7542                        trans = btrfs_join_transaction(root, 1);
7543                        BUG_ON(IS_ERR(trans));
7544
7545                        mutex_lock(&root->fs_info->drop_mutex);
7546                        ret = btrfs_drop_snapshot(trans, reloc_root);
7547                        if (ret != -EAGAIN)
7548                                break;
7549                        mutex_unlock(&root->fs_info->drop_mutex);
7550
7551                        nr = trans->blocks_used;
7552                        ret = btrfs_end_transaction(trans, root);
7553                        BUG_ON(ret);
7554                        btrfs_btree_balance_dirty(root, nr);
7555                }
7556
7557                free_extent_buffer(reloc_root->node);
7558
7559                ret = btrfs_del_root(trans, root->fs_info->tree_root,
7560                                     &reloc_root->root_key);
7561                BUG_ON(ret);
7562                mutex_unlock(&root->fs_info->drop_mutex);
7563
7564                nr = trans->blocks_used;
7565                ret = btrfs_end_transaction(trans, root);
7566                BUG_ON(ret);
7567                btrfs_btree_balance_dirty(root, nr);
7568
7569                kfree(prev_root);
7570                prev_root = reloc_root;
7571        }
7572        if (prev_root) {
7573                btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
7574                kfree(prev_root);
7575        }
7576        return 0;
7577}
7578
7579int btrfs_add_dead_reloc_root(struct btrfs_root *root)
7580{
7581        list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
7582        return 0;
7583}
7584
7585int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
7586{
7587        struct btrfs_root *reloc_root;
7588        struct btrfs_trans_handle *trans;
7589        struct btrfs_key location;
7590        int found;
7591        int ret;
7592
7593        mutex_lock(&root->fs_info->tree_reloc_mutex);
7594        ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
7595        BUG_ON(ret);
7596        found = !list_empty(&root->fs_info->dead_reloc_roots);
7597        mutex_unlock(&root->fs_info->tree_reloc_mutex);
7598
7599        if (found) {
7600                trans = btrfs_start_transaction(root, 1);
7601                BUG_ON(IS_ERR(trans));
7602                ret = btrfs_commit_transaction(trans, root);
7603                BUG_ON(ret);
7604        }
7605
7606        location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
7607        location.offset = (u64)-1;
7608        location.type = BTRFS_ROOT_ITEM_KEY;
7609
7610        reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
7611        BUG_ON(!reloc_root);
7612        btrfs_orphan_cleanup(reloc_root);
7613        return 0;
7614}
7615
7616static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
7617                                    struct btrfs_root *root)
7618{
7619        struct btrfs_root *reloc_root;
7620        struct extent_buffer *eb;
7621        struct btrfs_root_item *root_item;
7622        struct btrfs_key root_key;
7623        int ret;
7624
7625        BUG_ON(!root->ref_cows);
7626        if (root->reloc_root)
7627                return 0;
7628
7629        root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
7630        BUG_ON(!root_item);
7631
7632        ret = btrfs_copy_root(trans, root, root->commit_root,
7633                              &eb, BTRFS_TREE_RELOC_OBJECTID);
7634        BUG_ON(ret);
7635
7636        root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
7637        root_key.offset = root->root_key.objectid;
7638        root_key.type = BTRFS_ROOT_ITEM_KEY;
7639
7640        memcpy(root_item, &root->root_item, sizeof(root_item));
7641        btrfs_set_root_refs(root_item, 0);
7642        btrfs_set_root_bytenr(root_item, eb->start);
7643        btrfs_set_root_level(root_item, btrfs_header_level(eb));
7644        btrfs_set_root_generation(root_item, trans->transid);
7645
7646        btrfs_tree_unlock(eb);
7647        free_extent_buffer(eb);
7648
7649        ret = btrfs_insert_root(trans, root->fs_info->tree_root,
7650                                &root_key, root_item);
7651        BUG_ON(ret);
7652        kfree(root_item);
7653
7654        reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
7655                                                 &root_key);
7656        BUG_ON(!reloc_root);
7657        reloc_root->last_trans = trans->transid;
7658        reloc_root->commit_root = NULL;
7659        reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
7660
7661        root->reloc_root = reloc_root;
7662        return 0;
7663}
7664
7665/*
7666 * Core function of space balance.
7667 *
7668 * The idea is using reloc trees to relocate tree blocks in reference
7669 * counted roots. There is one reloc tree for each subvol, and all
7670 * reloc trees share same root key objectid. Reloc trees are snapshots
7671 * of the latest committed roots of subvols (root->commit_root).
7672 *
7673 * To relocate a tree block referenced by a subvol, there are two steps.
7674 * COW the block through subvol's reloc tree, then update block pointer
7675 * in the subvol to point to the new block. Since all reloc trees share
7676 * same root key objectid, doing special handing for tree blocks owned
7677 * by them is easy. Once a tree block has been COWed in one reloc tree,
7678 * we can use the resulting new block directly when the same block is
7679 * required to COW again through other reloc trees. By this way, relocated
7680 * tree blocks are shared between reloc trees, so they are also shared
7681 * between subvols.
7682 */
7683static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
7684                                      struct btrfs_root *root,
7685                                      struct btrfs_path *path,
7686                                      struct btrfs_key *first_key,
7687                                      struct btrfs_ref_path *ref_path,
7688                                      struct btrfs_block_group_cache *group,
7689                                      struct inode *reloc_inode)
7690{
7691        struct btrfs_root *reloc_root;
7692        struct extent_buffer *eb = NULL;
7693        struct btrfs_key *keys;
7694        u64 *nodes;
7695        int level;
7696        int shared_level;
7697        int lowest_level = 0;
7698        int ret;
7699
7700        if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
7701                lowest_level = ref_path->owner_objectid;
7702
7703        if (!root->ref_cows) {
7704                path->lowest_level = lowest_level;
7705                ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
7706                BUG_ON(ret < 0);
7707                path->lowest_level = 0;
7708                btrfs_release_path(root, path);
7709                return 0;
7710        }
7711
7712        mutex_lock(&root->fs_info->tree_reloc_mutex);
7713        ret = init_reloc_tree(trans, root);
7714        BUG_ON(ret);
7715        reloc_root = root->reloc_root;
7716
7717        shared_level = ref_path->shared_level;
7718        ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
7719
7720        keys = ref_path->node_keys;
7721        nodes = ref_path->new_nodes;
7722        memset(&keys[shared_level + 1], 0,
7723               sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
7724        memset(&nodes[shared_level + 1], 0,
7725               sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
7726
7727        if (nodes[lowest_level] == 0) {
7728                path->lowest_level = lowest_level;
7729                ret = btrfs_search_slot(trans, reloc_root, first_key, path,
7730                                        0, 1);
7731                BUG_ON(ret);
7732                for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
7733                        eb = path->nodes[level];
7734                        if (!eb || eb == reloc_root->node)
7735                                break;
7736                        nodes[level] = eb->start;
7737                        if (level == 0)
7738                                btrfs_item_key_to_cpu(eb, &keys[level], 0);
7739                        else
7740                                btrfs_node_key_to_cpu(eb, &keys[level], 0);
7741                }
7742                if (nodes[0] &&
7743                    ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7744                        eb = path->nodes[0];
7745                        ret = replace_extents_in_leaf(trans, reloc_root, eb,
7746                                                      group, reloc_inode);
7747                        BUG_ON(ret);
7748                }
7749                btrfs_release_path(reloc_root, path);
7750        } else {
7751                ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
7752                                       lowest_level);
7753                BUG_ON(ret);
7754        }
7755
7756        /*
7757         * replace tree blocks in the fs tree with tree blocks in
7758         * the reloc tree.
7759         */
7760        ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
7761        BUG_ON(ret < 0);
7762
7763        if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7764                ret = btrfs_search_slot(trans, reloc_root, first_key, path,
7765                                        0, 0);
7766                BUG_ON(ret);
7767                extent_buffer_get(path->nodes[0]);
7768                eb = path->nodes[0];
7769                btrfs_release_path(reloc_root, path);
7770                ret = invalidate_extent_cache(reloc_root, eb, group, root);
7771                BUG_ON(ret);
7772                free_extent_buffer(eb);
7773        }
7774
7775        mutex_unlock(&root->fs_info->tree_reloc_mutex);
7776        path->lowest_level = 0;
7777        return 0;
7778}
7779
7780static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
7781                                        struct btrfs_root *root,
7782                                        struct btrfs_path *path,
7783                                        struct btrfs_key *first_key,
7784                                        struct btrfs_ref_path *ref_path)
7785{
7786        int ret;
7787
7788        ret = relocate_one_path(trans, root, path, first_key,
7789                                ref_path, NULL, NULL);
7790        BUG_ON(ret);
7791
7792        return 0;
7793}
7794
7795static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
7796                                    struct btrfs_root *extent_root,
7797                                    struct btrfs_path *path,
7798                                    struct btrfs_key *extent_key)
7799{
7800        int ret;
7801
7802        ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
7803        if (ret)
7804                goto out;
7805        ret = btrfs_del_item(trans, extent_root, path);
7806out:
7807        btrfs_release_path(extent_root, path);
7808        return ret;
7809}
7810
7811static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
7812                                                struct btrfs_ref_path *ref_path)
7813{
7814        struct btrfs_key root_key;
7815
7816        root_key.objectid = ref_path->root_objectid;
7817        root_key.type = BTRFS_ROOT_ITEM_KEY;
7818        if (is_cowonly_root(ref_path->root_objectid))
7819                root_key.offset = 0;
7820        else
7821                root_key.offset = (u64)-1;
7822
7823        return btrfs_read_fs_root_no_name(fs_info, &root_key);
7824}
7825
7826static noinline int relocate_one_extent(struct btrfs_root *extent_root,
7827                                        struct btrfs_path *path,
7828                                        struct btrfs_key *extent_key,
7829                                        struct btrfs_block_group_cache *group,
7830                                        struct inode *reloc_inode, int pass)
7831{
7832        struct btrfs_trans_handle *trans;
7833        struct btrfs_root *found_root;
7834        struct btrfs_ref_path *ref_path = NULL;
7835        struct disk_extent *new_extents = NULL;
7836        int nr_extents = 0;
7837        int loops;
7838        int ret;
7839        int level;
7840        struct btrfs_key first_key;
7841        u64 prev_block = 0;
7842
7843
7844        trans = btrfs_start_transaction(extent_root, 1);
7845        BUG_ON(IS_ERR(trans));
7846
7847        if (extent_key->objectid == 0) {
7848                ret = del_extent_zero(trans, extent_root, path, extent_key);
7849                goto out;
7850        }
7851
7852        ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
7853        if (!ref_path) {
7854                ret = -ENOMEM;
7855                goto out;
7856        }
7857
7858        for (loops = 0; ; loops++) {
7859                if (loops == 0) {
7860                        ret = btrfs_first_ref_path(trans, extent_root, ref_path,
7861                                                   extent_key->objectid);
7862                } else {
7863                        ret = btrfs_next_ref_path(trans, extent_root, ref_path);
7864                }
7865                if (ret < 0)
7866                        goto out;
7867                if (ret > 0)
7868                        break;
7869
7870                if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
7871                    ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
7872                        continue;
7873
7874                found_root = read_ref_root(extent_root->fs_info, ref_path);
7875                BUG_ON(!found_root);
7876                /*
7877                 * for reference counted tree, only process reference paths
7878                 * rooted at the latest committed root.
7879                 */
7880                if (found_root->ref_cows &&
7881                    ref_path->root_generation != found_root->root_key.offset)
7882                        continue;
7883
7884                if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7885                        if (pass == 0) {
7886                                /*
7887                                 * copy data extents to new locations
7888                                 */
7889                                u64 group_start = group->key.objectid;
7890                                ret = relocate_data_extent(reloc_inode,
7891                                                           extent_key,
7892                                                           group_start);
7893                                if (ret < 0)
7894                                        goto out;
7895                                break;
7896                        }
7897                        level = 0;
7898                } else {
7899                        level = ref_path->owner_objectid;
7900                }
7901
7902                if (prev_block != ref_path->nodes[level]) {
7903                        struct extent_buffer *eb;
7904                        u64 block_start = ref_path->nodes[level];
7905                        u64 block_size = btrfs_level_size(found_root, level);
7906
7907                        eb = read_tree_block(found_root, block_start,
7908                                             block_size, 0);
7909                        btrfs_tree_lock(eb);
7910                        BUG_ON(level != btrfs_header_level(eb));
7911
7912                        if (level == 0)
7913                                btrfs_item_key_to_cpu(eb, &first_key, 0);
7914                        else
7915                                btrfs_node_key_to_cpu(eb, &first_key, 0);
7916
7917                        btrfs_tree_unlock(eb);
7918                        free_extent_buffer(eb);
7919                        prev_block = block_start;
7920                }
7921
7922                mutex_lock(&extent_root->fs_info->trans_mutex);
7923                btrfs_record_root_in_trans(found_root);
7924                mutex_unlock(&extent_root->fs_info->trans_mutex);
7925                if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7926                        /*
7927                         * try to update data extent references while
7928                         * keeping metadata shared between snapshots.
7929                         */
7930                        if (pass == 1) {
7931                                ret = relocate_one_path(trans, found_root,
7932                                                path, &first_key, ref_path,
7933                                                group, reloc_inode);
7934                                if (ret < 0)
7935                                        goto out;
7936                                continue;
7937                        }
7938                        /*
7939                         * use fallback method to process the remaining
7940                         * references.
7941                         */
7942                        if (!new_extents) {
7943                                u64 group_start = group->key.objectid;
7944                                new_extents = kmalloc(sizeof(*new_extents),
7945                                                      GFP_NOFS);
7946                                nr_extents = 1;
7947                                ret = get_new_locations(reloc_inode,
7948                                                        extent_key,
7949                                                        group_start, 1,
7950                                                        &new_extents,
7951                                                        &nr_extents);
7952                                if (ret)
7953                                        goto out;
7954                        }
7955                        ret = replace_one_extent(trans, found_root,
7956                                                path, extent_key,
7957                                                &first_key, ref_path,
7958                                                new_extents, nr_extents);
7959                } else {
7960                        ret = relocate_tree_block(trans, found_root, path,
7961                                                  &first_key, ref_path);
7962                }
7963                if (ret < 0)
7964                        goto out;
7965        }
7966        ret = 0;
7967out:
7968        btrfs_end_transaction(trans, extent_root);
7969        kfree(new_extents);
7970        kfree(ref_path);
7971        return ret;
7972}
7973#endif
7974
7975static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7976{
7977        u64 num_devices;
7978        u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7979                BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7980
7981        /*
7982         * we add in the count of missing devices because we want
7983         * to make sure that any RAID levels on a degraded FS
7984         * continue to be honored.
7985         */
7986        num_devices = root->fs_info->fs_devices->rw_devices +
7987                root->fs_info->fs_devices->missing_devices;
7988
7989        if (num_devices == 1) {
7990                stripped |= BTRFS_BLOCK_GROUP_DUP;
7991                stripped = flags & ~stripped;
7992
7993                /* turn raid0 into single device chunks */
7994                if (flags & BTRFS_BLOCK_GROUP_RAID0)
7995                        return stripped;
7996
7997                /* turn mirroring into duplication */
7998                if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
7999                             BTRFS_BLOCK_GROUP_RAID10))
8000                        return stripped | BTRFS_BLOCK_GROUP_DUP;
8001                return flags;
8002        } else {
8003                /* they already had raid on here, just return */
8004                if (flags & stripped)
8005                        return flags;
8006
8007                stripped |= BTRFS_BLOCK_GROUP_DUP;
8008                stripped = flags & ~stripped;
8009
8010                /* switch duplicated blocks with raid1 */
8011                if (flags & BTRFS_BLOCK_GROUP_DUP)
8012                        return stripped | BTRFS_BLOCK_GROUP_RAID1;
8013
8014                /* turn single device chunks into raid0 */
8015                return stripped | BTRFS_BLOCK_GROUP_RAID0;
8016        }
8017        return flags;
8018}
8019
8020static int set_block_group_ro(struct btrfs_block_group_cache *cache)
8021{
8022        struct btrfs_space_info *sinfo = cache->space_info;
8023        u64 num_bytes;
8024        int ret = -ENOSPC;
8025
8026        if (cache->ro)
8027                return 0;
8028
8029        spin_lock(&sinfo->lock);
8030        spin_lock(&cache->lock);
8031        num_bytes = cache->key.offset - cache->reserved - cache->pinned -
8032                    cache->bytes_super - btrfs_block_group_used(&cache->item);
8033
8034        if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
8035            sinfo->bytes_may_use + sinfo->bytes_readonly +
8036            cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
8037                sinfo->bytes_readonly += num_bytes;
8038                sinfo->bytes_reserved += cache->reserved_pinned;
8039                cache->reserved_pinned = 0;
8040                cache->ro = 1;
8041                ret = 0;
8042        }
8043
8044        spin_unlock(&cache->lock);
8045        spin_unlock(&sinfo->lock);
8046        return ret;
8047}
8048
8049int btrfs_set_block_group_ro(struct btrfs_root *root,
8050                             struct btrfs_block_group_cache *cache)
8051
8052{
8053        struct btrfs_trans_handle *trans;
8054        u64 alloc_flags;
8055        int ret;
8056
8057        BUG_ON(cache->ro);
8058
8059        trans = btrfs_join_transaction(root, 1);
8060        BUG_ON(IS_ERR(trans));
8061
8062        alloc_flags = update_block_group_flags(root, cache->flags);
8063        if (alloc_flags != cache->flags)
8064                do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
8065
8066        ret = set_block_group_ro(cache);
8067        if (!ret)
8068                goto out;
8069        alloc_flags = get_alloc_profile(root, cache->space_info->flags);
8070        ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
8071        if (ret < 0)
8072                goto out;
8073        ret = set_block_group_ro(cache);
8074out:
8075        btrfs_end_transaction(trans, root);
8076        return ret;
8077}
8078
8079int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
8080                            struct btrfs_root *root, u64 type)
8081{
8082        u64 alloc_flags = get_alloc_profile(root, type);
8083        return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
8084}
8085
8086/*
8087 * helper to account the unused space of all the readonly block group in the
8088 * list. takes mirrors into account.
8089 */
8090static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
8091{
8092        struct btrfs_block_group_cache *block_group;
8093        u64 free_bytes = 0;
8094        int factor;
8095
8096        list_for_each_entry(block_group, groups_list, list) {
8097                spin_lock(&block_group->lock);
8098
8099                if (!block_group->ro) {
8100                        spin_unlock(&block_group->lock);
8101                        continue;
8102                }
8103
8104                if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
8105                                          BTRFS_BLOCK_GROUP_RAID10 |
8106                                          BTRFS_BLOCK_GROUP_DUP))
8107                        factor = 2;
8108                else
8109                        factor = 1;
8110
8111                free_bytes += (block_group->key.offset -
8112                               btrfs_block_group_used(&block_group->item)) *
8113                               factor;
8114
8115                spin_unlock(&block_group->lock);
8116        }
8117
8118        return free_bytes;
8119}
8120
8121/*
8122 * helper to account the unused space of all the readonly block group in the
8123 * space_info. takes mirrors into account.
8124 */
8125u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
8126{
8127        int i;
8128        u64 free_bytes = 0;
8129
8130        spin_lock(&sinfo->lock);
8131
8132        for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
8133                if (!list_empty(&sinfo->block_groups[i]))
8134                        free_bytes += __btrfs_get_ro_block_group_free_space(
8135                                                &sinfo->block_groups[i]);
8136
8137        spin_unlock(&sinfo->lock);
8138
8139        return free_bytes;
8140}
8141
8142int btrfs_set_block_group_rw(struct btrfs_root *root,
8143                              struct btrfs_block_group_cache *cache)
8144{
8145        struct btrfs_space_info *sinfo = cache->space_info;
8146        u64 num_bytes;
8147
8148        BUG_ON(!cache->ro);
8149
8150        spin_lock(&sinfo->lock);
8151        spin_lock(&cache->lock);
8152        num_bytes = cache->key.offset - cache->reserved - cache->pinned -
8153                    cache->bytes_super - btrfs_block_group_used(&cache->item);
8154        sinfo->bytes_readonly -= num_bytes;
8155        cache->ro = 0;
8156        spin_unlock(&cache->lock);
8157        spin_unlock(&sinfo->lock);
8158        return 0;
8159}
8160
8161/*
8162 * checks to see if its even possible to relocate this block group.
8163 *
8164 * @return - -1 if it's not a good idea to relocate this block group, 0 if its
8165 * ok to go ahead and try.
8166 */
8167int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
8168{
8169        struct btrfs_block_group_cache *block_group;
8170        struct btrfs_space_info *space_info;
8171        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
8172        struct btrfs_device *device;
8173        int full = 0;
8174        int ret = 0;
8175
8176        block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
8177
8178        /* odd, couldn't find the block group, leave it alone */
8179        if (!block_group)
8180                return -1;
8181
8182        /* no bytes used, we're good */
8183        if (!btrfs_block_group_used(&block_group->item))
8184                goto out;
8185
8186        space_info = block_group->space_info;
8187        spin_lock(&space_info->lock);
8188
8189        full = space_info->full;
8190
8191        /*
8192         * if this is the last block group we have in this space, we can't
8193         * relocate it unless we're able to allocate a new chunk below.
8194         *
8195         * Otherwise, we need to make sure we have room in the space to handle
8196         * all of the extents from this block group.  If we can, we're good
8197         */
8198        if ((space_info->total_bytes != block_group->key.offset) &&
8199           (space_info->bytes_used + space_info->bytes_reserved +
8200            space_info->bytes_pinned + space_info->bytes_readonly +
8201            btrfs_block_group_used(&block_group->item) <
8202            space_info->total_bytes)) {
8203                spin_unlock(&space_info->lock);
8204                goto out;
8205        }
8206        spin_unlock(&space_info->lock);
8207
8208        /*
8209         * ok we don't have enough space, but maybe we have free space on our
8210         * devices to allocate new chunks for relocation, so loop through our
8211         * alloc devices and guess if we have enough space.  However, if we
8212         * were marked as full, then we know there aren't enough chunks, and we
8213         * can just return.
8214         */
8215        ret = -1;
8216        if (full)
8217                goto out;
8218
8219        mutex_lock(&root->fs_info->chunk_mutex);
8220        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
8221                u64 min_free = btrfs_block_group_used(&block_group->item);
8222                u64 dev_offset;
8223
8224                /*
8225                 * check to make sure we can actually find a chunk with enough
8226                 * space to fit our block group in.
8227                 */
8228                if (device->total_bytes > device->bytes_used + min_free) {
8229                        ret = find_free_dev_extent(NULL, device, min_free,
8230                                                   &dev_offset, NULL);
8231                        if (!ret)
8232                                break;
8233                        ret = -1;
8234                }
8235        }
8236        mutex_unlock(&root->fs_info->chunk_mutex);
8237out:
8238        btrfs_put_block_group(block_group);
8239        return ret;
8240}
8241
8242static int find_first_block_group(struct btrfs_root *root,
8243                struct btrfs_path *path, struct btrfs_key *key)
8244{
8245        int ret = 0;
8246        struct btrfs_key found_key;
8247        struct extent_buffer *leaf;
8248        int slot;
8249
8250        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
8251        if (ret < 0)
8252                goto out;
8253
8254        while (1) {
8255                slot = path->slots[0];
8256                leaf = path->nodes[0];
8257                if (slot >= btrfs_header_nritems(leaf)) {
8258                        ret = btrfs_next_leaf(root, path);
8259                        if (ret == 0)
8260                                continue;
8261                        if (ret < 0)
8262                                goto out;
8263                        break;
8264                }
8265                btrfs_item_key_to_cpu(leaf, &found_key, slot);
8266
8267                if (found_key.objectid >= key->objectid &&
8268                    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
8269                        ret = 0;
8270                        goto out;
8271                }
8272                path->slots[0]++;
8273        }
8274out:
8275        return ret;
8276}
8277
8278void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
8279{
8280        struct btrfs_block_group_cache *block_group;
8281        u64 last = 0;
8282
8283        while (1) {
8284                struct inode *inode;
8285
8286                block_group = btrfs_lookup_first_block_group(info, last);
8287                while (block_group) {
8288                        spin_lock(&block_group->lock);
8289                        if (block_group->iref)
8290                                break;
8291                        spin_unlock(&block_group->lock);
8292                        block_group = next_block_group(info->tree_root,
8293                                                       block_group);
8294                }
8295                if (!block_group) {
8296                        if (last == 0)
8297                                break;
8298                        last = 0;
8299                        continue;
8300                }
8301
8302                inode = block_group->inode;
8303                block_group->iref = 0;
8304                block_group->inode = NULL;
8305                spin_unlock(&block_group->lock);
8306                iput(inode);
8307                last = block_group->key.objectid + block_group->key.offset;
8308                btrfs_put_block_group(block_group);
8309        }
8310}
8311
8312int btrfs_free_block_groups(struct btrfs_fs_info *info)
8313{
8314        struct btrfs_block_group_cache *block_group;
8315        struct btrfs_space_info *space_info;
8316        struct btrfs_caching_control *caching_ctl;
8317        struct rb_node *n;
8318
8319        down_write(&info->extent_commit_sem);
8320        while (!list_empty(&info->caching_block_groups)) {
8321                caching_ctl = list_entry(info->caching_block_groups.next,
8322                                         struct btrfs_caching_control, list);
8323                list_del(&caching_ctl->list);
8324                put_caching_control(caching_ctl);
8325        }
8326        up_write(&info->extent_commit_sem);
8327
8328        spin_lock(&info->block_group_cache_lock);
8329        while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
8330                block_group = rb_entry(n, struct btrfs_block_group_cache,
8331                                       cache_node);
8332                rb_erase(&block_group->cache_node,
8333                         &info->block_group_cache_tree);
8334                spin_unlock(&info->block_group_cache_lock);
8335
8336                down_write(&block_group->space_info->groups_sem);
8337                list_del(&block_group->list);
8338                up_write(&block_group->space_info->groups_sem);
8339
8340                if (block_group->cached == BTRFS_CACHE_STARTED)
8341                        wait_block_group_cache_done(block_group);
8342
8343                /*
8344                 * We haven't cached this block group, which means we could
8345                 * possibly have excluded extents on this block group.
8346                 */
8347                if (block_group->cached == BTRFS_CACHE_NO)
8348                        free_excluded_extents(info->extent_root, block_group);
8349
8350                btrfs_remove_free_space_cache(block_group);
8351                btrfs_put_block_group(block_group);
8352
8353                spin_lock(&info->block_group_cache_lock);
8354        }
8355        spin_unlock(&info->block_group_cache_lock);
8356
8357        /* now that all the block groups are freed, go through and
8358         * free all the space_info structs.  This is only called during
8359         * the final stages of unmount, and so we know nobody is
8360         * using them.  We call synchronize_rcu() once before we start,
8361         * just to be on the safe side.
8362         */
8363        synchronize_rcu();
8364
8365        release_global_block_rsv(info);
8366
8367        while(!list_empty(&info->space_info)) {
8368                space_info = list_entry(info->space_info.next,
8369                                        struct btrfs_space_info,
8370                                        list);
8371                if (space_info->bytes_pinned > 0 ||
8372                    space_info->bytes_reserved > 0) {
8373                        WARN_ON(1);
8374                        dump_space_info(space_info, 0, 0);
8375                }
8376                list_del(&space_info->list);
8377                kfree(space_info);
8378        }
8379        return 0;
8380}
8381
8382static void __link_block_group(struct btrfs_space_info *space_info,
8383                               struct btrfs_block_group_cache *cache)
8384{
8385        int index = get_block_group_index(cache);
8386
8387        down_write(&space_info->groups_sem);
8388        list_add_tail(&cache->list, &space_info->block_groups[index]);
8389        up_write(&space_info->groups_sem);
8390}
8391
8392int btrfs_read_block_groups(struct btrfs_root *root)
8393{
8394        struct btrfs_path *path;
8395        int ret;
8396        struct btrfs_block_group_cache *cache;
8397        struct btrfs_fs_info *info = root->fs_info;
8398        struct btrfs_space_info *space_info;
8399        struct btrfs_key key;
8400        struct btrfs_key found_key;
8401        struct extent_buffer *leaf;
8402        int need_clear = 0;
8403        u64 cache_gen;
8404
8405        root = info->extent_root;
8406        key.objectid = 0;
8407        key.offset = 0;
8408        btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
8409        path = btrfs_alloc_path();
8410        if (!path)
8411                return -ENOMEM;
8412
8413        cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
8414        if (cache_gen != 0 &&
8415            btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
8416                need_clear = 1;
8417        if (btrfs_test_opt(root, CLEAR_CACHE))
8418                need_clear = 1;
8419        if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
8420                printk(KERN_INFO "btrfs: disk space caching is enabled\n");
8421
8422        while (1) {
8423                ret = find_first_block_group(root, path, &key);
8424                if (ret > 0)
8425                        break;
8426                if (ret != 0)
8427                        goto error;
8428                leaf = path->nodes[0];
8429                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
8430                cache = kzalloc(sizeof(*cache), GFP_NOFS);
8431                if (!cache) {
8432                        ret = -ENOMEM;
8433                        goto error;
8434                }
8435
8436                atomic_set(&cache->count, 1);
8437                spin_lock_init(&cache->lock);
8438                spin_lock_init(&cache->tree_lock);
8439                cache->fs_info = info;
8440                INIT_LIST_HEAD(&cache->list);
8441                INIT_LIST_HEAD(&cache->cluster_list);
8442
8443                if (need_clear)
8444                        cache->disk_cache_state = BTRFS_DC_CLEAR;
8445
8446                /*
8447                 * we only want to have 32k of ram per block group for keeping
8448                 * track of free space, and if we pass 1/2 of that we want to
8449                 * start converting things over to using bitmaps
8450                 */
8451                cache->extents_thresh = ((1024 * 32) / 2) /
8452                        sizeof(struct btrfs_free_space);
8453
8454                read_extent_buffer(leaf, &cache->item,
8455                                   btrfs_item_ptr_offset(leaf, path->slots[0]),
8456                                   sizeof(cache->item));
8457                memcpy(&cache->key, &found_key, sizeof(found_key));
8458
8459                key.objectid = found_key.objectid + found_key.offset;
8460                btrfs_release_path(root, path);
8461                cache->flags = btrfs_block_group_flags(&cache->item);
8462                cache->sectorsize = root->sectorsize;
8463
8464                /*
8465                 * We need to exclude the super stripes now so that the space
8466                 * info has super bytes accounted for, otherwise we'll think
8467                 * we have more space than we actually do.
8468                 */
8469                exclude_super_stripes(root, cache);
8470
8471                /*
8472                 * check for two cases, either we are full, and therefore
8473                 * don't need to bother with the caching work since we won't
8474                 * find any space, or we are empty, and we can just add all
8475                 * the space in and be done with it.  This saves us _alot_ of
8476                 * time, particularly in the full case.
8477                 */
8478                if (found_key.offset == btrfs_block_group_used(&cache->item)) {
8479                        cache->last_byte_to_unpin = (u64)-1;
8480                        cache->cached = BTRFS_CACHE_FINISHED;
8481                        free_excluded_extents(root, cache);
8482                } else if (btrfs_block_group_used(&cache->item) == 0) {
8483                        cache->last_byte_to_unpin = (u64)-1;
8484                        cache->cached = BTRFS_CACHE_FINISHED;
8485                        add_new_free_space(cache, root->fs_info,
8486                                           found_key.objectid,
8487                                           found_key.objectid +
8488                                           found_key.offset);
8489                        free_excluded_extents(root, cache);
8490                }
8491
8492                ret = update_space_info(info, cache->flags, found_key.offset,
8493                                        btrfs_block_group_used(&cache->item),
8494                                        &space_info);
8495                BUG_ON(ret);
8496                cache->space_info = space_info;
8497                spin_lock(&cache->space_info->lock);
8498                cache->space_info->bytes_readonly += cache->bytes_super;
8499                spin_unlock(&cache->space_info->lock);
8500
8501                __link_block_group(space_info, cache);
8502
8503                ret = btrfs_add_block_group_cache(root->fs_info, cache);
8504                BUG_ON(ret);
8505
8506                set_avail_alloc_bits(root->fs_info, cache->flags);
8507                if (btrfs_chunk_readonly(root, cache->key.objectid))
8508                        set_block_group_ro(cache);
8509        }
8510
8511        list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
8512                if (!(get_alloc_profile(root, space_info->flags) &
8513                      (BTRFS_BLOCK_GROUP_RAID10 |
8514                       BTRFS_BLOCK_GROUP_RAID1 |
8515                       BTRFS_BLOCK_GROUP_DUP)))
8516                        continue;
8517                /*
8518                 * avoid allocating from un-mirrored block group if there are
8519                 * mirrored block groups.
8520                 */
8521                list_for_each_entry(cache, &space_info->block_groups[3], list)
8522                        set_block_group_ro(cache);
8523                list_for_each_entry(cache, &space_info->block_groups[4], list)
8524                        set_block_group_ro(cache);
8525        }
8526
8527        init_global_block_rsv(info);
8528        ret = 0;
8529error:
8530        btrfs_free_path(path);
8531        return ret;
8532}
8533
8534int btrfs_make_block_group(struct btrfs_trans_handle *trans,
8535                           struct btrfs_root *root, u64 bytes_used,
8536                           u64 type, u64 chunk_objectid, u64 chunk_offset,
8537                           u64 size)
8538{
8539        int ret;
8540        struct btrfs_root *extent_root;
8541        struct btrfs_block_group_cache *cache;
8542
8543        extent_root = root->fs_info->extent_root;
8544
8545        root->fs_info->last_trans_log_full_commit = trans->transid;
8546
8547        cache = kzalloc(sizeof(*cache), GFP_NOFS);
8548        if (!cache)
8549                return -ENOMEM;
8550
8551        cache->key.objectid = chunk_offset;
8552        cache->key.offset = size;
8553        cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8554        cache->sectorsize = root->sectorsize;
8555        cache->fs_info = root->fs_info;
8556
8557        /*
8558         * we only want to have 32k of ram per block group for keeping track
8559         * of free space, and if we pass 1/2 of that we want to start
8560         * converting things over to using bitmaps
8561         */
8562        cache->extents_thresh = ((1024 * 32) / 2) /
8563                sizeof(struct btrfs_free_space);
8564        atomic_set(&cache->count, 1);
8565        spin_lock_init(&cache->lock);
8566        spin_lock_init(&cache->tree_lock);
8567        INIT_LIST_HEAD(&cache->list);
8568        INIT_LIST_HEAD(&cache->cluster_list);
8569
8570        btrfs_set_block_group_used(&cache->item, bytes_used);
8571        btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
8572        cache->flags = type;
8573        btrfs_set_block_group_flags(&cache->item, type);
8574
8575        cache->last_byte_to_unpin = (u64)-1;
8576        cache->cached = BTRFS_CACHE_FINISHED;
8577        exclude_super_stripes(root, cache);
8578
8579        add_new_free_space(cache, root->fs_info, chunk_offset,
8580                           chunk_offset + size);
8581
8582        free_excluded_extents(root, cache);
8583
8584        ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
8585                                &cache->space_info);
8586        BUG_ON(ret);
8587
8588        spin_lock(&cache->space_info->lock);
8589        cache->space_info->bytes_readonly += cache->bytes_super;
8590        spin_unlock(&cache->space_info->lock);
8591
8592        __link_block_group(cache->space_info, cache);
8593
8594        ret = btrfs_add_block_group_cache(root->fs_info, cache);
8595        BUG_ON(ret);
8596
8597        ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
8598                                sizeof(cache->item));
8599        BUG_ON(ret);
8600
8601        set_avail_alloc_bits(extent_root->fs_info, type);
8602
8603        return 0;
8604}
8605
8606int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8607                             struct btrfs_root *root, u64 group_start)
8608{
8609        struct btrfs_path *path;
8610        struct btrfs_block_group_cache *block_group;
8611        struct btrfs_free_cluster *cluster;
8612        struct btrfs_root *tree_root = root->fs_info->tree_root;
8613        struct btrfs_key key;
8614        struct inode *inode;
8615        int ret;
8616        int factor;
8617
8618        root = root->fs_info->extent_root;
8619
8620        block_group = btrfs_lookup_block_group(root->fs_info, group_start);
8621        BUG_ON(!block_group);
8622        BUG_ON(!block_group->ro);
8623
8624        memcpy(&key, &block_group->key, sizeof(key));
8625        if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
8626                                  BTRFS_BLOCK_GROUP_RAID1 |
8627                                  BTRFS_BLOCK_GROUP_RAID10))
8628                factor = 2;
8629        else
8630                factor = 1;
8631
8632        /* make sure this block group isn't part of an allocation cluster */
8633        cluster = &root->fs_info->data_alloc_cluster;
8634        spin_lock(&cluster->refill_lock);
8635        btrfs_return_cluster_to_free_space(block_group, cluster);
8636        spin_unlock(&cluster->refill_lock);
8637
8638        /*
8639         * make sure this block group isn't part of a metadata
8640         * allocation cluster
8641         */
8642        cluster = &root->fs_info->meta_alloc_cluster;
8643        spin_lock(&cluster->refill_lock);
8644        btrfs_return_cluster_to_free_space(block_group, cluster);
8645        spin_unlock(&cluster->refill_lock);
8646
8647        path = btrfs_alloc_path();
8648        BUG_ON(!path);
8649
8650        inode = lookup_free_space_inode(root, block_group, path);
8651        if (!IS_ERR(inode)) {
8652                btrfs_orphan_add(trans, inode);
8653                clear_nlink(inode);
8654                /* One for the block groups ref */
8655                spin_lock(&block_group->lock);
8656                if (block_group->iref) {
8657                        block_group->iref = 0;
8658                        block_group->inode = NULL;
8659                        spin_unlock(&block_group->lock);
8660                        iput(inode);
8661                } else {
8662                        spin_unlock(&block_group->lock);
8663                }
8664                /* One for our lookup ref */
8665                iput(inode);
8666        }
8667
8668        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
8669        key.offset = block_group->key.objectid;
8670        key.type = 0;
8671
8672        ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
8673        if (ret < 0)
8674                goto out;
8675        if (ret > 0)
8676                btrfs_release_path(tree_root, path);
8677        if (ret == 0) {
8678                ret = btrfs_del_item(trans, tree_root, path);
8679                if (ret)
8680                        goto out;
8681                btrfs_release_path(tree_root, path);
8682        }
8683
8684        spin_lock(&root->fs_info->block_group_cache_lock);
8685        rb_erase(&block_group->cache_node,
8686                 &root->fs_info->block_group_cache_tree);
8687        spin_unlock(&root->fs_info->block_group_cache_lock);
8688
8689        down_write(&block_group->space_info->groups_sem);
8690        /*
8691         * we must use list_del_init so people can check to see if they
8692         * are still on the list after taking the semaphore
8693         */
8694        list_del_init(&block_group->list);
8695        up_write(&block_group->space_info->groups_sem);
8696
8697        if (block_group->cached == BTRFS_CACHE_STARTED)
8698                wait_block_group_cache_done(block_group);
8699
8700        btrfs_remove_free_space_cache(block_group);
8701
8702        spin_lock(&block_group->space_info->lock);
8703        block_group->space_info->total_bytes -= block_group->key.offset;
8704        block_group->space_info->bytes_readonly -= block_group->key.offset;
8705        block_group->space_info->disk_total -= block_group->key.offset * factor;
8706        spin_unlock(&block_group->space_info->lock);
8707
8708        memcpy(&key, &block_group->key, sizeof(key));
8709
8710        btrfs_clear_space_info_full(root->fs_info);
8711
8712        btrfs_put_block_group(block_group);
8713        btrfs_put_block_group(block_group);
8714
8715        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
8716        if (ret > 0)
8717                ret = -EIO;
8718        if (ret < 0)
8719                goto out;
8720
8721        ret = btrfs_del_item(trans, root, path);
8722out:
8723        btrfs_free_path(path);
8724        return ret;
8725}
8726
8727int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8728{
8729        return unpin_extent_range(root, start, end);
8730}
8731
8732int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8733                               u64 num_bytes)
8734{
8735        return btrfs_discard_extent(root, bytenr, num_bytes);
8736}
8737