linux/fs/btrfs/extent-tree.c
   1/*
   2 * Copyright (C) 2007 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   16 * Boston, MA 02111-1307, USA.
  17 */
  18#include <linux/sched.h>
  19#include <linux/pagemap.h>
  20#include <linux/writeback.h>
  21#include <linux/blkdev.h>
  22#include <linux/sort.h>
  23#include <linux/rcupdate.h>
  24#include <linux/kthread.h>
  25#include <linux/slab.h>
  26#include <linux/ratelimit.h>
  27#include "compat.h"
  28#include "hash.h"
  29#include "ctree.h"
  30#include "disk-io.h"
  31#include "print-tree.h"
  32#include "transaction.h"
  33#include "volumes.h"
  34#include "locking.h"
  35#include "free-space-cache.h"
  36
  37/* control flags for do_chunk_alloc's force field
  38 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
  39 * if we really need one.
  40 *
  41 * CHUNK_ALLOC_FORCE means it must try to allocate one
  42 *
   43 * CHUNK_ALLOC_LIMITED means to only try to allocate one
  44 * if we have very few chunks already allocated.  This is
  45 * used as part of the clustering code to help make sure
  46 * we have a good pool of storage to cluster in, without
  47 * filling the FS with empty chunks
  48 *
  49 */
  50enum {
  51        CHUNK_ALLOC_NO_FORCE = 0,
  52        CHUNK_ALLOC_FORCE = 1,
  53        CHUNK_ALLOC_LIMITED = 2,
  54};
  55
  56/*
  57 * Control how reservations are dealt with.
  58 *
  59 * RESERVE_FREE - freeing a reservation.
  60 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
  61 *   ENOSPC accounting
  62 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
  63 *   bytes_may_use as the ENOSPC accounting is done elsewhere
  64 */
  65enum {
  66        RESERVE_FREE = 0,
  67        RESERVE_ALLOC = 1,
  68        RESERVE_ALLOC_NO_ACCOUNT = 2,
  69};
  70
  71static int update_block_group(struct btrfs_trans_handle *trans,
  72                              struct btrfs_root *root,
  73                              u64 bytenr, u64 num_bytes, int alloc);
  74static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
  75                                struct btrfs_root *root,
  76                                u64 bytenr, u64 num_bytes, u64 parent,
  77                                u64 root_objectid, u64 owner_objectid,
  78                                u64 owner_offset, int refs_to_drop,
  79                                struct btrfs_delayed_extent_op *extra_op);
  80static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
  81                                    struct extent_buffer *leaf,
  82                                    struct btrfs_extent_item *ei);
  83static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
  84                                      struct btrfs_root *root,
  85                                      u64 parent, u64 root_objectid,
  86                                      u64 flags, u64 owner, u64 offset,
  87                                      struct btrfs_key *ins, int ref_mod);
  88static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
  89                                     struct btrfs_root *root,
  90                                     u64 parent, u64 root_objectid,
  91                                     u64 flags, struct btrfs_disk_key *key,
  92                                     int level, struct btrfs_key *ins);
  93static int do_chunk_alloc(struct btrfs_trans_handle *trans,
  94                          struct btrfs_root *extent_root, u64 alloc_bytes,
  95                          u64 flags, int force);
  96static int find_next_key(struct btrfs_path *path, int level,
  97                         struct btrfs_key *key);
  98static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
  99                            int dump_block_groups);
 100static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
 101                                       u64 num_bytes, int reserve);
 102
 103static noinline int
 104block_group_cache_done(struct btrfs_block_group_cache *cache)
 105{
 106        smp_mb();
 107        return cache->cached == BTRFS_CACHE_FINISHED;
 108}
 109
 110static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 111{
 112        return (cache->flags & bits) == bits;
 113}
 114
 115static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 116{
 117        atomic_inc(&cache->count);
 118}
 119
 120void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 121{
 122        if (atomic_dec_and_test(&cache->count)) {
 123                WARN_ON(cache->pinned > 0);
 124                WARN_ON(cache->reserved > 0);
 125                kfree(cache->free_space_ctl);
 126                kfree(cache);
 127        }
 128}
 129
 130/*
 131 * this adds the block group to the fs_info rb tree for the block group
 132 * cache
 133 */
 134static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 135                                struct btrfs_block_group_cache *block_group)
 136{
 137        struct rb_node **p;
 138        struct rb_node *parent = NULL;
 139        struct btrfs_block_group_cache *cache;
 140
 141        spin_lock(&info->block_group_cache_lock);
 142        p = &info->block_group_cache_tree.rb_node;
 143
 144        while (*p) {
 145                parent = *p;
 146                cache = rb_entry(parent, struct btrfs_block_group_cache,
 147                                 cache_node);
 148                if (block_group->key.objectid < cache->key.objectid) {
 149                        p = &(*p)->rb_left;
 150                } else if (block_group->key.objectid > cache->key.objectid) {
 151                        p = &(*p)->rb_right;
 152                } else {
 153                        spin_unlock(&info->block_group_cache_lock);
 154                        return -EEXIST;
 155                }
 156        }
 157
 158        rb_link_node(&block_group->cache_node, parent, p);
 159        rb_insert_color(&block_group->cache_node,
 160                        &info->block_group_cache_tree);
 161        spin_unlock(&info->block_group_cache_lock);
 162
 163        return 0;
 164}
 165
 166/*
 167 * This will return the block group at or after bytenr if contains is 0, else
 168 * it will return the block group that contains the bytenr
 169 */
 170static struct btrfs_block_group_cache *
 171block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 172                              int contains)
 173{
 174        struct btrfs_block_group_cache *cache, *ret = NULL;
 175        struct rb_node *n;
 176        u64 end, start;
 177
 178        spin_lock(&info->block_group_cache_lock);
 179        n = info->block_group_cache_tree.rb_node;
 180
 181        while (n) {
 182                cache = rb_entry(n, struct btrfs_block_group_cache,
 183                                 cache_node);
 184                end = cache->key.objectid + cache->key.offset - 1;
 185                start = cache->key.objectid;
 186
 187                if (bytenr < start) {
 188                        if (!contains && (!ret || start < ret->key.objectid))
 189                                ret = cache;
 190                        n = n->rb_left;
 191                } else if (bytenr > start) {
 192                        if (contains && bytenr <= end) {
 193                                ret = cache;
 194                                break;
 195                        }
 196                        n = n->rb_right;
 197                } else {
 198                        ret = cache;
 199                        break;
 200                }
 201        }
 202        if (ret)
 203                btrfs_get_block_group(ret);
 204        spin_unlock(&info->block_group_cache_lock);
 205
 206        return ret;
 207}
 208
 209static int add_excluded_extent(struct btrfs_root *root,
 210                               u64 start, u64 num_bytes)
 211{
 212        u64 end = start + num_bytes - 1;
 213        set_extent_bits(&root->fs_info->freed_extents[0],
 214                        start, end, EXTENT_UPTODATE, GFP_NOFS);
 215        set_extent_bits(&root->fs_info->freed_extents[1],
 216                        start, end, EXTENT_UPTODATE, GFP_NOFS);
 217        return 0;
 218}
 219
 220static void free_excluded_extents(struct btrfs_root *root,
 221                                  struct btrfs_block_group_cache *cache)
 222{
 223        u64 start, end;
 224
 225        start = cache->key.objectid;
 226        end = start + cache->key.offset - 1;
 227
 228        clear_extent_bits(&root->fs_info->freed_extents[0],
 229                          start, end, EXTENT_UPTODATE, GFP_NOFS);
 230        clear_extent_bits(&root->fs_info->freed_extents[1],
 231                          start, end, EXTENT_UPTODATE, GFP_NOFS);
 232}
 233
 234static int exclude_super_stripes(struct btrfs_root *root,
 235                                 struct btrfs_block_group_cache *cache)
 236{
 237        u64 bytenr;
 238        u64 *logical;
 239        int stripe_len;
 240        int i, nr, ret;
 241
 242        if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
 243                stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
 244                cache->bytes_super += stripe_len;
 245                ret = add_excluded_extent(root, cache->key.objectid,
 246                                          stripe_len);
 247                BUG_ON(ret);
 248        }
 249
 250        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 251                bytenr = btrfs_sb_offset(i);
 252                ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
 253                                       cache->key.objectid, bytenr,
 254                                       0, &logical, &nr, &stripe_len);
 255                BUG_ON(ret);
 256
 257                while (nr--) {
 258                        cache->bytes_super += stripe_len;
 259                        ret = add_excluded_extent(root, logical[nr],
 260                                                  stripe_len);
 261                        BUG_ON(ret);
 262                }
 263
 264                kfree(logical);
 265        }
 266        return 0;
 267}
 268
 269static struct btrfs_caching_control *
 270get_caching_control(struct btrfs_block_group_cache *cache)
 271{
 272        struct btrfs_caching_control *ctl;
 273
 274        spin_lock(&cache->lock);
 275        if (cache->cached != BTRFS_CACHE_STARTED) {
 276                spin_unlock(&cache->lock);
 277                return NULL;
 278        }
 279
 280        /* We're loading it the fast way, so we don't have a caching_ctl. */
 281        if (!cache->caching_ctl) {
 282                spin_unlock(&cache->lock);
 283                return NULL;
 284        }
 285
 286        ctl = cache->caching_ctl;
 287        atomic_inc(&ctl->count);
 288        spin_unlock(&cache->lock);
 289        return ctl;
 290}
 291
 292static void put_caching_control(struct btrfs_caching_control *ctl)
 293{
 294        if (atomic_dec_and_test(&ctl->count))
 295                kfree(ctl);
 296}
 297
 298/*
  299 * This is only called by cache_block_group.  Since we could have freed extents,
  300 * we need to check the pinned_extents for any extents that can't be used yet,
  301 * as their free space will be released as soon as the transaction commits.
 302 */
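     /*
      * For example, if the range [start, end) handed to add_new_free_space()
      * overlaps a single pinned extent, only the gaps before and after that
      * extent are passed to btrfs_add_free_space(); the pinned bytes become
      * free space only after the transaction commits.
      */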
 303static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 304                              struct btrfs_fs_info *info, u64 start, u64 end)
 305{
 306        u64 extent_start, extent_end, size, total_added = 0;
 307        int ret;
 308
 309        while (start < end) {
 310                ret = find_first_extent_bit(info->pinned_extents, start,
 311                                            &extent_start, &extent_end,
 312                                            EXTENT_DIRTY | EXTENT_UPTODATE);
 313                if (ret)
 314                        break;
 315
 316                if (extent_start <= start) {
 317                        start = extent_end + 1;
 318                } else if (extent_start > start && extent_start < end) {
 319                        size = extent_start - start;
 320                        total_added += size;
 321                        ret = btrfs_add_free_space(block_group, start,
 322                                                   size);
 323                        BUG_ON(ret);
 324                        start = extent_end + 1;
 325                } else {
 326                        break;
 327                }
 328        }
 329
 330        if (start < end) {
 331                size = end - start;
 332                total_added += size;
 333                ret = btrfs_add_free_space(block_group, start, size);
 334                BUG_ON(ret);
 335        }
 336
 337        return total_added;
 338}
 339
 340static noinline void caching_thread(struct btrfs_work *work)
 341{
 342        struct btrfs_block_group_cache *block_group;
 343        struct btrfs_fs_info *fs_info;
 344        struct btrfs_caching_control *caching_ctl;
 345        struct btrfs_root *extent_root;
 346        struct btrfs_path *path;
 347        struct extent_buffer *leaf;
 348        struct btrfs_key key;
 349        u64 total_found = 0;
 350        u64 last = 0;
 351        u32 nritems;
 352        int ret = 0;
 353
 354        caching_ctl = container_of(work, struct btrfs_caching_control, work);
 355        block_group = caching_ctl->block_group;
 356        fs_info = block_group->fs_info;
 357        extent_root = fs_info->extent_root;
 358
 359        path = btrfs_alloc_path();
 360        if (!path)
 361                goto out;
 362
 363        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 364
 365        /*
 366         * We don't want to deadlock with somebody trying to allocate a new
 367         * extent for the extent root while also trying to search the extent
 368         * root to add free space.  So we skip locking and search the commit
  369         * root, since it's read-only.
 370         */
 371        path->skip_locking = 1;
 372        path->search_commit_root = 1;
 373        path->reada = 1;
 374
 375        key.objectid = last;
 376        key.offset = 0;
 377        key.type = BTRFS_EXTENT_ITEM_KEY;
 378again:
 379        mutex_lock(&caching_ctl->mutex);
 380        /* need to make sure the commit_root doesn't disappear */
 381        down_read(&fs_info->extent_commit_sem);
 382
 383        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 384        if (ret < 0)
 385                goto err;
 386
 387        leaf = path->nodes[0];
 388        nritems = btrfs_header_nritems(leaf);
 389
 390        while (1) {
 391                if (btrfs_fs_closing(fs_info) > 1) {
 392                        last = (u64)-1;
 393                        break;
 394                }
 395
 396                if (path->slots[0] < nritems) {
 397                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 398                } else {
 399                        ret = find_next_key(path, 0, &key);
 400                        if (ret)
 401                                break;
 402
 403                        if (need_resched() ||
 404                            btrfs_next_leaf(extent_root, path)) {
 405                                caching_ctl->progress = last;
 406                                btrfs_release_path(path);
 407                                up_read(&fs_info->extent_commit_sem);
 408                                mutex_unlock(&caching_ctl->mutex);
 409                                cond_resched();
 410                                goto again;
 411                        }
 412                        leaf = path->nodes[0];
 413                        nritems = btrfs_header_nritems(leaf);
 414                        continue;
 415                }
 416
 417                if (key.objectid < block_group->key.objectid) {
 418                        path->slots[0]++;
 419                        continue;
 420                }
 421
 422                if (key.objectid >= block_group->key.objectid +
 423                    block_group->key.offset)
 424                        break;
 425
 426                if (key.type == BTRFS_EXTENT_ITEM_KEY) {
 427                        total_found += add_new_free_space(block_group,
 428                                                          fs_info, last,
 429                                                          key.objectid);
 430                        last = key.objectid + key.offset;
 431
 432                        if (total_found > (1024 * 1024 * 2)) {
 433                                total_found = 0;
 434                                wake_up(&caching_ctl->wait);
 435                        }
 436                }
 437                path->slots[0]++;
 438        }
 439        ret = 0;
 440
 441        total_found += add_new_free_space(block_group, fs_info, last,
 442                                          block_group->key.objectid +
 443                                          block_group->key.offset);
 444        caching_ctl->progress = (u64)-1;
 445
 446        spin_lock(&block_group->lock);
 447        block_group->caching_ctl = NULL;
 448        block_group->cached = BTRFS_CACHE_FINISHED;
 449        spin_unlock(&block_group->lock);
 450
 451err:
 452        btrfs_free_path(path);
 453        up_read(&fs_info->extent_commit_sem);
 454
 455        free_excluded_extents(extent_root, block_group);
 456
 457        mutex_unlock(&caching_ctl->mutex);
 458out:
 459        wake_up(&caching_ctl->wait);
 460
 461        put_caching_control(caching_ctl);
 462        btrfs_put_block_group(block_group);
 463}
 464
 465static int cache_block_group(struct btrfs_block_group_cache *cache,
 466                             struct btrfs_trans_handle *trans,
 467                             struct btrfs_root *root,
 468                             int load_cache_only)
 469{
 470        DEFINE_WAIT(wait);
 471        struct btrfs_fs_info *fs_info = cache->fs_info;
 472        struct btrfs_caching_control *caching_ctl;
 473        int ret = 0;
 474
 475        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 476        BUG_ON(!caching_ctl);
 477
 478        INIT_LIST_HEAD(&caching_ctl->list);
 479        mutex_init(&caching_ctl->mutex);
 480        init_waitqueue_head(&caching_ctl->wait);
 481        caching_ctl->block_group = cache;
 482        caching_ctl->progress = cache->key.objectid;
 483        atomic_set(&caching_ctl->count, 1);
 484        caching_ctl->work.func = caching_thread;
 485
 486        spin_lock(&cache->lock);
 487        /*
 488         * This should be a rare occasion, but this could happen I think in the
 489         * case where one thread starts to load the space cache info, and then
 490         * some other thread starts a transaction commit which tries to do an
 491         * allocation while the other thread is still loading the space cache
 492         * info.  The previous loop should have kept us from choosing this block
 493         * group, but if we've moved to the state where we will wait on caching
 494         * block groups we need to first check if we're doing a fast load here,
 495         * so we can wait for it to finish, otherwise we could end up allocating
  496         * from a block group whose cache gets evicted for one reason or
 497         * another.
 498         */
 499        while (cache->cached == BTRFS_CACHE_FAST) {
 500                struct btrfs_caching_control *ctl;
 501
 502                ctl = cache->caching_ctl;
 503                atomic_inc(&ctl->count);
 504                prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
 505                spin_unlock(&cache->lock);
 506
 507                schedule();
 508
 509                finish_wait(&ctl->wait, &wait);
 510                put_caching_control(ctl);
 511                spin_lock(&cache->lock);
 512        }
 513
 514        if (cache->cached != BTRFS_CACHE_NO) {
 515                spin_unlock(&cache->lock);
 516                kfree(caching_ctl);
 517                return 0;
 518        }
 519        WARN_ON(cache->caching_ctl);
 520        cache->caching_ctl = caching_ctl;
 521        cache->cached = BTRFS_CACHE_FAST;
 522        spin_unlock(&cache->lock);
 523
 524        /*
 525         * We can't do the read from on-disk cache during a commit since we need
 526         * to have the normal tree locking.  Also if we are currently trying to
 527         * allocate blocks for the tree root we can't do the fast caching since
 528         * we likely hold important locks.
 529         */
 530        if (trans && (!trans->transaction->in_commit) &&
 531            (root && root != root->fs_info->tree_root) &&
 532            btrfs_test_opt(root, SPACE_CACHE)) {
 533                ret = load_free_space_cache(fs_info, cache);
 534
 535                spin_lock(&cache->lock);
 536                if (ret == 1) {
 537                        cache->caching_ctl = NULL;
 538                        cache->cached = BTRFS_CACHE_FINISHED;
 539                        cache->last_byte_to_unpin = (u64)-1;
 540                } else {
 541                        if (load_cache_only) {
 542                                cache->caching_ctl = NULL;
 543                                cache->cached = BTRFS_CACHE_NO;
 544                        } else {
 545                                cache->cached = BTRFS_CACHE_STARTED;
 546                        }
 547                }
 548                spin_unlock(&cache->lock);
 549                wake_up(&caching_ctl->wait);
 550                if (ret == 1) {
 551                        put_caching_control(caching_ctl);
 552                        free_excluded_extents(fs_info->extent_root, cache);
 553                        return 0;
 554                }
 555        } else {
 556                /*
  557                 * We are not going to do the fast caching, so set cached to the
  558                 * appropriate value and wake up any waiters.
 559                 */
 560                spin_lock(&cache->lock);
 561                if (load_cache_only) {
 562                        cache->caching_ctl = NULL;
 563                        cache->cached = BTRFS_CACHE_NO;
 564                } else {
 565                        cache->cached = BTRFS_CACHE_STARTED;
 566                }
 567                spin_unlock(&cache->lock);
 568                wake_up(&caching_ctl->wait);
 569        }
 570
 571        if (load_cache_only) {
 572                put_caching_control(caching_ctl);
 573                return 0;
 574        }
 575
 576        down_write(&fs_info->extent_commit_sem);
 577        atomic_inc(&caching_ctl->count);
 578        list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 579        up_write(&fs_info->extent_commit_sem);
 580
 581        btrfs_get_block_group(cache);
 582
 583        btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
 584
 585        return ret;
 586}
 587
 588/*
 589 * return the block group that starts at or after bytenr
 590 */
 591static struct btrfs_block_group_cache *
 592btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
 593{
 594        struct btrfs_block_group_cache *cache;
 595
 596        cache = block_group_cache_tree_search(info, bytenr, 0);
 597
 598        return cache;
 599}
 600
 601/*
 602 * return the block group that contains the given bytenr
 603 */
 604struct btrfs_block_group_cache *btrfs_lookup_block_group(
 605                                                 struct btrfs_fs_info *info,
 606                                                 u64 bytenr)
 607{
 608        struct btrfs_block_group_cache *cache;
 609
 610        cache = block_group_cache_tree_search(info, bytenr, 1);
 611
 612        return cache;
 613}
 614
 615static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 616                                                  u64 flags)
 617{
 618        struct list_head *head = &info->space_info;
 619        struct btrfs_space_info *found;
 620
 621        flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
 622                 BTRFS_BLOCK_GROUP_METADATA;
 623
 624        rcu_read_lock();
 625        list_for_each_entry_rcu(found, head, list) {
 626                if (found->flags & flags) {
 627                        rcu_read_unlock();
 628                        return found;
 629                }
 630        }
 631        rcu_read_unlock();
 632        return NULL;
 633}
 634
 635/*
 636 * after adding space to the filesystem, we need to clear the full flags
 637 * on all the space infos.
 638 */
 639void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
 640{
 641        struct list_head *head = &info->space_info;
 642        struct btrfs_space_info *found;
 643
 644        rcu_read_lock();
 645        list_for_each_entry_rcu(found, head, list)
 646                found->full = 0;
 647        rcu_read_unlock();
 648}
 649
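     /*
      * Scale num by factor tenths (div_factor) or factor hundredths
      * (div_factor_fine).  For example, div_factor(1000, 9) returns 900 and
      * div_factor_fine(1000, 75) returns 750.
      */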
 650static u64 div_factor(u64 num, int factor)
 651{
 652        if (factor == 10)
 653                return num;
 654        num *= factor;
 655        do_div(num, 10);
 656        return num;
 657}
 658
 659static u64 div_factor_fine(u64 num, int factor)
 660{
 661        if (factor == 100)
 662                return num;
 663        num *= factor;
 664        do_div(num, 100);
 665        return num;
 666}
 667
 668u64 btrfs_find_block_group(struct btrfs_root *root,
 669                           u64 search_start, u64 search_hint, int owner)
 670{
 671        struct btrfs_block_group_cache *cache;
 672        u64 used;
 673        u64 last = max(search_hint, search_start);
 674        u64 group_start = 0;
 675        int full_search = 0;
 676        int factor = 9;
 677        int wrapped = 0;
 678again:
 679        while (1) {
 680                cache = btrfs_lookup_first_block_group(root->fs_info, last);
 681                if (!cache)
 682                        break;
 683
 684                spin_lock(&cache->lock);
 685                last = cache->key.objectid + cache->key.offset;
 686                used = btrfs_block_group_used(&cache->item);
 687
 688                if ((full_search || !cache->ro) &&
 689                    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
 690                        if (used + cache->pinned + cache->reserved <
 691                            div_factor(cache->key.offset, factor)) {
 692                                group_start = cache->key.objectid;
 693                                spin_unlock(&cache->lock);
 694                                btrfs_put_block_group(cache);
 695                                goto found;
 696                        }
 697                }
 698                spin_unlock(&cache->lock);
 699                btrfs_put_block_group(cache);
 700                cond_resched();
 701        }
 702        if (!wrapped) {
 703                last = search_start;
 704                wrapped = 1;
 705                goto again;
 706        }
 707        if (!full_search && factor < 10) {
 708                last = search_start;
 709                full_search = 1;
 710                factor = 10;
 711                goto again;
 712        }
 713found:
 714        return group_start;
 715}
 716
 717/* simple helper to search for an existing extent at a given offset */
 718int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 719{
 720        int ret;
 721        struct btrfs_key key;
 722        struct btrfs_path *path;
 723
 724        path = btrfs_alloc_path();
 725        if (!path)
 726                return -ENOMEM;
 727
 728        key.objectid = start;
 729        key.offset = len;
 730        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
 731        ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 732                                0, 0);
 733        btrfs_free_path(path);
 734        return ret;
 735}
 736
 737/*
  738 * helper function to look up the reference count and flags of an extent.
  739 *
  740 * The head node for a delayed ref is used to store the sum of all the
  741 * reference count modifications queued up in the rbtree.  The head
  742 * node may also store the extent flags to set.  This way you can check
  743 * what the reference count and extent flags will be once all of the
  744 * queued delayed refs are processed, without having to run them.
 745 */
 746int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 747                             struct btrfs_root *root, u64 bytenr,
 748                             u64 num_bytes, u64 *refs, u64 *flags)
 749{
 750        struct btrfs_delayed_ref_head *head;
 751        struct btrfs_delayed_ref_root *delayed_refs;
 752        struct btrfs_path *path;
 753        struct btrfs_extent_item *ei;
 754        struct extent_buffer *leaf;
 755        struct btrfs_key key;
 756        u32 item_size;
 757        u64 num_refs;
 758        u64 extent_flags;
 759        int ret;
 760
 761        path = btrfs_alloc_path();
 762        if (!path)
 763                return -ENOMEM;
 764
 765        key.objectid = bytenr;
 766        key.type = BTRFS_EXTENT_ITEM_KEY;
 767        key.offset = num_bytes;
 768        if (!trans) {
 769                path->skip_locking = 1;
 770                path->search_commit_root = 1;
 771        }
 772again:
 773        ret = btrfs_search_slot(trans, root->fs_info->extent_root,
 774                                &key, path, 0, 0);
 775        if (ret < 0)
 776                goto out_free;
 777
 778        if (ret == 0) {
 779                leaf = path->nodes[0];
 780                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 781                if (item_size >= sizeof(*ei)) {
 782                        ei = btrfs_item_ptr(leaf, path->slots[0],
 783                                            struct btrfs_extent_item);
 784                        num_refs = btrfs_extent_refs(leaf, ei);
 785                        extent_flags = btrfs_extent_flags(leaf, ei);
 786                } else {
 787#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 788                        struct btrfs_extent_item_v0 *ei0;
 789                        BUG_ON(item_size != sizeof(*ei0));
 790                        ei0 = btrfs_item_ptr(leaf, path->slots[0],
 791                                             struct btrfs_extent_item_v0);
 792                        num_refs = btrfs_extent_refs_v0(leaf, ei0);
 793                        /* FIXME: this isn't correct for data */
 794                        extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
 795#else
 796                        BUG();
 797#endif
 798                }
 799                BUG_ON(num_refs == 0);
 800        } else {
 801                num_refs = 0;
 802                extent_flags = 0;
 803                ret = 0;
 804        }
 805
 806        if (!trans)
 807                goto out;
 808
 809        delayed_refs = &trans->transaction->delayed_refs;
 810        spin_lock(&delayed_refs->lock);
 811        head = btrfs_find_delayed_ref_head(trans, bytenr);
 812        if (head) {
 813                if (!mutex_trylock(&head->mutex)) {
 814                        atomic_inc(&head->node.refs);
 815                        spin_unlock(&delayed_refs->lock);
 816
 817                        btrfs_release_path(path);
 818
 819                        /*
 820                         * Mutex was contended, block until it's released and try
 821                         * again
 822                         */
 823                        mutex_lock(&head->mutex);
 824                        mutex_unlock(&head->mutex);
 825                        btrfs_put_delayed_ref(&head->node);
 826                        goto again;
 827                }
 828                if (head->extent_op && head->extent_op->update_flags)
 829                        extent_flags |= head->extent_op->flags_to_set;
 830                else
 831                        BUG_ON(num_refs == 0);
 832
 833                num_refs += head->node.ref_mod;
 834                mutex_unlock(&head->mutex);
 835        }
 836        spin_unlock(&delayed_refs->lock);
 837out:
 838        WARN_ON(num_refs == 0);
 839        if (refs)
 840                *refs = num_refs;
 841        if (flags)
 842                *flags = extent_flags;
 843out_free:
 844        btrfs_free_path(path);
 845        return ret;
 846}
 847
 848/*
 849 * Back reference rules.  Back refs have three main goals:
 850 *
 851 * 1) differentiate between all holders of references to an extent so that
 852 *    when a reference is dropped we can make sure it was a valid reference
 853 *    before freeing the extent.
 854 *
 855 * 2) Provide enough information to quickly find the holders of an extent
 856 *    if we notice a given block is corrupted or bad.
 857 *
 858 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 859 *    maintenance.  This is actually the same as #2, but with a slightly
 860 *    different use case.
 861 *
 862 * There are two kinds of back refs. The implicit back refs is optimized
 863 * for pointers in non-shared tree blocks. For a given pointer in a block,
 864 * back refs of this kind provide information about the block's owner tree
  865 * and the pointer's key. This information allows us to find the block by
 866 * b-tree searching. The full back refs is for pointers in tree blocks not
 867 * referenced by their owner trees. The location of tree block is recorded
 868 * in the back refs. Actually the full back refs is generic, and can be
 869 * used in all cases the implicit back refs is used. The major shortcoming
 870 * of the full back refs is its overhead. Every time a tree block gets
 871 * COWed, we have to update back refs entry for all pointers in it.
 872 *
 873 * For a newly allocated tree block, we use implicit back refs for
 874 * pointers in it. This means most tree related operations only involve
  875 * implicit back refs. For a tree block created in an old transaction, the
  876 * only way to drop a reference to it is to COW it. So we can detect the
 877 * event that tree block loses its owner tree's reference and do the
 878 * back refs conversion.
 879 *
 880 * When a tree block is COW'd through a tree, there are four cases:
 881 *
 882 * The reference count of the block is one and the tree is the block's
 883 * owner tree. Nothing to do in this case.
 884 *
 885 * The reference count of the block is one and the tree is not the
 886 * block's owner tree. In this case, full back refs is used for pointers
 887 * in the block. Remove these full back refs, add implicit back refs for
  888 * every pointer in the new block.
 889 *
 890 * The reference count of the block is greater than one and the tree is
 891 * the block's owner tree. In this case, implicit back refs is used for
  892 * pointers in the block. Add full back refs for every pointer in the
  893 * block, increase lower level extents' reference counts. The original
  894 * implicit back refs are carried over to the new block.
 895 *
 896 * The reference count of the block is greater than one and the tree is
 897 * not the block's owner tree. Add implicit back refs for every pointer in
 898 * the new block, increase lower level extents' reference count.
 899 *
 900 * Back Reference Key composing:
 901 *
  902 * The key objectid corresponds to the first byte in the extent.
 903 * The key type is used to differentiate between types of back refs.
 904 * There are different meanings of the key offset for different types
 905 * of back refs.
 906 *
 907 * File extents can be referenced by:
 908 *
 909 * - multiple snapshots, subvolumes, or different generations in one subvol
 910 * - different files inside a single subvolume
 911 * - different offsets inside a file (bookend extents in file.c)
 912 *
 913 * The extent ref structure for the implicit back refs has fields for:
 914 *
 915 * - Objectid of the subvolume root
 916 * - objectid of the file holding the reference
 917 * - original offset in the file
 918 * - how many bookend extents
 919 *
  920 * The key offset for the implicit back refs is a hash of the first
 921 * three fields.
 922 *
  923 * The extent ref structure for the full back refs has a field for:
 924 *
 925 * - number of pointers in the tree leaf
 926 *
  927 * The key offset for the full back refs is the first byte of
  928 * the tree leaf.
 929 *
  930 * When a file extent is allocated, the implicit back refs is used
  931 * and the fields are filled in:
 932 *
 933 *     (root_key.objectid, inode objectid, offset in file, 1)
 934 *
  935 * When a file extent is removed by file truncation, we find the
 936 * corresponding implicit back refs and check the following fields:
 937 *
 938 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 939 *
 940 * Btree extents can be referenced by:
 941 *
 942 * - Different subvolumes
 943 *
 944 * Both the implicit back refs and the full back refs for tree blocks
  945 * only consist of a key. The key offset for the implicit back refs is
  946 * the objectid of the block's owner tree. The key offset for the full back
  947 * refs is the first byte of the parent block.
 948 *
  949 * When implicit back refs is used, information about the lowest key and
  950 * level of the tree block is required. This information is stored in
  951 * the tree block info structure.
 952 */
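     /*
      * For example, a data extent at bytenr X referenced from inode 257 at
      * file offset 0 in the subvolume tree with objectid 5 carries an
      * implicit back ref keyed (X, BTRFS_EXTENT_DATA_REF_KEY, hash(5, 257, 0)),
      * while the same extent referenced through a shared leaf at bytenr P
      * carries a full back ref keyed (X, BTRFS_SHARED_DATA_REF_KEY, P).
      */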
 953
 954#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 955static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
 956                                  struct btrfs_root *root,
 957                                  struct btrfs_path *path,
 958                                  u64 owner, u32 extra_size)
 959{
 960        struct btrfs_extent_item *item;
 961        struct btrfs_extent_item_v0 *ei0;
 962        struct btrfs_extent_ref_v0 *ref0;
 963        struct btrfs_tree_block_info *bi;
 964        struct extent_buffer *leaf;
 965        struct btrfs_key key;
 966        struct btrfs_key found_key;
 967        u32 new_size = sizeof(*item);
 968        u64 refs;
 969        int ret;
 970
 971        leaf = path->nodes[0];
 972        BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
 973
 974        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 975        ei0 = btrfs_item_ptr(leaf, path->slots[0],
 976                             struct btrfs_extent_item_v0);
 977        refs = btrfs_extent_refs_v0(leaf, ei0);
 978
 979        if (owner == (u64)-1) {
 980                while (1) {
 981                        if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 982                                ret = btrfs_next_leaf(root, path);
 983                                if (ret < 0)
 984                                        return ret;
 985                                BUG_ON(ret > 0);
 986                                leaf = path->nodes[0];
 987                        }
 988                        btrfs_item_key_to_cpu(leaf, &found_key,
 989                                              path->slots[0]);
 990                        BUG_ON(key.objectid != found_key.objectid);
 991                        if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
 992                                path->slots[0]++;
 993                                continue;
 994                        }
 995                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
 996                                              struct btrfs_extent_ref_v0);
 997                        owner = btrfs_ref_objectid_v0(leaf, ref0);
 998                        break;
 999                }
1000        }
1001        btrfs_release_path(path);
1002
1003        if (owner < BTRFS_FIRST_FREE_OBJECTID)
1004                new_size += sizeof(*bi);
1005
1006        new_size -= sizeof(*ei0);
1007        ret = btrfs_search_slot(trans, root, &key, path,
1008                                new_size + extra_size, 1);
1009        if (ret < 0)
1010                return ret;
1011        BUG_ON(ret);
1012
1013        ret = btrfs_extend_item(trans, root, path, new_size);
1014
1015        leaf = path->nodes[0];
1016        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1017        btrfs_set_extent_refs(leaf, item, refs);
1018        /* FIXME: get real generation */
1019        btrfs_set_extent_generation(leaf, item, 0);
1020        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1021                btrfs_set_extent_flags(leaf, item,
1022                                       BTRFS_EXTENT_FLAG_TREE_BLOCK |
1023                                       BTRFS_BLOCK_FLAG_FULL_BACKREF);
1024                bi = (struct btrfs_tree_block_info *)(item + 1);
1025                /* FIXME: get first key of the block */
1026                memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
1027                btrfs_set_tree_block_level(leaf, bi, (int)owner);
1028        } else {
1029                btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1030        }
1031        btrfs_mark_buffer_dirty(leaf);
1032        return 0;
1033}
1034#endif
1035
1036static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1037{
1038        u32 high_crc = ~(u32)0;
1039        u32 low_crc = ~(u32)0;
1040        __le64 lenum;
1041
1042        lenum = cpu_to_le64(root_objectid);
1043        high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1044        lenum = cpu_to_le64(owner);
1045        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1046        lenum = cpu_to_le64(offset);
1047        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1048
1049        return ((u64)high_crc << 31) ^ (u64)low_crc;
1050}
1051
1052static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1053                                     struct btrfs_extent_data_ref *ref)
1054{
1055        return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1056                                    btrfs_extent_data_ref_objectid(leaf, ref),
1057                                    btrfs_extent_data_ref_offset(leaf, ref));
1058}
1059
1060static int match_extent_data_ref(struct extent_buffer *leaf,
1061                                 struct btrfs_extent_data_ref *ref,
1062                                 u64 root_objectid, u64 owner, u64 offset)
1063{
1064        if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1065            btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1066            btrfs_extent_data_ref_offset(leaf, ref) != offset)
1067                return 0;
1068        return 1;
1069}
1070
1071static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1072                                           struct btrfs_root *root,
1073                                           struct btrfs_path *path,
1074                                           u64 bytenr, u64 parent,
1075                                           u64 root_objectid,
1076                                           u64 owner, u64 offset)
1077{
1078        struct btrfs_key key;
1079        struct btrfs_extent_data_ref *ref;
1080        struct extent_buffer *leaf;
1081        u32 nritems;
1082        int ret;
1083        int recow;
1084        int err = -ENOENT;
1085
1086        key.objectid = bytenr;
1087        if (parent) {
1088                key.type = BTRFS_SHARED_DATA_REF_KEY;
1089                key.offset = parent;
1090        } else {
1091                key.type = BTRFS_EXTENT_DATA_REF_KEY;
1092                key.offset = hash_extent_data_ref(root_objectid,
1093                                                  owner, offset);
1094        }
1095again:
1096        recow = 0;
1097        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1098        if (ret < 0) {
1099                err = ret;
1100                goto fail;
1101        }
1102
1103        if (parent) {
1104                if (!ret)
1105                        return 0;
1106#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1107                key.type = BTRFS_EXTENT_REF_V0_KEY;
1108                btrfs_release_path(path);
1109                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1110                if (ret < 0) {
1111                        err = ret;
1112                        goto fail;
1113                }
1114                if (!ret)
1115                        return 0;
1116#endif
1117                goto fail;
1118        }
1119
1120        leaf = path->nodes[0];
1121        nritems = btrfs_header_nritems(leaf);
1122        while (1) {
1123                if (path->slots[0] >= nritems) {
1124                        ret = btrfs_next_leaf(root, path);
1125                        if (ret < 0)
1126                                err = ret;
1127                        if (ret)
1128                                goto fail;
1129
1130                        leaf = path->nodes[0];
1131                        nritems = btrfs_header_nritems(leaf);
1132                        recow = 1;
1133                }
1134
1135                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1136                if (key.objectid != bytenr ||
1137                    key.type != BTRFS_EXTENT_DATA_REF_KEY)
1138                        goto fail;
1139
1140                ref = btrfs_item_ptr(leaf, path->slots[0],
1141                                     struct btrfs_extent_data_ref);
1142
1143                if (match_extent_data_ref(leaf, ref, root_objectid,
1144                                          owner, offset)) {
1145                        if (recow) {
1146                                btrfs_release_path(path);
1147                                goto again;
1148                        }
1149                        err = 0;
1150                        break;
1151                }
1152                path->slots[0]++;
1153        }
1154fail:
1155        return err;
1156}
1157
1158static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1159                                           struct btrfs_root *root,
1160                                           struct btrfs_path *path,
1161                                           u64 bytenr, u64 parent,
1162                                           u64 root_objectid, u64 owner,
1163                                           u64 offset, int refs_to_add)
1164{
1165        struct btrfs_key key;
1166        struct extent_buffer *leaf;
1167        u32 size;
1168        u32 num_refs;
1169        int ret;
1170
1171        key.objectid = bytenr;
1172        if (parent) {
1173                key.type = BTRFS_SHARED_DATA_REF_KEY;
1174                key.offset = parent;
1175                size = sizeof(struct btrfs_shared_data_ref);
1176        } else {
1177                key.type = BTRFS_EXTENT_DATA_REF_KEY;
1178                key.offset = hash_extent_data_ref(root_objectid,
1179                                                  owner, offset);
1180                size = sizeof(struct btrfs_extent_data_ref);
1181        }
1182
1183        ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1184        if (ret && ret != -EEXIST)
1185                goto fail;
1186
1187        leaf = path->nodes[0];
1188        if (parent) {
1189                struct btrfs_shared_data_ref *ref;
1190                ref = btrfs_item_ptr(leaf, path->slots[0],
1191                                     struct btrfs_shared_data_ref);
1192                if (ret == 0) {
1193                        btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1194                } else {
1195                        num_refs = btrfs_shared_data_ref_count(leaf, ref);
1196                        num_refs += refs_to_add;
1197                        btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1198                }
1199        } else {
1200                struct btrfs_extent_data_ref *ref;
1201                while (ret == -EEXIST) {
1202                        ref = btrfs_item_ptr(leaf, path->slots[0],
1203                                             struct btrfs_extent_data_ref);
1204                        if (match_extent_data_ref(leaf, ref, root_objectid,
1205                                                  owner, offset))
1206                                break;
1207                        btrfs_release_path(path);
1208                        key.offset++;
1209                        ret = btrfs_insert_empty_item(trans, root, path, &key,
1210                                                      size);
1211                        if (ret && ret != -EEXIST)
1212                                goto fail;
1213
1214                        leaf = path->nodes[0];
1215                }
1216                ref = btrfs_item_ptr(leaf, path->slots[0],
1217                                     struct btrfs_extent_data_ref);
1218                if (ret == 0) {
1219                        btrfs_set_extent_data_ref_root(leaf, ref,
1220                                                       root_objectid);
1221                        btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1222                        btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1223                        btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1224                } else {
1225                        num_refs = btrfs_extent_data_ref_count(leaf, ref);
1226                        num_refs += refs_to_add;
1227                        btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1228                }
1229        }
1230        btrfs_mark_buffer_dirty(leaf);
1231        ret = 0;
1232fail:
1233        btrfs_release_path(path);
1234        return ret;
1235}
1236
1237static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1238                                           struct btrfs_root *root,
1239                                           struct btrfs_path *path,
1240                                           int refs_to_drop)
1241{
1242        struct btrfs_key key;
1243        struct btrfs_extent_data_ref *ref1 = NULL;
1244        struct btrfs_shared_data_ref *ref2 = NULL;
1245        struct extent_buffer *leaf;
1246        u32 num_refs = 0;
1247        int ret = 0;
1248
1249        leaf = path->nodes[0];
1250        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1251
1252        if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1253                ref1 = btrfs_item_ptr(leaf, path->slots[0],
1254                                      struct btrfs_extent_data_ref);
1255                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1256        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1257                ref2 = btrfs_item_ptr(leaf, path->slots[0],
1258                                      struct btrfs_shared_data_ref);
1259                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1260#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1261        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1262                struct btrfs_extent_ref_v0 *ref0;
1263                ref0 = btrfs_item_ptr(leaf, path->slots[0],
1264                                      struct btrfs_extent_ref_v0);
1265                num_refs = btrfs_ref_count_v0(leaf, ref0);
1266#endif
1267        } else {
1268                BUG();
1269        }
1270
1271        BUG_ON(num_refs < refs_to_drop);
1272        num_refs -= refs_to_drop;
1273
1274        if (num_refs == 0) {
1275                ret = btrfs_del_item(trans, root, path);
1276        } else {
1277                if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1278                        btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1279                else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1280                        btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1281#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1282                else {
1283                        struct btrfs_extent_ref_v0 *ref0;
1284                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
1285                                        struct btrfs_extent_ref_v0);
1286                        btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1287                }
1288#endif
1289                btrfs_mark_buffer_dirty(leaf);
1290        }
1291        return ret;
1292}
1293
1294static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1295                                          struct btrfs_path *path,
1296                                          struct btrfs_extent_inline_ref *iref)
1297{
1298        struct btrfs_key key;
1299        struct extent_buffer *leaf;
1300        struct btrfs_extent_data_ref *ref1;
1301        struct btrfs_shared_data_ref *ref2;
1302        u32 num_refs = 0;
1303
1304        leaf = path->nodes[0];
1305        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1306        if (iref) {
1307                if (btrfs_extent_inline_ref_type(leaf, iref) ==
1308                    BTRFS_EXTENT_DATA_REF_KEY) {
1309                        ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1310                        num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1311                } else {
1312                        ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1313                        num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1314                }
1315        } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1316                ref1 = btrfs_item_ptr(leaf, path->slots[0],
1317                                      struct btrfs_extent_data_ref);
1318                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1319        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1320                ref2 = btrfs_item_ptr(leaf, path->slots[0],
1321                                      struct btrfs_shared_data_ref);
1322                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1323#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1324        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1325                struct btrfs_extent_ref_v0 *ref0;
1326                ref0 = btrfs_item_ptr(leaf, path->slots[0],
1327                                      struct btrfs_extent_ref_v0);
1328                num_refs = btrfs_ref_count_v0(leaf, ref0);
1329#endif
1330        } else {
1331                WARN_ON(1);
1332        }
1333        return num_refs;
1334}
1335
1336static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1337                                          struct btrfs_root *root,
1338                                          struct btrfs_path *path,
1339                                          u64 bytenr, u64 parent,
1340                                          u64 root_objectid)
1341{
1342        struct btrfs_key key;
1343        int ret;
1344
1345        key.objectid = bytenr;
1346        if (parent) {
1347                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1348                key.offset = parent;
1349        } else {
1350                key.type = BTRFS_TREE_BLOCK_REF_KEY;
1351                key.offset = root_objectid;
1352        }
1353
1354        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1355        if (ret > 0)
1356                ret = -ENOENT;
1357#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1358        if (ret == -ENOENT && parent) {
1359                btrfs_release_path(path);
1360                key.type = BTRFS_EXTENT_REF_V0_KEY;
1361                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1362                if (ret > 0)
1363                        ret = -ENOENT;
1364        }
1365#endif
1366        return ret;
1367}
1368
1369static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1370                                          struct btrfs_root *root,
1371                                          struct btrfs_path *path,
1372                                          u64 bytenr, u64 parent,
1373                                          u64 root_objectid)
1374{
1375        struct btrfs_key key;
1376        int ret;
1377
1378        key.objectid = bytenr;
1379        if (parent) {
1380                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1381                key.offset = parent;
1382        } else {
1383                key.type = BTRFS_TREE_BLOCK_REF_KEY;
1384                key.offset = root_objectid;
1385        }
1386
1387        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1388        btrfs_release_path(path);
1389        return ret;
1390}
1391
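    /*
     * Pick the back ref key type for an extent: owners below
     * BTRFS_FIRST_FREE_OBJECTID are tree blocks, everything else is file
     * data.  A non-zero @parent selects the "shared" (by-parent) variants.
     */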
1392static inline int extent_ref_type(u64 parent, u64 owner)
1393{
1394        int type;
1395        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1396                if (parent > 0)
1397                        type = BTRFS_SHARED_BLOCK_REF_KEY;
1398                else
1399                        type = BTRFS_TREE_BLOCK_REF_KEY;
1400        } else {
1401                if (parent > 0)
1402                        type = BTRFS_SHARED_DATA_REF_KEY;
1403                else
1404                        type = BTRFS_EXTENT_DATA_REF_KEY;
1405        }
1406        return type;
1407}
1408
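    /*
     * Starting at @level, walk up the path and return the key that follows
     * the current slot.  Returns 0 and fills @key on success, 1 if the path
     * is already at the last key of every node it holds.
     */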
1409static int find_next_key(struct btrfs_path *path, int level,
1410                         struct btrfs_key *key)
1411
1412{
1413        for (; level < BTRFS_MAX_LEVEL; level++) {
1414                if (!path->nodes[level])
1415                        break;
1416                if (path->slots[level] + 1 >=
1417                    btrfs_header_nritems(path->nodes[level]))
1418                        continue;
1419                if (level == 0)
1420                        btrfs_item_key_to_cpu(path->nodes[level], key,
1421                                              path->slots[level] + 1);
1422                else
1423                        btrfs_node_key_to_cpu(path->nodes[level], key,
1424                                              path->slots[level] + 1);
1425                return 0;
1426        }
1427        return 1;
1428}
1429
1430/*
1431 * look for an inline back ref. If the back ref is found, *ref_ret is set
1432 * to the address of the inline back ref, and 0 is returned.
1433 *
1434 * If the back ref isn't found, *ref_ret is set to the address where it
1435 * should be inserted, and -ENOENT is returned.
1436 *
1437 * If insert is true and there are too many inline back refs, the path
1438 * points to the extent item, and -EAGAIN is returned.
1439 *
1440 * NOTE: inline back refs are ordered in the same way that back ref
1441 *       items in the tree are ordered.
1442 */
1443static noinline_for_stack
1444int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1445                                 struct btrfs_root *root,
1446                                 struct btrfs_path *path,
1447                                 struct btrfs_extent_inline_ref **ref_ret,
1448                                 u64 bytenr, u64 num_bytes,
1449                                 u64 parent, u64 root_objectid,
1450                                 u64 owner, u64 offset, int insert)
1451{
1452        struct btrfs_key key;
1453        struct extent_buffer *leaf;
1454        struct btrfs_extent_item *ei;
1455        struct btrfs_extent_inline_ref *iref;
1456        u64 flags;
1457        u64 item_size;
1458        unsigned long ptr;
1459        unsigned long end;
1460        int extra_size;
1461        int type;
1462        int want;
1463        int ret;
1464        int err = 0;
1465
1466        key.objectid = bytenr;
1467        key.type = BTRFS_EXTENT_ITEM_KEY;
1468        key.offset = num_bytes;
1469
1470        want = extent_ref_type(parent, owner);
1471        if (insert) {
1472                extra_size = btrfs_extent_inline_ref_size(want);
1473                path->keep_locks = 1;
1474        } else
1475                extra_size = -1;
1476        ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1477        if (ret < 0) {
1478                err = ret;
1479                goto out;
1480        }
1481        BUG_ON(ret);
1482
1483        leaf = path->nodes[0];
1484        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1485#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1486        if (item_size < sizeof(*ei)) {
1487                if (!insert) {
1488                        err = -ENOENT;
1489                        goto out;
1490                }
1491                ret = convert_extent_item_v0(trans, root, path, owner,
1492                                             extra_size);
1493                if (ret < 0) {
1494                        err = ret;
1495                        goto out;
1496                }
1497                leaf = path->nodes[0];
1498                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1499        }
1500#endif
1501        BUG_ON(item_size < sizeof(*ei));
1502
1503        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1504        flags = btrfs_extent_flags(leaf, ei);
1505
1506        ptr = (unsigned long)(ei + 1);
1507        end = (unsigned long)ei + item_size;
1508
1509        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1510                ptr += sizeof(struct btrfs_tree_block_info);
1511                BUG_ON(ptr > end);
1512        } else {
1513                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1514        }
1515
1516        err = -ENOENT;
1517        while (1) {
1518                if (ptr >= end) {
1519                        WARN_ON(ptr > end);
1520                        break;
1521                }
1522                iref = (struct btrfs_extent_inline_ref *)ptr;
1523                type = btrfs_extent_inline_ref_type(leaf, iref);
1524                if (want < type)
1525                        break;
1526                if (want > type) {
1527                        ptr += btrfs_extent_inline_ref_size(type);
1528                        continue;
1529                }
1530
1531                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1532                        struct btrfs_extent_data_ref *dref;
1533                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1534                        if (match_extent_data_ref(leaf, dref, root_objectid,
1535                                                  owner, offset)) {
1536                                err = 0;
1537                                break;
1538                        }
1539                        if (hash_extent_data_ref_item(leaf, dref) <
1540                            hash_extent_data_ref(root_objectid, owner, offset))
1541                                break;
1542                } else {
1543                        u64 ref_offset;
1544                        ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1545                        if (parent > 0) {
1546                                if (parent == ref_offset) {
1547                                        err = 0;
1548                                        break;
1549                                }
1550                                if (ref_offset < parent)
1551                                        break;
1552                        } else {
1553                                if (root_objectid == ref_offset) {
1554                                        err = 0;
1555                                        break;
1556                                }
1557                                if (ref_offset < root_objectid)
1558                                        break;
1559                        }
1560                }
1561                ptr += btrfs_extent_inline_ref_size(type);
1562        }
1563        if (err == -ENOENT && insert) {
1564                if (item_size + extra_size >=
1565                    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1566                        err = -EAGAIN;
1567                        goto out;
1568                }
1569                /*
1570                 * To add a new inline back ref, we have to make sure
1571                 * there is no corresponding back ref item.
1572                 * For simplicity, we just do not add a new inline back
1573                 * ref if there is any kind of item for this block.
1574                 */
1575                if (find_next_key(path, 0, &key) == 0 &&
1576                    key.objectid == bytenr &&
1577                    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1578                        err = -EAGAIN;
1579                        goto out;
1580                }
1581        }
1582        *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1583out:
1584        if (insert) {
1585                path->keep_locks = 0;
1586                btrfs_unlock_up_safe(path, 1);
1587        }
1588        return err;
1589}
1590
1591/*
1592 * helper to add new inline back ref
1593 */
1594static noinline_for_stack
1595int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1596                                struct btrfs_root *root,
1597                                struct btrfs_path *path,
1598                                struct btrfs_extent_inline_ref *iref,
1599                                u64 parent, u64 root_objectid,
1600                                u64 owner, u64 offset, int refs_to_add,
1601                                struct btrfs_delayed_extent_op *extent_op)
1602{
1603        struct extent_buffer *leaf;
1604        struct btrfs_extent_item *ei;
1605        unsigned long ptr;
1606        unsigned long end;
1607        unsigned long item_offset;
1608        u64 refs;
1609        int size;
1610        int type;
1611        int ret;
1612
1613        leaf = path->nodes[0];
1614        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1615        item_offset = (unsigned long)iref - (unsigned long)ei;
1616
1617        type = extent_ref_type(parent, owner);
1618        size = btrfs_extent_inline_ref_size(type);
1619
1620        ret = btrfs_extend_item(trans, root, path, size);
1621
1622        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1623        refs = btrfs_extent_refs(leaf, ei);
1624        refs += refs_to_add;
1625        btrfs_set_extent_refs(leaf, ei, refs);
1626        if (extent_op)
1627                __run_delayed_extent_op(extent_op, leaf, ei);
1628
1629        ptr = (unsigned long)ei + item_offset;
1630        end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
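            /*
             * Inline refs are kept sorted, so shift any refs that sort after
             * the new one towards the end of the (just extended) item to open
             * a hole at the insertion offset for the new ref.
             */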
1631        if (ptr < end - size)
1632                memmove_extent_buffer(leaf, ptr + size, ptr,
1633                                      end - size - ptr);
1634
1635        iref = (struct btrfs_extent_inline_ref *)ptr;
1636        btrfs_set_extent_inline_ref_type(leaf, iref, type);
1637        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1638                struct btrfs_extent_data_ref *dref;
1639                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1640                btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1641                btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1642                btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1643                btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1644        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1645                struct btrfs_shared_data_ref *sref;
1646                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1647                btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1648                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1649        } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1650                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1651        } else {
1652                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1653        }
1654        btrfs_mark_buffer_dirty(leaf);
1655        return 0;
1656}
1657
1658static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1659                                 struct btrfs_root *root,
1660                                 struct btrfs_path *path,
1661                                 struct btrfs_extent_inline_ref **ref_ret,
1662                                 u64 bytenr, u64 num_bytes, u64 parent,
1663                                 u64 root_objectid, u64 owner, u64 offset)
1664{
1665        int ret;
1666
1667        ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1668                                           bytenr, num_bytes, parent,
1669                                           root_objectid, owner, offset, 0);
1670        if (ret != -ENOENT)
1671                return ret;
1672
1673        btrfs_release_path(path);
1674        *ref_ret = NULL;
1675
1676        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1677                ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1678                                            root_objectid);
1679        } else {
1680                ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1681                                             root_objectid, owner, offset);
1682        }
1683        return ret;
1684}
1685
1686/*
1687 * helper to update/remove inline back ref
1688 */
1689static noinline_for_stack
1690int update_inline_extent_backref(struct btrfs_trans_handle *trans,
1691                                 struct btrfs_root *root,
1692                                 struct btrfs_path *path,
1693                                 struct btrfs_extent_inline_ref *iref,
1694                                 int refs_to_mod,
1695                                 struct btrfs_delayed_extent_op *extent_op)
1696{
1697        struct extent_buffer *leaf;
1698        struct btrfs_extent_item *ei;
1699        struct btrfs_extent_data_ref *dref = NULL;
1700        struct btrfs_shared_data_ref *sref = NULL;
1701        unsigned long ptr;
1702        unsigned long end;
1703        u32 item_size;
1704        int size;
1705        int type;
1706        int ret;
1707        u64 refs;
1708
1709        leaf = path->nodes[0];
1710        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1711        refs = btrfs_extent_refs(leaf, ei);
1712        WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1713        refs += refs_to_mod;
1714        btrfs_set_extent_refs(leaf, ei, refs);
1715        if (extent_op)
1716                __run_delayed_extent_op(extent_op, leaf, ei);
1717
1718        type = btrfs_extent_inline_ref_type(leaf, iref);
1719
1720        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1721                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1722                refs = btrfs_extent_data_ref_count(leaf, dref);
1723        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1724                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1725                refs = btrfs_shared_data_ref_count(leaf, sref);
1726        } else {
1727                refs = 1;
1728                BUG_ON(refs_to_mod != -1);
1729        }
1730
1731        BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1732        refs += refs_to_mod;
1733
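            /*
             * If any references remain, just rewrite the count in place.
             * Otherwise splice this inline ref out of the extent item and
             * shrink the item to match.
             */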
1734        if (refs > 0) {
1735                if (type == BTRFS_EXTENT_DATA_REF_KEY)
1736                        btrfs_set_extent_data_ref_count(leaf, dref, refs);
1737                else
1738                        btrfs_set_shared_data_ref_count(leaf, sref, refs);
1739        } else {
1740                size =  btrfs_extent_inline_ref_size(type);
1741                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1742                ptr = (unsigned long)iref;
1743                end = (unsigned long)ei + item_size;
1744                if (ptr + size < end)
1745                        memmove_extent_buffer(leaf, ptr, ptr + size,
1746                                              end - ptr - size);
1747                item_size -= size;
1748                ret = btrfs_truncate_item(trans, root, path, item_size, 1);
1749        }
1750        btrfs_mark_buffer_dirty(leaf);
1751        return 0;
1752}
1753
1754static noinline_for_stack
1755int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1756                                 struct btrfs_root *root,
1757                                 struct btrfs_path *path,
1758                                 u64 bytenr, u64 num_bytes, u64 parent,
1759                                 u64 root_objectid, u64 owner,
1760                                 u64 offset, int refs_to_add,
1761                                 struct btrfs_delayed_extent_op *extent_op)
1762{
1763        struct btrfs_extent_inline_ref *iref;
1764        int ret;
1765
1766        ret = lookup_inline_extent_backref(trans, root, path, &iref,
1767                                           bytenr, num_bytes, parent,
1768                                           root_objectid, owner, offset, 1);
1769        if (ret == 0) {
1770                BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1771                ret = update_inline_extent_backref(trans, root, path, iref,
1772                                                   refs_to_add, extent_op);
1773        } else if (ret == -ENOENT) {
1774                ret = setup_inline_extent_backref(trans, root, path, iref,
1775                                                  parent, root_objectid,
1776                                                  owner, offset, refs_to_add,
1777                                                  extent_op);
1778        }
1779        return ret;
1780}
1781
1782static int insert_extent_backref(struct btrfs_trans_handle *trans,
1783                                 struct btrfs_root *root,
1784                                 struct btrfs_path *path,
1785                                 u64 bytenr, u64 parent, u64 root_objectid,
1786                                 u64 owner, u64 offset, int refs_to_add)
1787{
1788        int ret;
1789        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1790                BUG_ON(refs_to_add != 1);
1791                ret = insert_tree_block_ref(trans, root, path, bytenr,
1792                                            parent, root_objectid);
1793        } else {
1794                ret = insert_extent_data_ref(trans, root, path, bytenr,
1795                                             parent, root_objectid,
1796                                             owner, offset, refs_to_add);
1797        }
1798        return ret;
1799}
1800
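    /*
     * Drop @refs_to_drop references from a back ref: inline refs are updated
     * (or removed) in place, keyed data refs go through
     * remove_extent_data_ref(), and keyed tree block refs are single-count
     * items that are simply deleted.
     */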
1801static int remove_extent_backref(struct btrfs_trans_handle *trans,
1802                                 struct btrfs_root *root,
1803                                 struct btrfs_path *path,
1804                                 struct btrfs_extent_inline_ref *iref,
1805                                 int refs_to_drop, int is_data)
1806{
1807        int ret;
1808
1809        BUG_ON(!is_data && refs_to_drop != 1);
1810        if (iref) {
1811                ret = update_inline_extent_backref(trans, root, path, iref,
1812                                                   -refs_to_drop, NULL);
1813        } else if (is_data) {
1814                ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1815        } else {
1816                ret = btrfs_del_item(trans, root, path);
1817        }
1818        return ret;
1819}
1820
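    /*
     * blkdev_issue_discard() takes 512-byte sector units, so convert the byte
     * offset and length before passing them down.
     */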
1821static int btrfs_issue_discard(struct block_device *bdev,
1822                                u64 start, u64 len)
1823{
1824        return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1825}
1826
1827static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1828                                u64 num_bytes, u64 *actual_bytes)
1829{
1830        int ret;
1831        u64 discarded_bytes = 0;
1832        struct btrfs_bio *bbio = NULL;
1833
1834
1835        /* Tell the block device(s) that the sectors can be discarded */
1836        ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1837                              bytenr, &num_bytes, &bbio, 0);
1838        if (!ret) {
1839                struct btrfs_bio_stripe *stripe = bbio->stripes;
1840                int i;
1841
1842
1843                for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1844                        if (!stripe->dev->can_discard)
1845                                continue;
1846
1847                        ret = btrfs_issue_discard(stripe->dev->bdev,
1848                                                  stripe->physical,
1849                                                  stripe->length);
1850                        if (!ret)
1851                                discarded_bytes += stripe->length;
1852                        else if (ret != -EOPNOTSUPP)
1853                                break;
1854
1855                        /*
1856                         * Just in case we get back EOPNOTSUPP for some reason,
1857                         * ignore the return value so we don't screw up
1858                         * callers of discard_extent.
1859                         */
1860                        ret = 0;
1861                }
1862                kfree(bbio);
1863        }
1864
1865        if (actual_bytes)
1866                *actual_bytes = discarded_bytes;
1867
1868
1869        return ret;
1870}
1871
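    /*
     * Add a reference to an existing extent.  The update is queued as a
     * delayed ref (tree or data, depending on @owner) rather than being
     * applied to the extent tree immediately.
     */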
1872int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1873                         struct btrfs_root *root,
1874                         u64 bytenr, u64 num_bytes, u64 parent,
1875                         u64 root_objectid, u64 owner, u64 offset)
1876{
1877        int ret;
1878        BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1879               root_objectid == BTRFS_TREE_LOG_OBJECTID);
1880
1881        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1882                ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
1883                                        parent, root_objectid, (int)owner,
1884                                        BTRFS_ADD_DELAYED_REF, NULL);
1885        } else {
1886                ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
1887                                        parent, root_objectid, owner, offset,
1888                                        BTRFS_ADD_DELAYED_REF, NULL);
1889        }
1890        return ret;
1891}
1892
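    /*
     * Add @refs_to_add references to an existing extent.  Try to stuff the
     * back ref inline into the extent item first; -EAGAIN means the item is
     * full, in which case we bump the ref count on the extent item and insert
     * a separate keyed back ref instead.
     */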
1893static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1894                                  struct btrfs_root *root,
1895                                  u64 bytenr, u64 num_bytes,
1896                                  u64 parent, u64 root_objectid,
1897                                  u64 owner, u64 offset, int refs_to_add,
1898                                  struct btrfs_delayed_extent_op *extent_op)
1899{
1900        struct btrfs_path *path;
1901        struct extent_buffer *leaf;
1902        struct btrfs_extent_item *item;
1903        u64 refs;
1904        int ret;
1905        int err = 0;
1906
1907        path = btrfs_alloc_path();
1908        if (!path)
1909                return -ENOMEM;
1910
1911        path->reada = 1;
1912        path->leave_spinning = 1;
1913        /* this will set up the path even if it fails to insert the back ref */
1914        ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1915                                           path, bytenr, num_bytes, parent,
1916                                           root_objectid, owner, offset,
1917                                           refs_to_add, extent_op);
1918        if (ret == 0)
1919                goto out;
1920
1921        if (ret != -EAGAIN) {
1922                err = ret;
1923                goto out;
1924        }
1925
1926        leaf = path->nodes[0];
1927        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1928        refs = btrfs_extent_refs(leaf, item);
1929        btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1930        if (extent_op)
1931                __run_delayed_extent_op(extent_op, leaf, item);
1932
1933        btrfs_mark_buffer_dirty(leaf);
1934        btrfs_release_path(path);
1935
1936        path->reada = 1;
1937        path->leave_spinning = 1;
1938
1939        /* now insert the actual backref */
1940        ret = insert_extent_backref(trans, root->fs_info->extent_root,
1941                                    path, bytenr, parent, root_objectid,
1942                                    owner, offset, refs_to_add);
1943        BUG_ON(ret);
1944out:
1945        btrfs_free_path(path);
1946        return err;
1947}
1948
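    /*
     * Apply a single delayed ref for a data extent: insert the extent item
     * for freshly reserved space, add another reference, or drop one,
     * depending on the node's action.
     */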
1949static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1950                                struct btrfs_root *root,
1951                                struct btrfs_delayed_ref_node *node,
1952                                struct btrfs_delayed_extent_op *extent_op,
1953                                int insert_reserved)
1954{
1955        int ret = 0;
1956        struct btrfs_delayed_data_ref *ref;
1957        struct btrfs_key ins;
1958        u64 parent = 0;
1959        u64 ref_root = 0;
1960        u64 flags = 0;
1961
1962        ins.objectid = node->bytenr;
1963        ins.offset = node->num_bytes;
1964        ins.type = BTRFS_EXTENT_ITEM_KEY;
1965
1966        ref = btrfs_delayed_node_to_data_ref(node);
1967        if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1968                parent = ref->parent;
1969        else
1970                ref_root = ref->root;
1971
1972        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1973                if (extent_op) {
1974                        BUG_ON(extent_op->update_key);
1975                        flags |= extent_op->flags_to_set;
1976                }
1977                ret = alloc_reserved_file_extent(trans, root,
1978                                                 parent, ref_root, flags,
1979                                                 ref->objectid, ref->offset,
1980                                                 &ins, node->ref_mod);
1981        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1982                ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1983                                             node->num_bytes, parent,
1984                                             ref_root, ref->objectid,
1985                                             ref->offset, node->ref_mod,
1986                                             extent_op);
1987        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1988                ret = __btrfs_free_extent(trans, root, node->bytenr,
1989                                          node->num_bytes, parent,
1990                                          ref_root, ref->objectid,
1991                                          ref->offset, node->ref_mod,
1992                                          extent_op);
1993        } else {
1994                BUG();
1995        }
1996        return ret;
1997}
1998
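    /*
     * Apply a delayed extent op directly to an extent item in a leaf: update
     * the flags and/or the tree block key as requested.
     */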
1999static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2000                                    struct extent_buffer *leaf,
2001                                    struct btrfs_extent_item *ei)
2002{
2003        u64 flags = btrfs_extent_flags(leaf, ei);
2004        if (extent_op->update_flags) {
2005                flags |= extent_op->flags_to_set;
2006                btrfs_set_extent_flags(leaf, ei, flags);
2007        }
2008
2009        if (extent_op->update_key) {
2010                struct btrfs_tree_block_info *bi;
2011                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2012                bi = (struct btrfs_tree_block_info *)(ei + 1);
2013                btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2014        }
2015}
2016
2017static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2018                                 struct btrfs_root *root,
2019                                 struct btrfs_delayed_ref_node *node,
2020                                 struct btrfs_delayed_extent_op *extent_op)
2021{
2022        struct btrfs_key key;
2023        struct btrfs_path *path;
2024        struct btrfs_extent_item *ei;
2025        struct extent_buffer *leaf;
2026        u32 item_size;
2027        int ret;
2028        int err = 0;
2029
2030        path = btrfs_alloc_path();
2031        if (!path)
2032                return -ENOMEM;
2033
2034        key.objectid = node->bytenr;
2035        key.type = BTRFS_EXTENT_ITEM_KEY;
2036        key.offset = node->num_bytes;
2037
2038        path->reada = 1;
2039        path->leave_spinning = 1;
2040        ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2041                                path, 0, 1);
2042        if (ret < 0) {
2043                err = ret;
2044                goto out;
2045        }
2046        if (ret > 0) {
2047                err = -EIO;
2048                goto out;
2049        }
2050
2051        leaf = path->nodes[0];
2052        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2053#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2054        if (item_size < sizeof(*ei)) {
2055                ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2056                                             path, (u64)-1, 0);
2057                if (ret < 0) {
2058                        err = ret;
2059                        goto out;
2060                }
2061                leaf = path->nodes[0];
2062                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2063        }
2064#endif
2065        BUG_ON(item_size < sizeof(*ei));
2066        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2067        __run_delayed_extent_op(extent_op, leaf, ei);
2068
2069        btrfs_mark_buffer_dirty(leaf);
2070out:
2071        btrfs_free_path(path);
2072        return err;
2073}
2074
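    /*
     * Tree block counterpart of run_delayed_data_ref(): insert the extent
     * item for a newly allocated tree block, or add/drop a single tree block
     * reference.
     */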
2075static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2076                                struct btrfs_root *root,
2077                                struct btrfs_delayed_ref_node *node,
2078                                struct btrfs_delayed_extent_op *extent_op,
2079                                int insert_reserved)
2080{
2081        int ret = 0;
2082        struct btrfs_delayed_tree_ref *ref;
2083        struct btrfs_key ins;
2084        u64 parent = 0;
2085        u64 ref_root = 0;
2086
2087        ins.objectid = node->bytenr;
2088        ins.offset = node->num_bytes;
2089        ins.type = BTRFS_EXTENT_ITEM_KEY;
2090
2091        ref = btrfs_delayed_node_to_tree_ref(node);
2092        if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2093                parent = ref->parent;
2094        else
2095                ref_root = ref->root;
2096
2097        BUG_ON(node->ref_mod != 1);
2098        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2099                BUG_ON(!extent_op || !extent_op->update_flags ||
2100                       !extent_op->update_key);
2101                ret = alloc_reserved_tree_block(trans, root,
2102                                                parent, ref_root,
2103                                                extent_op->flags_to_set,
2104                                                &extent_op->key,
2105                                                ref->level, &ins);
2106        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2107                ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2108                                             node->num_bytes, parent, ref_root,
2109                                             ref->level, 0, 1, extent_op);
2110        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2111                ret = __btrfs_free_extent(trans, root, node->bytenr,
2112                                          node->num_bytes, parent, ref_root,
2113                                          ref->level, 0, 1, extent_op);
2114        } else {
2115                BUG();
2116        }
2117        return ret;
2118}
2119
2120/* helper function to actually process a single delayed ref entry */
2121static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2122                               struct btrfs_root *root,
2123                               struct btrfs_delayed_ref_node *node,
2124                               struct btrfs_delayed_extent_op *extent_op,
2125                               int insert_reserved)
2126{
2127        int ret;
2128        if (btrfs_delayed_ref_is_head(node)) {
2129                struct btrfs_delayed_ref_head *head;
2130                /*
2131                 * we've hit the end of the chain and we were supposed
2132                 * to insert this extent into the tree.  But it got
2133                 * deleted before we ever needed to insert it, so all
2134                 * we have to do is clean up the accounting
2135                 */
2136                BUG_ON(extent_op);
2137                head = btrfs_delayed_node_to_head(node);
2138                if (insert_reserved) {
2139                        btrfs_pin_extent(root, node->bytenr,
2140                                         node->num_bytes, 1);
2141                        if (head->is_data) {
2142                                ret = btrfs_del_csums(trans, root,
2143                                                      node->bytenr,
2144                                                      node->num_bytes);
2145                                BUG_ON(ret);
2146                        }
2147                }
2148                mutex_unlock(&head->mutex);
2149                return 0;
2150        }
2151
2152        if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2153            node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2154                ret = run_delayed_tree_ref(trans, root, node, extent_op,
2155                                           insert_reserved);
2156        else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2157                 node->type == BTRFS_SHARED_DATA_REF_KEY)
2158                ret = run_delayed_data_ref(trans, root, node, extent_op,
2159                                           insert_reserved);
2160        else
2161                BUG();
2162        return ret;
2163}
2164
2165static noinline struct btrfs_delayed_ref_node *
2166select_delayed_ref(struct btrfs_delayed_ref_head *head)
2167{
2168        struct rb_node *node;
2169        struct btrfs_delayed_ref_node *ref;
2170        int action = BTRFS_ADD_DELAYED_REF;
2171again:
2172        /*
2173         * select delayed refs of type BTRFS_ADD_DELAYED_REF first.
2174         * this prevents the ref count from going down to zero while
2175         * there are still pending delayed refs.
2176         */
2177        node = rb_prev(&head->node.rb_node);
2178        while (1) {
2179                if (!node)
2180                        break;
2181                ref = rb_entry(node, struct btrfs_delayed_ref_node,
2182                                rb_node);
2183                if (ref->bytenr != head->node.bytenr)
2184                        break;
2185                if (ref->action == action)
2186                        return ref;
2187                node = rb_prev(node);
2188        }
2189        if (action == BTRFS_ADD_DELAYED_REF) {
2190                action = BTRFS_DROP_DELAYED_REF;
2191                goto again;
2192        }
2193        return NULL;
2194}
2195
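    /*
     * Run the delayed refs queued behind each head on @cluster.  Called with
     * delayed_refs->lock held; the lock is dropped and retaken around each
     * actual update.  Returns the number of refs processed.
     */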
2196static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2197                                       struct btrfs_root *root,
2198                                       struct list_head *cluster)
2199{
2200        struct btrfs_delayed_ref_root *delayed_refs;
2201        struct btrfs_delayed_ref_node *ref;
2202        struct btrfs_delayed_ref_head *locked_ref = NULL;
2203        struct btrfs_delayed_extent_op *extent_op;
2204        int ret;
2205        int count = 0;
2206        int must_insert_reserved = 0;
2207
2208        delayed_refs = &trans->transaction->delayed_refs;
2209        while (1) {
2210                if (!locked_ref) {
2211                        /* pick a new head ref from the cluster list */
2212                        if (list_empty(cluster))
2213                                break;
2214
2215                        locked_ref = list_entry(cluster->next,
2216                                     struct btrfs_delayed_ref_head, cluster);
2217
2218                        /* grab the lock that says we are going to process
2219                         * all the refs for this head */
2220                        ret = btrfs_delayed_ref_lock(trans, locked_ref);
2221
2222                        /*
2223                         * we may have dropped the spin lock to get the head
2224                         * mutex lock, and that might have given someone else
2225                         * time to free the head.  If that's true, it has been
2226                         * removed from our list and we can move on.
2227                         */
2228                        if (ret == -EAGAIN) {
2229                                locked_ref = NULL;
2230                                count++;
2231                                continue;
2232                        }
2233                }
2234
2235                /*
2236                 * record the must insert reserved flag before we
2237                 * drop the spin lock.
2238                 */
2239                must_insert_reserved = locked_ref->must_insert_reserved;
2240                locked_ref->must_insert_reserved = 0;
2241
2242                extent_op = locked_ref->extent_op;
2243                locked_ref->extent_op = NULL;
2244
2245                /*
2246                 * locked_ref is the head node, so we have to go one
2247                 * node back for any delayed ref updates
2248                 */
2249                ref = select_delayed_ref(locked_ref);
2250                if (!ref) {
2251                        /* All delayed refs have been processed.  Go ahead
2252                         * and send the head node to run_one_delayed_ref,
2253                         * so that any accounting fixes can happen.
2254                         */
2255                        ref = &locked_ref->node;
2256
2257                        if (extent_op && must_insert_reserved) {
2258                                kfree(extent_op);
2259                                extent_op = NULL;
2260                        }
2261
2262                        if (extent_op) {
2263                                spin_unlock(&delayed_refs->lock);
2264
2265                                ret = run_delayed_extent_op(trans, root,
2266                                                            ref, extent_op);
2267                                BUG_ON(ret);
2268                                kfree(extent_op);
2269
2270                                cond_resched();
2271                                spin_lock(&delayed_refs->lock);
2272                                continue;
2273                        }
2274
2275                        list_del_init(&locked_ref->cluster);
2276                        locked_ref = NULL;
2277                }
2278
2279                ref->in_tree = 0;
2280                rb_erase(&ref->rb_node, &delayed_refs->root);
2281                delayed_refs->num_entries--;
2282
2283                spin_unlock(&delayed_refs->lock);
2284
2285                ret = run_one_delayed_ref(trans, root, ref, extent_op,
2286                                          must_insert_reserved);
2287                BUG_ON(ret);
2288
2289                btrfs_put_delayed_ref(ref);
2290                kfree(extent_op);
2291                count++;
2292
2293                cond_resched();
2294                spin_lock(&delayed_refs->lock);
2295        }
2296        return count;
2297}
2298
2299/*
2300 * this starts processing the delayed reference count updates and
2301 * extent insertions we have queued up so far.  count can be
2302 * 0, which means to process everything in the tree at the start
2303 * of the run (but not newly added entries), or it can be some target
2304 * number you'd like to process.
2305 */
2306int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2307                           struct btrfs_root *root, unsigned long count)
2308{
2309        struct rb_node *node;
2310        struct btrfs_delayed_ref_root *delayed_refs;
2311        struct btrfs_delayed_ref_node *ref;
2312        struct list_head cluster;
2313        int ret;
2314        int run_all = count == (unsigned long)-1;
2315        int run_most = 0;
2316
2317        if (root == root->fs_info->extent_root)
2318                root = root->fs_info->tree_root;
2319
2320        delayed_refs = &trans->transaction->delayed_refs;
2321        INIT_LIST_HEAD(&cluster);
2322again:
2323        spin_lock(&delayed_refs->lock);
2324        if (count == 0) {
2325                count = delayed_refs->num_entries * 2;
2326                run_most = 1;
2327        }
2328        while (1) {
2329                if (!(run_all || run_most) &&
2330                    delayed_refs->num_heads_ready < 64)
2331                        break;
2332
2333                /*
2334                 * go find something we can process in the rbtree.  We start at
2335                 * the beginning of the tree, and then build a cluster
2336                 * of refs to process starting at the first one we are able to
2337                 * lock
2338                 */
2339                ret = btrfs_find_ref_cluster(trans, &cluster,
2340                                             delayed_refs->run_delayed_start);
2341                if (ret)
2342                        break;
2343
2344                ret = run_clustered_refs(trans, root, &cluster);
2345                BUG_ON(ret < 0);
2346
2347                count -= min_t(unsigned long, ret, count);
2348
2349                if (count == 0)
2350                        break;
2351        }
2352
2353        if (run_all) {
2354                node = rb_first(&delayed_refs->root);
2355                if (!node)
2356                        goto out;
2357                count = (unsigned long)-1;
2358
2359                while (node) {
2360                        ref = rb_entry(node, struct btrfs_delayed_ref_node,
2361                                       rb_node);
2362                        if (btrfs_delayed_ref_is_head(ref)) {
2363                                struct btrfs_delayed_ref_head *head;
2364
2365                                head = btrfs_delayed_node_to_head(ref);
2366                                atomic_inc(&ref->refs);
2367
2368                                spin_unlock(&delayed_refs->lock);
2369                                /*
2370                                 * Mutex was contended, block until it's
2371                                 * released and try again
2372                                 */
2373                                mutex_lock(&head->mutex);
2374                                mutex_unlock(&head->mutex);
2375
2376                                btrfs_put_delayed_ref(ref);
2377                                cond_resched();
2378                                goto again;
2379                        }
2380                        node = rb_next(node);
2381                }
2382                spin_unlock(&delayed_refs->lock);
2383                schedule_timeout(1);
2384                goto again;
2385        }
2386out:
2387        spin_unlock(&delayed_refs->lock);
2388        return 0;
2389}
2390
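    /*
     * Queue a delayed extent op that will set @flags on the extent item for
     * @bytenr the next time the delayed refs are run, instead of touching the
     * extent tree here.
     */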
2391int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2392                                struct btrfs_root *root,
2393                                u64 bytenr, u64 num_bytes, u64 flags,
2394                                int is_data)
2395{
2396        struct btrfs_delayed_extent_op *extent_op;
2397        int ret;
2398
2399        extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
2400        if (!extent_op)
2401                return -ENOMEM;
2402
2403        extent_op->flags_to_set = flags;
2404        extent_op->update_flags = 1;
2405        extent_op->update_key = 0;
2406        extent_op->is_data = is_data ? 1 : 0;
2407
2408        ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2409        if (ret)
2410                kfree(extent_op);
2411        return ret;
2412}
2413
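    /*
     * Check the delayed refs for references to the data extent at @bytenr
     * other than the one owned by (@root, @objectid, @offset).  Returns 0 if
     * our ref is the only pending one, 1 if other refs exist, -ENOENT if
     * there is no delayed head, and -EAGAIN if the head mutex was contended
     * and the caller should retry.
     */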
2414static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2415                                      struct btrfs_root *root,
2416                                      struct btrfs_path *path,
2417                                      u64 objectid, u64 offset, u64 bytenr)
2418{
2419        struct btrfs_delayed_ref_head *head;
2420        struct btrfs_delayed_ref_node *ref;
2421        struct btrfs_delayed_data_ref *data_ref;
2422        struct btrfs_delayed_ref_root *delayed_refs;
2423        struct rb_node *node;
2424        int ret = 0;
2425
2426        ret = -ENOENT;
2427        delayed_refs = &trans->transaction->delayed_refs;
2428        spin_lock(&delayed_refs->lock);
2429        head = btrfs_find_delayed_ref_head(trans, bytenr);
2430        if (!head)
2431                goto out;
2432
2433        if (!mutex_trylock(&head->mutex)) {
2434                atomic_inc(&head->node.refs);
2435                spin_unlock(&delayed_refs->lock);
2436
2437                btrfs_release_path(path);
2438
2439                /*
2440                 * Mutex was contended, block until it's released and let
2441                 * the caller try again
2442                 */
2443                mutex_lock(&head->mutex);
2444                mutex_unlock(&head->mutex);
2445                btrfs_put_delayed_ref(&head->node);
2446                return -EAGAIN;
2447        }
2448
2449        node = rb_prev(&head->node.rb_node);
2450        if (!node)
2451                goto out_unlock;
2452
2453        ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2454
2455        if (ref->bytenr != bytenr)
2456                goto out_unlock;
2457
2458        ret = 1;
2459        if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2460                goto out_unlock;
2461
2462        data_ref = btrfs_delayed_node_to_data_ref(ref);
2463
2464        node = rb_prev(node);
2465        if (node) {
2466                ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2467                if (ref->bytenr == bytenr)
2468                        goto out_unlock;
2469        }
2470
2471        if (data_ref->root != root->root_key.objectid ||
2472            data_ref->objectid != objectid || data_ref->offset != offset)
2473                goto out_unlock;
2474
2475        ret = 0;
2476out_unlock:
2477        mutex_unlock(&head->mutex);
2478out:
2479        spin_unlock(&delayed_refs->lock);
2480        return ret;
2481}
2482
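    /*
     * The same check against the committed extent tree: returns 0 only when
     * the extent item holds a single inline data ref belonging to
     * (@root, @objectid, @offset) and the extent is newer than the last
     * snapshot of @root; otherwise 1 (possibly cross referenced) or -ENOENT.
     */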
2483static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2484                                        struct btrfs_root *root,
2485                                        struct btrfs_path *path,
2486                                        u64 objectid, u64 offset, u64 bytenr)
2487{
2488        struct btrfs_root *extent_root = root->fs_info->extent_root;
2489        struct extent_buffer *leaf;
2490        struct btrfs_extent_data_ref *ref;
2491        struct btrfs_extent_inline_ref *iref;
2492        struct btrfs_extent_item *ei;
2493        struct btrfs_key key;
2494        u32 item_size;
2495        int ret;
2496
2497        key.objectid = bytenr;
2498        key.offset = (u64)-1;
2499        key.type = BTRFS_EXTENT_ITEM_KEY;
2500
2501        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2502        if (ret < 0)
2503                goto out;
2504        BUG_ON(ret == 0);
2505
2506        ret = -ENOENT;
2507        if (path->slots[0] == 0)
2508                goto out;
2509
2510        path->slots[0]--;
2511        leaf = path->nodes[0];
2512        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2513
2514        if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2515                goto out;
2516
2517        ret = 1;
2518        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2519#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2520        if (item_size < sizeof(*ei)) {
2521                WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2522                goto out;
2523        }
2524#endif
2525        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2526
2527        if (item_size != sizeof(*ei) +
2528            btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2529                goto out;
2530
2531        if (btrfs_extent_generation(leaf, ei) <=
2532            btrfs_root_last_snapshot(&root->root_item))
2533                goto out;
2534
2535        iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2536        if (btrfs_extent_inline_ref_type(leaf, iref) !=
2537            BTRFS_EXTENT_DATA_REF_KEY)
2538                goto out;
2539
2540        ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2541        if (btrfs_extent_refs(leaf, ei) !=
2542            btrfs_extent_data_ref_count(leaf, ref) ||
2543            btrfs_extent_data_ref_root(leaf, ref) !=
2544            root->root_key.objectid ||
2545            btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2546            btrfs_extent_data_ref_offset(leaf, ref) != offset)
2547                goto out;
2548
2549        ret = 0;
2550out:
2551        return ret;
2552}
2553
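    /*
     * Returns 0 when the data extent at @bytenr is only referenced by
     * (@root, @objectid, @offset), checking both the committed extent tree
     * and the pending delayed refs.  A non-zero return means the extent may
     * be referenced elsewhere.
     */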
2554int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2555                          struct btrfs_root *root,
2556                          u64 objectid, u64 offset, u64 bytenr)
2557{
2558        struct btrfs_path *path;
2559        int ret;
2560        int ret2;
2561
2562        path = btrfs_alloc_path();
2563        if (!path)
2564                return -ENOENT;
2565
2566        do {
2567                ret = check_committed_ref(trans, root, path, objectid,
2568                                          offset, bytenr);
2569                if (ret && ret != -ENOENT)
2570                        goto out;
2571
2572                ret2 = check_delayed_ref(trans, root, path, objectid,
2573                                         offset, bytenr);
2574        } while (ret2 == -EAGAIN);
2575
2576        if (ret2 && ret2 != -ENOENT) {
2577                ret = ret2;
2578                goto out;
2579        }
2580
2581        if (ret != -ENOENT || ret2 != -ENOENT)
2582                ret = 0;
2583out:
2584        btrfs_free_path(path);
2585        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2586                WARN_ON(ret > 0);
2587        return ret;
2588}
2589
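    /*
     * Walk every pointer in @buf and add or drop one reference on each extent
     * it references: file extent disk ranges in a leaf, child blocks in a
     * node.  @full_backref selects shared (by-parent) back refs.
     */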
2590static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2591                           struct btrfs_root *root,
2592                           struct extent_buffer *buf,
2593                           int full_backref, int inc)
2594{
2595        u64 bytenr;
2596        u64 num_bytes;
2597        u64 parent;
2598        u64 ref_root;
2599        u32 nritems;
2600        struct btrfs_key key;
2601        struct btrfs_file_extent_item *fi;
2602        int i;
2603        int level;
2604        int ret = 0;
2605        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2606                            u64, u64, u64, u64, u64, u64);
2607
2608        ref_root = btrfs_header_owner(buf);
2609        nritems = btrfs_header_nritems(buf);
2610        level = btrfs_header_level(buf);
2611
2612        if (!root->ref_cows && level == 0)
2613                return 0;
2614
2615        if (inc)
2616                process_func = btrfs_inc_extent_ref;
2617        else
2618                process_func = btrfs_free_extent;
2619
2620        if (full_backref)
2621                parent = buf->start;
2622        else
2623                parent = 0;
2624
2625        for (i = 0; i < nritems; i++) {
2626                if (level == 0) {
2627                        btrfs_item_key_to_cpu(buf, &key, i);
2628                        if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2629                                continue;
2630                        fi = btrfs_item_ptr(buf, i,
2631                                            struct btrfs_file_extent_item);
2632                        if (btrfs_file_extent_type(buf, fi) ==
2633                            BTRFS_FILE_EXTENT_INLINE)
2634                                continue;
2635                        bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2636                        if (bytenr == 0)
2637                                continue;
2638
2639                        num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2640                        key.offset -= btrfs_file_extent_offset(buf, fi);
2641                        ret = process_func(trans, root, bytenr, num_bytes,
2642                                           parent, ref_root, key.objectid,
2643                                           key.offset);
2644                        if (ret)
2645                                goto fail;
2646                } else {
2647                        bytenr = btrfs_node_blockptr(buf, i);
2648                        num_bytes = btrfs_level_size(root, level - 1);
2649                        ret = process_func(trans, root, bytenr, num_bytes,
2650                                           parent, ref_root, level - 1, 0);
2651                        if (ret)
2652                                goto fail;
2653                }
2654        }
2655        return 0;
2656fail:
2657        BUG();
2658        return ret;
2659}
2660
2661int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2662                  struct extent_buffer *buf, int full_backref)
2663{
2664        return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2665}
2666
2667int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2668                  struct extent_buffer *buf, int full_backref)
2669{
2670        return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2671}
2672
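    /*
     * Write the in-memory block group item for @cache back to its slot in the
     * extent tree.
     */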
2673static int write_one_cache_group(struct btrfs_trans_handle *trans,
2674                                 struct btrfs_root *root,
2675                                 struct btrfs_path *path,
2676                                 struct btrfs_block_group_cache *cache)
2677{
2678        int ret;
2679        struct btrfs_root *extent_root = root->fs_info->extent_root;
2680        unsigned long bi;
2681        struct extent_buffer *leaf;
2682
2683        ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2684        if (ret < 0)
2685                goto fail;
2686        BUG_ON(ret);
2687
2688        leaf = path->nodes[0];
2689        bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2690        write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2691        btrfs_mark_buffer_dirty(leaf);
2692        btrfs_release_path(path);
2693fail:
2694        if (ret)
2695                return ret;
2696        return 0;
2697
2698}
2699
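    /*
     * Return the block group that follows @cache in the rbtree, taking a
     * reference on the result and dropping the one held on @cache.
     */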
2700static struct btrfs_block_group_cache *
2701next_block_group(struct btrfs_root *root,
2702                 struct btrfs_block_group_cache *cache)
2703{
2704        struct rb_node *node;
2705        spin_lock(&root->fs_info->block_group_cache_lock);
2706        node = rb_next(&cache->cache_node);
2707        btrfs_put_block_group(cache);
2708        if (node) {
2709                cache = rb_entry(node, struct btrfs_block_group_cache,
2710                                 cache_node);
2711                btrfs_get_block_group(cache);
2712        } else
2713                cache = NULL;
2714        spin_unlock(&root->fs_info->block_group_cache_lock);
2715        return cache;
2716}
2717
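    /*
     * Get the free space cache inode for @block_group ready for this
     * transaction: create or truncate the inode as needed and preallocate
     * enough data space to hold the cached extents and bitmaps.
     */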
2718static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2719                            struct btrfs_trans_handle *trans,
2720                            struct btrfs_path *path)
2721{
2722        struct btrfs_root *root = block_group->fs_info->tree_root;
2723        struct inode *inode = NULL;
2724        u64 alloc_hint = 0;
2725        int dcs = BTRFS_DC_ERROR;
2726        int num_pages = 0;
2727        int retries = 0;
2728        int ret = 0;
2729
2730        /*
2731         * If this block group is smaller than 100 megs, don't bother caching the
2732         * block group.
2733         */
2734        if (block_group->key.offset < (100 * 1024 * 1024)) {
2735                spin_lock(&block_group->lock);
2736                block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2737                spin_unlock(&block_group->lock);
2738                return 0;
2739        }
2740
2741again:
2742        inode = lookup_free_space_inode(root, block_group, path);
2743        if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2744                ret = PTR_ERR(inode);
2745                btrfs_release_path(path);
2746                goto out;
2747        }
2748
2749        if (IS_ERR(inode)) {
2750                BUG_ON(retries);
2751                retries++;
2752
2753                if (block_group->ro)
2754                        goto out_free;
2755
2756                ret = create_free_space_inode(root, trans, block_group, path);
2757                if (ret)
2758                        goto out_free;
2759                goto again;
2760        }
2761
2762        /* We've already done the setup for this transaction, go ahead and exit */
2763        if (block_group->cache_generation == trans->transid &&
2764            i_size_read(inode)) {
2765                dcs = BTRFS_DC_SETUP;
2766                goto out_put;
2767        }
2768
2769        /*
2770         * We want to set the generation to 0 so that if anything goes wrong
2771         * from here on out we know not to trust this cache when we load it up
2772         * next time.
2773         */
2774        BTRFS_I(inode)->generation = 0;
2775        ret = btrfs_update_inode(trans, root, inode);
2776        WARN_ON(ret);
2777
2778        if (i_size_read(inode) > 0) {
2779                ret = btrfs_truncate_free_space_cache(root, trans, path,
2780                                                      inode);
2781                if (ret)
2782                        goto out_put;
2783        }
2784
2785        spin_lock(&block_group->lock);
2786        if (block_group->cached != BTRFS_CACHE_FINISHED) {
2787                /* We're not cached, don't bother trying to write stuff out */
2788                dcs = BTRFS_DC_WRITTEN;
2789                spin_unlock(&block_group->lock);
2790                goto out_put;
2791        }
2792        spin_unlock(&block_group->lock);
2793
2794        num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
2795        if (!num_pages)
2796                num_pages = 1;
2797
2798        /*
2799         * Just to make absolutely sure we have enough space, we're going to
2800         * preallocate 16 pages worth of space for each gigabyte of block
2801         * group.  In practice we ought to use at most 8, but we need extra
2802         * space so we can add our header and have a terminator between the
2803         * extents and the bitmaps.
2804         */
2805        num_pages *= 16;
2806        num_pages *= PAGE_CACHE_SIZE;
2807
2808        ret = btrfs_check_data_free_space(inode, num_pages);
2809        if (ret)
2810                goto out_put;
2811
2812        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2813                                              num_pages, num_pages,
2814                                              &alloc_hint);
2815        if (!ret)
2816                dcs = BTRFS_DC_SETUP;
2817        btrfs_free_reserved_data_space(inode, num_pages);
2818
2819out_put:
2820        iput(inode);
2821out_free:
2822        btrfs_release_path(path);
2823out:
2824        spin_lock(&block_group->lock);
2825        if (!ret && dcs == BTRFS_DC_SETUP)
2826                block_group->cache_generation = trans->transid;
2827        block_group->disk_cache_state = dcs;
2828        spin_unlock(&block_group->lock);
2829
2830        return ret;
2831}
2832
2833int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2834                                   struct btrfs_root *root)
2835{
2836        struct btrfs_block_group_cache *cache;
2837        int err = 0;
2838        struct btrfs_path *path;
2839        u64 last = 0;
2840
2841        path = btrfs_alloc_path();
2842        if (!path)
2843                return -ENOMEM;
2844
2845again:
2846        while (1) {
2847                cache = btrfs_lookup_first_block_group(root->fs_info, last);
2848                while (cache) {
2849                        if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2850                                break;
2851                        cache = next_block_group(root, cache);
2852                }
2853                if (!cache) {
2854                        if (last == 0)
2855                                break;
2856                        last = 0;
2857                        continue;
2858                }
2859                err = cache_save_setup(cache, trans, path);
2860                last = cache->key.objectid + cache->key.offset;
2861                btrfs_put_block_group(cache);
2862        }
2863
2864        while (1) {
2865                if (last == 0) {
2866                        err = btrfs_run_delayed_refs(trans, root,
2867                                                     (unsigned long)-1);
2868                        BUG_ON(err);
2869                }
2870
2871                cache = btrfs_lookup_first_block_group(root->fs_info, last);
2872                while (cache) {
2873                        if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
2874                                btrfs_put_block_group(cache);
2875                                goto again;
2876                        }
2877
2878                        if (cache->dirty)
2879                                break;
2880                        cache = next_block_group(root, cache);
2881                }
2882                if (!cache) {
2883                        if (last == 0)
2884                                break;
2885                        last = 0;
2886                        continue;
2887                }
2888
2889                if (cache->disk_cache_state == BTRFS_DC_SETUP)
2890                        cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
2891                cache->dirty = 0;
2892                last = cache->key.objectid + cache->key.offset;
2893
2894                err = write_one_cache_group(trans, root, path, cache);
2895                BUG_ON(err);
2896                btrfs_put_block_group(cache);
2897        }
2898
2899        while (1) {
2900                /*
2901                 * This shouldn't be needed since we're just marking our
2902                 * preallocated extent as written, but it can't hurt just in
2903                 * case.
2904                 */
2905                if (last == 0) {
2906                        err = btrfs_run_delayed_refs(trans, root,
2907                                                     (unsigned long)-1);
2908                        BUG_ON(err);
2909                }
2910
2911                cache = btrfs_lookup_first_block_group(root->fs_info, last);
2912                while (cache) {
2913                        /*
2914                         * Really this shouldn't happen, but it could if we
2915                         * couldn't write the entire preallocated extent and
2916                         * splitting the extent resulted in a new block.
2917                         */
2918                        if (cache->dirty) {
2919                                btrfs_put_block_group(cache);
2920                                goto again;
2921                        }
2922                        if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2923                                break;
2924                        cache = next_block_group(root, cache);
2925                }
2926                if (!cache) {
2927                        if (last == 0)
2928                                break;
2929                        last = 0;
2930                        continue;
2931                }
2932
2933                btrfs_write_out_cache(root, trans, cache, path);
2934
2935                /*
2936                 * If we didn't have an error then the cache state is still
2937                 * NEED_WRITE, so we can set it to WRITTEN.
2938                 */
2939                if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2940                        cache->disk_cache_state = BTRFS_DC_WRITTEN;
2941                last = cache->key.objectid + cache->key.offset;
2942                btrfs_put_block_group(cache);
2943        }
2944
2945        btrfs_free_path(path);
2946        return 0;
2947}
2948
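    /*
     * Return 1 if the block group containing @bytenr is read-only (or if
     * no block group can be found), 0 otherwise.
     */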
2949int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
2950{
2951        struct btrfs_block_group_cache *block_group;
2952        int readonly = 0;
2953
2954        block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
2955        if (!block_group || block_group->ro)
2956                readonly = 1;
2957        if (block_group)
2958                btrfs_put_block_group(block_group);
2959        return readonly;
2960}
2961
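    /*
     * Account @total_bytes/@bytes_used to the space_info matching @flags,
     * creating the space_info if it does not exist yet.  DUP, RAID1 and
     * RAID10 store two copies, hence the factor of two for the disk totals.
     */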
2962static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2963                             u64 total_bytes, u64 bytes_used,
2964                             struct btrfs_space_info **space_info)
2965{
2966        struct btrfs_space_info *found;
2967        int i;
2968        int factor;
2969
2970        if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2971                     BTRFS_BLOCK_GROUP_RAID10))
2972                factor = 2;
2973        else
2974                factor = 1;
2975
2976        found = __find_space_info(info, flags);
2977        if (found) {
2978                spin_lock(&found->lock);
2979                found->total_bytes += total_bytes;
2980                found->disk_total += total_bytes * factor;
2981                found->bytes_used += bytes_used;
2982                found->disk_used += bytes_used * factor;
2983                found->full = 0;
2984                spin_unlock(&found->lock);
2985                *space_info = found;
2986                return 0;
2987        }
2988        found = kzalloc(sizeof(*found), GFP_NOFS);
2989        if (!found)
2990                return -ENOMEM;
2991
2992        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
2993                INIT_LIST_HEAD(&found->block_groups[i]);
2994        init_rwsem(&found->groups_sem);
2995        spin_lock_init(&found->lock);
2996        found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
2997                                BTRFS_BLOCK_GROUP_SYSTEM |
2998                                BTRFS_BLOCK_GROUP_METADATA);
2999        found->total_bytes = total_bytes;
3000        found->disk_total = total_bytes * factor;
3001        found->bytes_used = bytes_used;
3002        found->disk_used = bytes_used * factor;
3003        found->bytes_pinned = 0;
3004        found->bytes_reserved = 0;
3005        found->bytes_readonly = 0;
3006        found->bytes_may_use = 0;
3007        found->full = 0;
3008        found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3009        found->chunk_alloc = 0;
3010        found->flush = 0;
3011        init_waitqueue_head(&found->wait);
3012        *space_info = found;
3013        list_add_rcu(&found->list, &info->space_info);
3014        return 0;
3015}
3016
3017static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3018{
3019        u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
3020                                   BTRFS_BLOCK_GROUP_RAID1 |
3021                                   BTRFS_BLOCK_GROUP_RAID10 |
3022                                   BTRFS_BLOCK_GROUP_DUP);
3023        if (extra_flags) {
3024                if (flags & BTRFS_BLOCK_GROUP_DATA)
3025                        fs_info->avail_data_alloc_bits |= extra_flags;
3026                if (flags & BTRFS_BLOCK_GROUP_METADATA)
3027                        fs_info->avail_metadata_alloc_bits |= extra_flags;
3028                if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3029                        fs_info->avail_system_alloc_bits |= extra_flags;
3030        }
3031}
3032
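    /*
     * Strip out the RAID/DUP bits in @flags that the current number of
     * devices cannot satisfy, and when several profiles are set keep only
     * the most redundant one.
     */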
3033u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3034{
3035        /*
3036         * we add in the count of missing devices because we want
3037         * to make sure that any RAID levels on a degraded FS
3038         * continue to be honored.
3039         */
3040        u64 num_devices = root->fs_info->fs_devices->rw_devices +
3041                root->fs_info->fs_devices->missing_devices;
3042
3043        if (num_devices == 1)
3044                flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
3045        if (num_devices < 4)
3046                flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3047
3048        if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
3049            (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3050                      BTRFS_BLOCK_GROUP_RAID10))) {
3051                flags &= ~BTRFS_BLOCK_GROUP_DUP;
3052        }
3053
3054        if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3055            (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3056                flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3057        }
3058
3059        if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
3060            ((flags & BTRFS_BLOCK_GROUP_RAID1) |
3061             (flags & BTRFS_BLOCK_GROUP_RAID10) |
3062             (flags & BTRFS_BLOCK_GROUP_DUP)))
3063                flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3064        return flags;
3065}
3066
3067static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3068{
3069        if (flags & BTRFS_BLOCK_GROUP_DATA)
3070                flags |= root->fs_info->avail_data_alloc_bits &
3071                         root->fs_info->data_alloc_profile;
3072        else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3073                flags |= root->fs_info->avail_system_alloc_bits &
3074                         root->fs_info->system_alloc_profile;
3075        else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3076                flags |= root->fs_info->avail_metadata_alloc_bits &
3077                         root->fs_info->metadata_alloc_profile;
3078        return btrfs_reduce_alloc_profile(root, flags);
3079}
3080
3081u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3082{
3083        u64 flags;
3084
3085        if (data)
3086                flags = BTRFS_BLOCK_GROUP_DATA;
3087        else if (root == root->fs_info->chunk_root)
3088                flags = BTRFS_BLOCK_GROUP_SYSTEM;
3089        else
3090                flags = BTRFS_BLOCK_GROUP_METADATA;
3091
3092        return get_alloc_profile(root, flags);
3093}
3094
3095void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
3096{
3097        BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
3098                                                       BTRFS_BLOCK_GROUP_DATA);
3099}
3100
3101/*
3102 * This will check the space that the inode allocates from to make sure we
3103 * have enough space for the requested number of bytes.
3104 */
3105int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3106{
3107        struct btrfs_space_info *data_sinfo;
3108        struct btrfs_root *root = BTRFS_I(inode)->root;
3109        u64 used;
3110        int ret = 0, committed = 0, alloc_chunk = 1;
3111
3112        /* make sure bytes are sectorsize aligned */
3113        bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3114
3115        if (root == root->fs_info->tree_root ||
3116            BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
3117                alloc_chunk = 0;
3118                committed = 1;
3119        }
3120
3121        data_sinfo = BTRFS_I(inode)->space_info;
3122        if (!data_sinfo)
3123                goto alloc;
3124
3125again:
3126        /* make sure we have enough space to handle the data first */
3127        spin_lock(&data_sinfo->lock);
3128        used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3129                data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3130                data_sinfo->bytes_may_use;
3131
3132        if (used + bytes > data_sinfo->total_bytes) {
3133                struct btrfs_trans_handle *trans;
3134
3135                /*
3136                 * If we don't have enough free bytes in this space then we
3137                 * need to allocate a new chunk.
3138                 */
3139                if (!data_sinfo->full && alloc_chunk) {
3140                        u64 alloc_target;
3141
3142                        data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3143                        spin_unlock(&data_sinfo->lock);
3144alloc:
3145                        alloc_target = btrfs_get_alloc_profile(root, 1);
3146                        trans = btrfs_join_transaction(root);
3147                        if (IS_ERR(trans))
3148                                return PTR_ERR(trans);
3149
3150                        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3151                                             bytes + 2 * 1024 * 1024,
3152                                             alloc_target,
3153                                             CHUNK_ALLOC_NO_FORCE);
3154                        btrfs_end_transaction(trans, root);
3155                        if (ret < 0) {
3156                                if (ret != -ENOSPC)
3157                                        return ret;
3158                                else
3159                                        goto commit_trans;
3160                        }
3161
3162                        if (!data_sinfo) {
3163                                btrfs_set_inode_space_info(root, inode);
3164                                data_sinfo = BTRFS_I(inode)->space_info;
3165                        }
3166                        goto again;
3167                }
3168
3169                /*
3170                 * If we have less pinned bytes than we want to allocate then
3171                 * don't bother committing the transaction, it won't help us.
3172                 */
3173                if (data_sinfo->bytes_pinned < bytes)
3174                        committed = 1;
3175                spin_unlock(&data_sinfo->lock);
3176
3177                /* commit the current transaction and try again */
3178commit_trans:
3179                if (!committed &&
3180                    !atomic_read(&root->fs_info->open_ioctl_trans)) {
3181                        committed = 1;
3182                        trans = btrfs_join_transaction(root);
3183                        if (IS_ERR(trans))
3184                                return PTR_ERR(trans);
3185                        ret = btrfs_commit_transaction(trans, root);
3186                        if (ret)
3187                                return ret;
3188                        goto again;
3189                }
3190
3191                return -ENOSPC;
3192        }
3193        data_sinfo->bytes_may_use += bytes;
3194        spin_unlock(&data_sinfo->lock);
3195
3196        return 0;
3197}
3198
3199/*
3200 * Called if we need to clear a data reservation for this inode.
3201 */
3202void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3203{
3204        struct btrfs_root *root = BTRFS_I(inode)->root;
3205        struct btrfs_space_info *data_sinfo;
3206
3207        /* make sure bytes are sectorsize aligned */
3208        bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3209
3210        data_sinfo = BTRFS_I(inode)->space_info;
3211        spin_lock(&data_sinfo->lock);
3212        data_sinfo->bytes_may_use -= bytes;
3213        spin_unlock(&data_sinfo->lock);
3214}
3215
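    /*
     * Force the next allocation attempt on every metadata space_info to
     * create a new chunk; used to maintain the data/metadata chunk ratio.
     */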
3216static void force_metadata_allocation(struct btrfs_fs_info *info)
3217{
3218        struct list_head *head = &info->space_info;
3219        struct btrfs_space_info *found;
3220
3221        rcu_read_lock();
3222        list_for_each_entry_rcu(found, head, list) {
3223                if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3224                        found->force_alloc = CHUNK_ALLOC_FORCE;
3225        }
3226        rcu_read_unlock();
3227}
3228
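    /*
     * Decide whether it is worth allocating a new chunk for @sinfo.
     * CHUNK_ALLOC_FORCE always allocates, CHUNK_ALLOC_LIMITED allocates
     * only to keep a small slice (about 1%) of the FS free, and otherwise
     * we skip the allocation while the existing chunks still have plenty
     * of free room.
     */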
3229static int should_alloc_chunk(struct btrfs_root *root,
3230                              struct btrfs_space_info *sinfo, u64 alloc_bytes,
3231                              int force)
3232{
3233        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3234        u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3235        u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3236        u64 thresh;
3237
3238        if (force == CHUNK_ALLOC_FORCE)
3239                return 1;
3240
3241        /*
3242         * We need to take into account the global rsv because for all intents
3243         * and purposes it's used space.  Don't worry about locking the
3244         * global_rsv, it doesn't change except when the transaction commits.
3245         */
3246        num_allocated += global_rsv->size;
3247
3248        /*
3249         * in limited mode, we want to have some free space up to
3250         * about 1% of the FS size.
3251         */
3252        if (force == CHUNK_ALLOC_LIMITED) {
3253                thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3254                thresh = max_t(u64, 64 * 1024 * 1024,
3255                               div_factor_fine(thresh, 1));
3256
3257                if (num_bytes - num_allocated < thresh)
3258                        return 1;
3259        }
3260
3261        /*
3262         * We have two similar checks here, one based on a percentage
3263         * and one based on a hard number of 256MB.  The idea
3264         * is that if we have a good amount of free
3265         * room, don't allocate a chunk.  A good amount means
3266         * less than 80% of the chunks we have allocated are used,
3267         * or more than 256MB is free.
3268         */
3269        if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3270                return 0;
3271
3272        if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3273                return 0;
3274
3275        thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3276
3277        /* 256MB or 5% of the FS */
3278        thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
3279
3280        if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
3281                return 0;
3282        return 1;
3283}
3284
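    /*
     * Allocate a new chunk with the given allocation profile if
     * should_alloc_chunk() agrees.  Only one allocation per space_info is
     * allowed at a time; if somebody else is allocating we wait on the
     * chunk_mutex and then re-evaluate.  Returns 1 if a chunk was
     * allocated, 0 if none was needed, or a negative error.
     */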
3285static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3286                          struct btrfs_root *extent_root, u64 alloc_bytes,
3287                          u64 flags, int force)
3288{
3289        struct btrfs_space_info *space_info;
3290        struct btrfs_fs_info *fs_info = extent_root->fs_info;
3291        int wait_for_alloc = 0;
3292        int ret = 0;
3293
3294        flags = btrfs_reduce_alloc_profile(extent_root, flags);
3295
3296        space_info = __find_space_info(extent_root->fs_info, flags);
3297        if (!space_info) {
3298                ret = update_space_info(extent_root->fs_info, flags,
3299                                        0, 0, &space_info);
3300                BUG_ON(ret);
3301        }
3302        BUG_ON(!space_info);
3303
3304again:
3305        spin_lock(&space_info->lock);
3306        if (space_info->force_alloc)
3307                force = space_info->force_alloc;
3308        if (space_info->full) {
3309                spin_unlock(&space_info->lock);
3310                return 0;
3311        }
3312
3313        if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
3314                spin_unlock(&space_info->lock);
3315                return 0;
3316        } else if (space_info->chunk_alloc) {
3317                wait_for_alloc = 1;
3318        } else {
3319                space_info->chunk_alloc = 1;
3320        }
3321
3322        spin_unlock(&space_info->lock);
3323
3324        mutex_lock(&fs_info->chunk_mutex);
3325
3326        /*
3327         * The chunk_mutex is held throughout the entirety of a chunk
3328         * allocation, so once we've acquired the chunk_mutex we know that the
3329         * other guy is done and we need to recheck and see if we should
3330         * allocate.
3331         */
3332        if (wait_for_alloc) {
3333                mutex_unlock(&fs_info->chunk_mutex);
3334                wait_for_alloc = 0;
3335                goto again;
3336        }
3337
3338        /*
3339         * If we have mixed data/metadata chunks we want to make sure we keep
3340         * allocating mixed chunks instead of individual chunks.
3341         */
3342        if (btrfs_mixed_space_info(space_info))
3343                flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3344
3345        /*
3346         * if we're doing a data chunk, go ahead and make sure that
3347         * we keep a reasonable number of metadata chunks allocated in the
3348         * FS as well.
3349         */
3350        if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3351                fs_info->data_chunk_allocations++;
3352                if (!(fs_info->data_chunk_allocations %
3353                      fs_info->metadata_ratio))
3354                        force_metadata_allocation(fs_info);
3355        }
3356
3357        ret = btrfs_alloc_chunk(trans, extent_root, flags);
3358        if (ret < 0 && ret != -ENOSPC)
3359                goto out;
3360
3361        spin_lock(&space_info->lock);
3362        if (ret)
3363                space_info->full = 1;
3364        else
3365                ret = 1;
3366
3367        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3368        space_info->chunk_alloc = 0;
3369        spin_unlock(&space_info->lock);
3370out:
3371        mutex_unlock(&extent_root->fs_info->chunk_mutex);
3372        return ret;
3373}
3374
3375/*
3376 * shrink metadata reservation for delalloc
3377 */
3378static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
3379                           bool wait_ordered)
3380{
3381        struct btrfs_block_rsv *block_rsv;
3382        struct btrfs_space_info *space_info;
3383        struct btrfs_trans_handle *trans;
3384        u64 reserved;
3385        u64 max_reclaim;
3386        u64 reclaimed = 0;
3387        long time_left;
3388        unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3389        int loops = 0;
3390        unsigned long progress;
3391
3392        trans = (struct btrfs_trans_handle *)current->journal_info;
3393        block_rsv = &root->fs_info->delalloc_block_rsv;
3394        space_info = block_rsv->space_info;
3395
3396        smp_mb();
3397        reserved = space_info->bytes_may_use;
3398        progress = space_info->reservation_progress;
3399
3400        if (reserved == 0)
3401                return 0;
3402
3403        smp_mb();
3404        if (root->fs_info->delalloc_bytes == 0) {
3405                if (trans)
3406                        return 0;
3407                btrfs_wait_ordered_extents(root, 0, 0);
3408                return 0;
3409        }
3410
3411        max_reclaim = min(reserved, to_reclaim);
3412        nr_pages = max_t(unsigned long, nr_pages,
3413                         max_reclaim >> PAGE_CACHE_SHIFT);
3414        while (loops < 1024) {
3415                /* have the flusher threads jump in and do some IO */
3416                smp_mb();
3417                nr_pages = min_t(unsigned long, nr_pages,
3418                       root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3419                writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3420                                                WB_REASON_FS_FREE_SPACE);
3421
3422                spin_lock(&space_info->lock);
3423                if (reserved > space_info->bytes_may_use)
3424                        reclaimed += reserved - space_info->bytes_may_use;
3425                reserved = space_info->bytes_may_use;
3426                spin_unlock(&space_info->lock);
3427
3428                loops++;
3429
3430                if (reserved == 0 || reclaimed >= max_reclaim)
3431                        break;
3432
3433                if (trans && trans->transaction->blocked)
3434                        return -EAGAIN;
3435
3436                if (wait_ordered && !trans) {
3437                        btrfs_wait_ordered_extents(root, 0, 0);
3438                } else {
3439                        time_left = schedule_timeout_interruptible(1);
3440
3441                        /* We were interrupted, exit */
3442                        if (time_left)
3443                                break;
3444                }
3445
3446                /* We've kicked the IO a few times; if anything has been
3447                 * freed, exit.  There is no sense in looping here for a long
3448                 * time when we really need to commit the transaction, or when
3449                 * there are just too many writers without enough free space.
3450                 */
3451
3452                if (loops > 3) {
3453                        smp_mb();
3454                        if (progress != space_info->reservation_progress)
3455                                break;
3456                }
3457
3458        }
3459
3460        return reclaimed >= to_reclaim;
3461}
3462
3463/**
3464 * may_commit_transaction - possibly commit the transaction if it's OK to
3465 * @root - the root we're allocating for
3466 * @bytes - the number of bytes we want to reserve
3467 * @force - force the commit
3468 *
3469 * This will check to make sure that committing the transaction will actually
3470 * get us somewhere and then commit the transaction if it does.  Otherwise it
3471 * will return -ENOSPC.
3472 */
3473static int may_commit_transaction(struct btrfs_root *root,
3474                                  struct btrfs_space_info *space_info,
3475                                  u64 bytes, int force)
3476{
3477        struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3478        struct btrfs_trans_handle *trans;
3479
3480        trans = (struct btrfs_trans_handle *)current->journal_info;
3481        if (trans)
3482                return -EAGAIN;
3483
3484        if (force)
3485                goto commit;
3486
3487        /* See if there is enough pinned space to make this reservation */
3488        spin_lock(&space_info->lock);
3489        if (space_info->bytes_pinned >= bytes) {
3490                spin_unlock(&space_info->lock);
3491                goto commit;
3492        }
3493        spin_unlock(&space_info->lock);
3494
3495        /*
3496         * See if there is some space in the delayed insertion reservation for
3497         * this reservation.
3498         */
3499        if (space_info != delayed_rsv->space_info)
3500                return -ENOSPC;
3501
3502        spin_lock(&delayed_rsv->lock);
3503        if (delayed_rsv->size < bytes) {
3504                spin_unlock(&delayed_rsv->lock);
3505                return -ENOSPC;
3506        }
3507        spin_unlock(&delayed_rsv->lock);
3508
3509commit:
3510        trans = btrfs_join_transaction(root);
3511        if (IS_ERR(trans))
3512                return -ENOSPC;
3513
3514        return btrfs_commit_transaction(trans, root);
3515}
3516
3517/**
3518 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3519 * @root - the root we're allocating for
3520 * @block_rsv - the block_rsv we're allocating for
3521 * @orig_bytes - the number of bytes we want
3522 * @flush - whether or not we can flush to make our reservation
3523 *
3524 * This will reserve orig_bytes number of bytes from the space info associated
3525 * with the block_rsv.  If there is not enough space it will make an attempt to
3526 * flush out space to make room.  It will do this by flushing delalloc if
3527 * possible or committing the transaction.  If flush is 0 then no attempts to
3528 * regain reservations will be made and this will fail if there is not enough
3529 * space already.
3530 */
3531static int reserve_metadata_bytes(struct btrfs_root *root,
3532                                  struct btrfs_block_rsv *block_rsv,
3533                                  u64 orig_bytes, int flush)
3534{
3535        struct btrfs_space_info *space_info = block_rsv->space_info;
3536        u64 used;
3537        u64 num_bytes = orig_bytes;
3538        int retries = 0;
3539        int ret = 0;
3540        bool committed = false;
3541        bool flushing = false;
3542        bool wait_ordered = false;
3543
3544again:
3545        ret = 0;
3546        spin_lock(&space_info->lock);
3547        /*
3548         * We only want to wait if somebody other than us is flushing and we
3549         * are actually allowed to flush.
3550         */
3551        while (flush && !flushing && space_info->flush) {
3552                spin_unlock(&space_info->lock);
3553                /*
3554                 * If we have a trans handle we can't wait because the flusher
3555                 * may have to commit the transaction, which would mean we would
3556                 * deadlock since we are waiting for the flusher to finish, but
3557                 * hold the current transaction open.
3558                 */
3559                if (current->journal_info)
3560                        return -EAGAIN;
3561                ret = wait_event_interruptible(space_info->wait,
3562                                               !space_info->flush);
3563                /* Must have been interrupted, return */
3564                if (ret)
3565                        return -EINTR;
3566
3567                spin_lock(&space_info->lock);
3568        }
3569
3570        ret = -ENOSPC;
3571        used = space_info->bytes_used + space_info->bytes_reserved +
3572                space_info->bytes_pinned + space_info->bytes_readonly +
3573                space_info->bytes_may_use;
3574
3575        /*
3576         * The idea here is that if we've not already over-reserved the
3577         * space then we can go ahead and save our reservation first and then
3578         * start flushing if we need to.  Otherwise, if we've already
3579         * overcommitted, let's start flushing stuff first and then come back
3580         * and try to make our reservation.
3581         */
3582        if (used <= space_info->total_bytes) {
3583                if (used + orig_bytes <= space_info->total_bytes) {
3584                        space_info->bytes_may_use += orig_bytes;
3585                        ret = 0;
3586                } else {
3587                        /*
3588                         * Ok, set num_bytes to orig_bytes since we aren't
3589                         * overcommitted, this way we only try and reclaim what
3590                         * we need.
3591                         */
3592                        num_bytes = orig_bytes;
3593                }
3594        } else {
3595                /*
3596                 * Ok, we're overcommitted, set num_bytes to the overcommitted
3597                 * amount plus the amount of bytes that we need for this
3598                 * reservation.
3599                 */
3600                wait_ordered = true;
3601                num_bytes = used - space_info->total_bytes +
3602                        (orig_bytes * (retries + 1));
3603        }
3604
3605        if (ret) {
3606                u64 profile = btrfs_get_alloc_profile(root, 0);
3607                u64 avail;
3608
3609                /*
3610                 * If we have a lot of space that's pinned, don't bother doing
3611                 * the overcommit dance yet and just commit the transaction.
3612                 */
3613                avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3614                do_div(avail, 10);
3615                if (space_info->bytes_pinned >= avail && flush && !committed) {
3616                        space_info->flush = 1;
3617                        flushing = true;
3618                        spin_unlock(&space_info->lock);
3619                        ret = may_commit_transaction(root, space_info,
3620                                                     orig_bytes, 1);
3621                        if (ret)
3622                                goto out;
3623                        committed = true;
3624                        goto again;
3625                }
3626
3627                spin_lock(&root->fs_info->free_chunk_lock);
3628                avail = root->fs_info->free_chunk_space;
3629
3630                /*
3631                 * If we have dup, raid1 or raid10 then only half of the free
3632                 * space is actually usable.
3633                 */
3634                if (profile & (BTRFS_BLOCK_GROUP_DUP |
3635                               BTRFS_BLOCK_GROUP_RAID1 |
3636                               BTRFS_BLOCK_GROUP_RAID10))
3637                        avail >>= 1;
3638
3639                /*
3640                 * If we aren't flushing don't let us overcommit too much, say
3641                 * 1/8th of the space.  If we can flush, let it overcommit up to
3642                 * 1/2 of the space.
3643                 */
3644                if (flush)
3645                        avail >>= 3;
3646                else
3647                        avail >>= 1;
3648                spin_unlock(&root->fs_info->free_chunk_lock);
3649
3650                if (used + num_bytes < space_info->total_bytes + avail) {
3651                        space_info->bytes_may_use += orig_bytes;
3652                        ret = 0;
3653                } else {
3654                        wait_ordered = true;
3655                }
3656        }
3657
3658        /*
3659         * Couldn't make our reservation, save our place so while we're trying
3660         * to reclaim space we can actually use it instead of somebody else
3661         * stealing it from us.
3662         */
3663        if (ret && flush) {
3664                flushing = true;
3665                space_info->flush = 1;
3666        }
3667
3668        spin_unlock(&space_info->lock);
3669
3670        if (!ret || !flush)
3671                goto out;
3672
3673        /*
3674         * We do synchronous shrinking since we don't actually unreserve
3675         * metadata until after the IO is completed.
3676         */
3677        ret = shrink_delalloc(root, num_bytes, wait_ordered);
3678        if (ret < 0)
3679                goto out;
3680
3681        ret = 0;
3682
3683        /*
3684         * So if we were overcommitted it's possible that somebody else flushed
3685         * out enough space and we simply didn't have enough space to reclaim,
3686         * so go back around and try again.
3687         */
3688        if (retries < 2) {
3689                wait_ordered = true;
3690                retries++;
3691                goto again;
3692        }
3693
3694        ret = -ENOSPC;
3695        if (committed)
3696                goto out;
3697
3698        ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3699        if (!ret) {
3700                committed = true;
3701                goto again;
3702        }
3703
3704out:
3705        if (flushing) {
3706                spin_lock(&space_info->lock);
3707                space_info->flush = 0;
3708                wake_up_all(&space_info->wait);
3709                spin_unlock(&space_info->lock);
3710        }
3711        return ret;
3712}
3713
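    /*
     * Pick the block reservation to charge for this operation: the
     * transaction's rsv for reference counted roots (and the csum root),
     * otherwise the root's own rsv, falling back to the empty rsv.
     */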
3714static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3715                                             struct btrfs_root *root)
3716{
3717        struct btrfs_block_rsv *block_rsv = NULL;
3718
3719        if (root->ref_cows || root == root->fs_info->csum_root)
3720                block_rsv = trans->block_rsv;
3721
3722        if (!block_rsv)
3723                block_rsv = root->block_rsv;
3724
3725        if (!block_rsv)
3726                block_rsv = &root->fs_info->empty_block_rsv;
3727
3728        return block_rsv;
3729}
3730
3731static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3732                               u64 num_bytes)
3733{
3734        int ret = -ENOSPC;
3735        spin_lock(&block_rsv->lock);
3736        if (block_rsv->reserved >= num_bytes) {
3737                block_rsv->reserved -= num_bytes;
3738                if (block_rsv->reserved < block_rsv->size)
3739                        block_rsv->full = 0;
3740                ret = 0;
3741        }
3742        spin_unlock(&block_rsv->lock);
3743        return ret;
3744}
3745
3746static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3747                                u64 num_bytes, int update_size)
3748{
3749        spin_lock(&block_rsv->lock);
3750        block_rsv->reserved += num_bytes;
3751        if (update_size)
3752                block_rsv->size += num_bytes;
3753        else if (block_rsv->reserved >= block_rsv->size)
3754                block_rsv->full = 1;
3755        spin_unlock(&block_rsv->lock);
3756}
3757
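    /*
     * Shrink @block_rsv by @num_bytes ((u64)-1 means the entire size).
     * Any excess reserved space is first used to top up @dest, if given,
     * and whatever remains is returned to the space_info's bytes_may_use.
     */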
3758static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3759                                    struct btrfs_block_rsv *dest, u64 num_bytes)
3760{
3761        struct btrfs_space_info *space_info = block_rsv->space_info;
3762
3763        spin_lock(&block_rsv->lock);
3764        if (num_bytes == (u64)-1)
3765                num_bytes = block_rsv->size;
3766        block_rsv->size -= num_bytes;
3767        if (block_rsv->reserved >= block_rsv->size) {
3768                num_bytes = block_rsv->reserved - block_rsv->size;
3769                block_rsv->reserved = block_rsv->size;
3770                block_rsv->full = 1;
3771        } else {
3772                num_bytes = 0;
3773        }
3774        spin_unlock(&block_rsv->lock);
3775
3776        if (num_bytes > 0) {
3777                if (dest) {
3778                        spin_lock(&dest->lock);
3779                        if (!dest->full) {
3780                                u64 bytes_to_add;
3781
3782                                bytes_to_add = dest->size - dest->reserved;
3783                                bytes_to_add = min(num_bytes, bytes_to_add);
3784                                dest->reserved += bytes_to_add;
3785                                if (dest->reserved >= dest->size)
3786                                        dest->full = 1;
3787                                num_bytes -= bytes_to_add;
3788                        }
3789                        spin_unlock(&dest->lock);
3790                }
3791                if (num_bytes) {
3792                        spin_lock(&space_info->lock);
3793                        space_info->bytes_may_use -= num_bytes;
3794                        space_info->reservation_progress++;
3795                        spin_unlock(&space_info->lock);
3796                }
3797        }
3798}
3799
3800static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3801                                   struct btrfs_block_rsv *dst, u64 num_bytes)
3802{
3803        int ret;
3804
3805        ret = block_rsv_use_bytes(src, num_bytes);
3806        if (ret)
3807                return ret;
3808
3809        block_rsv_add_bytes(dst, num_bytes, 1);
3810        return 0;
3811}
3812
3813void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3814{
3815        memset(rsv, 0, sizeof(*rsv));
3816        spin_lock_init(&rsv->lock);
3817}
3818
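    /*
     * A typical (illustrative) life cycle for a dynamically allocated
     * reservation:
     *
     *        rsv = btrfs_alloc_block_rsv(root);
     *        ret = btrfs_block_rsv_add(root, rsv, num_bytes);
     *        ... use the reserved space ...
     *        btrfs_free_block_rsv(root, rsv);
     *
     * btrfs_free_block_rsv() releases anything still reserved back to the
     * space_info before freeing the structure.
     */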
3819struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3820{
3821        struct btrfs_block_rsv *block_rsv;
3822        struct btrfs_fs_info *fs_info = root->fs_info;
3823
3824        block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3825        if (!block_rsv)
3826                return NULL;
3827
3828        btrfs_init_block_rsv(block_rsv);
3829        block_rsv->space_info = __find_space_info(fs_info,
3830                                                  BTRFS_BLOCK_GROUP_METADATA);
3831        return block_rsv;
3832}
3833
3834void btrfs_free_block_rsv(struct btrfs_root *root,
3835                          struct btrfs_block_rsv *rsv)
3836{
3837        btrfs_block_rsv_release(root, rsv, (u64)-1);
3838        kfree(rsv);
3839}
3840
3841static inline int __block_rsv_add(struct btrfs_root *root,
3842                                  struct btrfs_block_rsv *block_rsv,
3843                                  u64 num_bytes, int flush)
3844{
3845        int ret;
3846
3847        if (num_bytes == 0)
3848                return 0;
3849
3850        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
3851        if (!ret) {
3852                block_rsv_add_bytes(block_rsv, num_bytes, 1);
3853                return 0;
3854        }
3855
3856        return ret;
3857}
3858
3859int btrfs_block_rsv_add(struct btrfs_root *root,
3860                        struct btrfs_block_rsv *block_rsv,
3861                        u64 num_bytes)
3862{
3863        return __block_rsv_add(root, block_rsv, num_bytes, 1);
3864}
3865
3866int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
3867                                struct btrfs_block_rsv *block_rsv,
3868                                u64 num_bytes)
3869{
3870        return __block_rsv_add(root, block_rsv, num_bytes, 0);
3871}
3872
3873int btrfs_block_rsv_check(struct btrfs_root *root,
3874                          struct btrfs_block_rsv *block_rsv, int min_factor)
3875{
3876        u64 num_bytes = 0;
3877        int ret = -ENOSPC;
3878
3879        if (!block_rsv)
3880                return 0;
3881
3882        spin_lock(&block_rsv->lock);
3883        num_bytes = div_factor(block_rsv->size, min_factor);
3884        if (block_rsv->reserved >= num_bytes)
3885                ret = 0;
3886        spin_unlock(&block_rsv->lock);
3887
3888        return ret;
3889}
3890
3891static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
3892                                           struct btrfs_block_rsv *block_rsv,
3893                                           u64 min_reserved, int flush)
3894{
3895        u64 num_bytes = 0;
3896        int ret = -ENOSPC;
3897
3898        if (!block_rsv)
3899                return 0;
3900
3901        spin_lock(&block_rsv->lock);
3902        num_bytes = min_reserved;
3903        if (block_rsv->reserved >= num_bytes)
3904                ret = 0;
3905        else
3906                num_bytes -= block_rsv->reserved;
3907        spin_unlock(&block_rsv->lock);
3908
3909        if (!ret)
3910                return 0;
3911
3912        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
3913        if (!ret) {
3914                block_rsv_add_bytes(block_rsv, num_bytes, 0);
3915                return 0;
3916        }
3917
3918        return ret;
3919}
3920
3921int btrfs_block_rsv_refill(struct btrfs_root *root,
3922                           struct btrfs_block_rsv *block_rsv,
3923                           u64 min_reserved)
3924{
3925        return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
3926}
3927
3928int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
3929                                   struct btrfs_block_rsv *block_rsv,
3930                                   u64 min_reserved)
3931{
3932        return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
3933}
3934
3935int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3936                            struct btrfs_block_rsv *dst_rsv,
3937                            u64 num_bytes)
3938{
3939        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3940}
3941
3942void btrfs_block_rsv_release(struct btrfs_root *root,
3943                             struct btrfs_block_rsv *block_rsv,
3944                             u64 num_bytes)
3945{
3946        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3947        if (global_rsv->full || global_rsv == block_rsv ||
3948            block_rsv->space_info != global_rsv->space_info)
3949                global_rsv = NULL;
3950        block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3951}
3952
3953/*
3954 * Helper to calculate the size of the global block reservation.
3955 * The desired value is the sum of space used by the extent tree,
3956 * checksum tree and root tree.
3957 */
3958static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3959{
3960        struct btrfs_space_info *sinfo;
3961        u64 num_bytes;
3962        u64 meta_used;
3963        u64 data_used;
3964        int csum_size = btrfs_super_csum_size(fs_info->super_copy);
3965
3966        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3967        spin_lock(&sinfo->lock);
3968        data_used = sinfo->bytes_used;
3969        spin_unlock(&sinfo->lock);
3970
3971        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3972        spin_lock(&sinfo->lock);
3973        if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
3974                data_used = 0;
3975        meta_used = sinfo->bytes_used;
3976        spin_unlock(&sinfo->lock);
3977
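            /*
             * Rough sizing: roughly twice the checksum bytes needed to
             * cover the data in use, plus 2% of the data and metadata in
             * use, capped at one third of the metadata currently used.
             */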
3978        num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
3979                    csum_size * 2;
3980        num_bytes += div64_u64(data_used + meta_used, 50);
3981
3982        if (num_bytes * 3 > meta_used)
3983                num_bytes = div64_u64(meta_used, 3);
3984
3985        return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
3986}
3987
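    /*
     * Recompute the target size of the global block reservation and refill
     * it from whatever metadata space is not otherwise accounted for,
     * returning any excess back to bytes_may_use.
     */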
3988static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3989{
3990        struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3991        struct btrfs_space_info *sinfo = block_rsv->space_info;
3992        u64 num_bytes;
3993
3994        num_bytes = calc_global_metadata_size(fs_info);
3995
3996        spin_lock(&block_rsv->lock);
3997        spin_lock(&sinfo->lock);
3998
3999        block_rsv->size = num_bytes;
4000
4001        num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
4002                    sinfo->bytes_reserved + sinfo->bytes_readonly +
4003                    sinfo->bytes_may_use;
4004
4005        if (sinfo->total_bytes > num_bytes) {
4006                num_bytes = sinfo->total_bytes - num_bytes;
4007                block_rsv->reserved += num_bytes;
4008                sinfo->bytes_may_use += num_bytes;
4009        }
4010
4011        if (block_rsv->reserved >= block_rsv->size) {
4012                num_bytes = block_rsv->reserved - block_rsv->size;
4013                sinfo->bytes_may_use -= num_bytes;
4014                sinfo->reservation_progress++;
4015                block_rsv->reserved = block_rsv->size;
4016                block_rsv->full = 1;
4017        }
4018
4019        spin_unlock(&sinfo->lock);
4020        spin_unlock(&block_rsv->lock);
4021}
4022
4023static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4024{
4025        struct btrfs_space_info *space_info;
4026
4027        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4028        fs_info->chunk_block_rsv.space_info = space_info;
4029
4030        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4031        fs_info->global_block_rsv.space_info = space_info;
4032        fs_info->delalloc_block_rsv.space_info = space_info;
4033        fs_info->trans_block_rsv.space_info = space_info;
4034        fs_info->empty_block_rsv.space_info = space_info;
4035        fs_info->delayed_block_rsv.space_info = space_info;
4036
4037        fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
4038        fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
4039        fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
4040        fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
4041        fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
4042
4043        update_global_block_rsv(fs_info);
4044}
4045
4046static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4047{
4048        block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
4049        WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4050        WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4051        WARN_ON(fs_info->trans_block_rsv.size > 0);
4052        WARN_ON(fs_info->trans_block_rsv.reserved > 0);
4053        WARN_ON(fs_info->chunk_block_rsv.size > 0);
4054        WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
4055        WARN_ON(fs_info->delayed_block_rsv.size > 0);
4056        WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
4057}
4058
4059void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4060                                  struct btrfs_root *root)
4061{
4062        if (!trans->bytes_reserved)
4063                return;
4064
4065        btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4066        trans->bytes_reserved = 0;
4067}
4068
4069int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4070                                  struct inode *inode)
4071{
4072        struct btrfs_root *root = BTRFS_I(inode)->root;
4073        struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4074        struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
4075
4076        /*
4077         * We need to hold space in order to delete our orphan item once we've
4078         * added it, so this takes the reservation so we can release it later
4079         * when we are truly done with the orphan item.
4080         */
4081        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4082        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4083}
4084
4085void btrfs_orphan_release_metadata(struct inode *inode)
4086{
4087        struct btrfs_root *root = BTRFS_I(inode)->root;
4088        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4089        btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4090}
4091
4092int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
4093                                struct btrfs_pending_snapshot *pending)
4094{
4095        struct btrfs_root *root = pending->root;
4096        struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4097        struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
4098        /*
4099         * two for root back/forward refs, two for directory entries
4100         * and one for root of the snapshot.
4101         */
4102        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
4103        dst_rsv->space_info = src_rsv->space_info;
4104        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4105}
4106
4107/**
4108 * drop_outstanding_extent - drop an outstanding extent
4109 * @inode: the inode we're dropping the extent for
4110 *
4111 * This is called when we are freeing up an outstanding extent, either called
4112 * after an error or after an extent is written.  This will return the number of
4113 * reserved extents that need to be freed.  This must be called with
4114 * BTRFS_I(inode)->lock held.
4115 */
4116static unsigned drop_outstanding_extent(struct inode *inode)
4117{
4118        unsigned drop_inode_space = 0;
4119        unsigned dropped_extents = 0;
4120
4121        BUG_ON(!BTRFS_I(inode)->outstanding_extents);
4122        BTRFS_I(inode)->outstanding_extents--;
4123
4124        if (BTRFS_I(inode)->outstanding_extents == 0 &&
4125            BTRFS_I(inode)->delalloc_meta_reserved) {
4126                drop_inode_space = 1;
4127                BTRFS_I(inode)->delalloc_meta_reserved = 0;
4128        }
4129
4130        /*
4131         * If we have the same number of, or more, outstanding extents than
4132         * we have reserved then we need to leave the reserved extents count alone.
4133         */
4134        if (BTRFS_I(inode)->outstanding_extents >=
4135            BTRFS_I(inode)->reserved_extents)
4136                return drop_inode_space;
4137
4138        dropped_extents = BTRFS_I(inode)->reserved_extents -
4139                BTRFS_I(inode)->outstanding_extents;
4140        BTRFS_I(inode)->reserved_extents -= dropped_extents;
4141        return dropped_extents + drop_inode_space;
4142}
4143
4144/**
4145 * calc_csum_metadata_size - return the amount of metadata space that must be
4146 *      reserved/freed for the given bytes.
4147 * @inode: the inode we're manipulating
4148 * @num_bytes: the number of bytes in question
4149 * @reserve: 1 if we are reserving space, 0 if we are freeing space
4150 *
4151 * This adjusts the number of csum_bytes in the inode and then returns the
4152 * correct amount of metadata that must either be reserved or freed.  We
4153 * calculate how many checksums we can fit into one leaf and then divide the
4154 * number of bytes that will need to be checksummed by this value to figure out
4155 * how many checksums will be required.  If we are adding bytes then the number
4156 * may go up and we will return the number of additional bytes that must be
4157 * reserved.  If it is going down we will return the number of bytes that must
4158 * be freed.
4159 *
4160 * This must be called with BTRFS_I(inode)->lock held.
4161 */
4162static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4163                                   int reserve)
4164{
4165        struct btrfs_root *root = BTRFS_I(inode)->root;
4166        u64 csum_size;
4167        int num_csums_per_leaf;
4168        int num_csums;
4169        int old_csums;
4170
4171        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4172            BTRFS_I(inode)->csum_bytes == 0)
4173                return 0;
4174
4175        old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4176        if (reserve)
4177                BTRFS_I(inode)->csum_bytes += num_bytes;
4178        else
4179                BTRFS_I(inode)->csum_bytes -= num_bytes;
4180        csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4181        num_csums_per_leaf = (int)div64_u64(csum_size,
4182                                            sizeof(struct btrfs_csum_item) +
4183                                            sizeof(struct btrfs_disk_key));
4184        num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4185        num_csums = num_csums + num_csums_per_leaf - 1;
4186        num_csums = num_csums / num_csums_per_leaf;
4187
4188        old_csums = old_csums + num_csums_per_leaf - 1;
4189        old_csums = old_csums / num_csums_per_leaf;
4190
4191        /* No change, no need to reserve more */
4192        if (old_csums == num_csums)
4193                return 0;
4194
4195        if (reserve)
4196                return btrfs_calc_trans_metadata_size(root,
4197                                                      num_csums - old_csums);
4198
4199        return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
4200}
4201
4202int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4203{
4204        struct btrfs_root *root = BTRFS_I(inode)->root;
4205        struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4206        u64 to_reserve = 0;
4207        u64 csum_bytes;
4208        unsigned nr_extents = 0;
4209        int extra_reserve = 0;
4210        int flush = 1;
4211        int ret;
4212
4213        /* Need to be holding the i_mutex here unless this is the free space cache inode */
4214        if (btrfs_is_free_space_inode(root, inode))
4215                flush = 0;
4216        else
4217                WARN_ON(!mutex_is_locked(&inode->i_mutex));
4218
4219        if (flush && btrfs_transaction_in_commit(root->fs_info))
4220                schedule_timeout(1);
4221
4222        num_bytes = ALIGN(num_bytes, root->sectorsize);
4223
4224        spin_lock(&BTRFS_I(inode)->lock);
4225        BTRFS_I(inode)->outstanding_extents++;
4226
4227        if (BTRFS_I(inode)->outstanding_extents >
4228            BTRFS_I(inode)->reserved_extents)
4229                nr_extents = BTRFS_I(inode)->outstanding_extents -
4230                        BTRFS_I(inode)->reserved_extents;
4231
4232        /*
4233         * Add an item to reserve for updating the inode when we complete the
4234         * delalloc io.
4235         */
4236        if (!BTRFS_I(inode)->delalloc_meta_reserved) {
4237                nr_extents++;
4238                extra_reserve = 1;
4239        }
4240
4241        to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4242        to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4243        csum_bytes = BTRFS_I(inode)->csum_bytes;
4244        spin_unlock(&BTRFS_I(inode)->lock);
4245
4246        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4247        if (ret) {
4248                u64 to_free = 0;
4249                unsigned dropped;
4250
4251                spin_lock(&BTRFS_I(inode)->lock);
4252                dropped = drop_outstanding_extent(inode);
4253                /*
4254                 * If the inode's csum_bytes is the same as the original
4255                 * csum_bytes then we know we haven't raced with any free()ers
4256                 * so we can just reduce our inode's csum_bytes and carry on.
4257                 * Otherwise we have to do the normal free thing to account for
4258                 * the case that the free side didn't free up its reserve
4259                 * because of this outstanding reservation.
4260                 */
4261                if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4262                        calc_csum_metadata_size(inode, num_bytes, 0);
4263                else
4264                        to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4265                spin_unlock(&BTRFS_I(inode)->lock);
4266                if (dropped)
4267                        to_free += btrfs_calc_trans_metadata_size(root, dropped);
4268
4269                if (to_free)
4270                        btrfs_block_rsv_release(root, block_rsv, to_free);
4271                return ret;
4272        }
4273
4274        spin_lock(&BTRFS_I(inode)->lock);
4275        if (extra_reserve) {
4276                BTRFS_I(inode)->delalloc_meta_reserved = 1;
4277                nr_extents--;
4278        }
4279        BTRFS_I(inode)->reserved_extents += nr_extents;
4280        spin_unlock(&BTRFS_I(inode)->lock);
4281
4282        block_rsv_add_bytes(block_rsv, to_reserve, 1);
4283
4284        return 0;
4285}
4286
4287/**
4288 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4289 * @inode: the inode to release the reservation for
4290 * @num_bytes: the number of bytes we're releasing
4291 *
4292 * This will release the metadata reservation for an inode.  This can be called
4293 * once we complete IO for a given set of bytes to release their metadata
4294 * reservations.
4295 */
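/*
 * Typically paired with btrfs_delalloc_reserve_metadata(): the write
 * path reserves before dirtying the range and calls this with the
 * same byte count once the IO completes (or on error), as described
 * above.
 */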
4296void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4297{
4298        struct btrfs_root *root = BTRFS_I(inode)->root;
4299        u64 to_free = 0;
4300        unsigned dropped;
4301
4302        num_bytes = ALIGN(num_bytes, root->sectorsize);
4303        spin_lock(&BTRFS_I(inode)->lock);
4304        dropped = drop_outstanding_extent(inode);
4305
4306        to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4307        spin_unlock(&BTRFS_I(inode)->lock);
4308        if (dropped > 0)
4309                to_free += btrfs_calc_trans_metadata_size(root, dropped);
4310
4311        btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4312                                to_free);
4313}
4314
4315/**
4316 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4317 * @inode: inode we're writing to
4318 * @num_bytes: the number of bytes we want to allocate
4319 *
4320 * This will do the following things
4321 *
4322 * o reserve space in the data space info for num_bytes
4323 * o reserve space in the metadata space info based on number of outstanding
4324 *   extents and how many csums will be needed
4325 * o add to the inode's ->delalloc_bytes
4326 * o add it to the fs_info's delalloc inodes list.
4327 *
4328 * This will return 0 for success and -ENOSPC if there is no space left.
4329 */
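/*
 * Sketch of the intended pairing, assuming a simple buffered write
 * path (the copy helper below is hypothetical):
 *
 *	ret = btrfs_delalloc_reserve_space(inode, count);
 *	if (ret)
 *		return ret;
 *	ret = copy_pages_from_user(...);	// hypothetical step
 *	if (ret)
 *		btrfs_delalloc_release_space(inode, count);
 *
 * On success the reservation is consumed when the delalloc IO
 * completes; btrfs_delalloc_release_space() is only for the case
 * where the reservation turns out not to be needed.
 */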
4330int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4331{
4332        int ret;
4333
4334        ret = btrfs_check_data_free_space(inode, num_bytes);
4335        if (ret)
4336                return ret;
4337
4338        ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
4339        if (ret) {
4340                btrfs_free_reserved_data_space(inode, num_bytes);
4341                return ret;
4342        }
4343
4344        return 0;
4345}
4346
4347/**
4348 * btrfs_delalloc_release_space - release data and metadata space for delalloc
4349 * @inode: inode we're releasing space for
4350 * @num_bytes: the number of bytes we want to free up
4351 *
4352 * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
4353 * called in the case that we don't need the metadata AND data reservations
4354 * anymore, for example if there is an error or we insert an inline extent.
4355 *
4356 * This function will release the metadata space that was not used and will
4357 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4358 * list if there are no delalloc bytes left.
4359 */
4360void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4361{
4362        btrfs_delalloc_release_metadata(inode, num_bytes);
4363        btrfs_free_reserved_data_space(inode, num_bytes);
4364}
4365
4366static int update_block_group(struct btrfs_trans_handle *trans,
4367                              struct btrfs_root *root,
4368                              u64 bytenr, u64 num_bytes, int alloc)
4369{
4370        struct btrfs_block_group_cache *cache = NULL;
4371        struct btrfs_fs_info *info = root->fs_info;
4372        u64 total = num_bytes;
4373        u64 old_val;
4374        u64 byte_in_group;
4375        int factor;
4376
4377        /* block accounting for super block */
4378        spin_lock(&info->delalloc_lock);
4379        old_val = btrfs_super_bytes_used(info->super_copy);
4380        if (alloc)
4381                old_val += num_bytes;
4382        else
4383                old_val -= num_bytes;
4384        btrfs_set_super_bytes_used(info->super_copy, old_val);
4385        spin_unlock(&info->delalloc_lock);
4386
4387        while (total) {
4388                cache = btrfs_lookup_block_group(info, bytenr);
4389                if (!cache)
4390                        return -1;
4391                if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
4392                                    BTRFS_BLOCK_GROUP_RAID1 |
4393                                    BTRFS_BLOCK_GROUP_RAID10))
4394                        factor = 2;
4395                else
4396                        factor = 1;
4397                /*
4398                 * If this block group has free space cache written out, we
4399                 * need to make sure to load it if we are removing space.  This
4400                 * is because we need the unpinning stage to actually add the
4401                 * space back to the block group, otherwise we will leak space.
4402                 */
4403                if (!alloc && cache->cached == BTRFS_CACHE_NO)
4404                        cache_block_group(cache, trans, NULL, 1);
4405
4406                byte_in_group = bytenr - cache->key.objectid;
4407                WARN_ON(byte_in_group > cache->key.offset);
4408
4409                spin_lock(&cache->space_info->lock);
4410                spin_lock(&cache->lock);
4411
4412                if (btrfs_test_opt(root, SPACE_CACHE) &&
4413                    cache->disk_cache_state < BTRFS_DC_CLEAR)
4414                        cache->disk_cache_state = BTRFS_DC_CLEAR;
4415
4416                cache->dirty = 1;
4417                old_val = btrfs_block_group_used(&cache->item);
4418                num_bytes = min(total, cache->key.offset - byte_in_group);
4419                if (alloc) {
4420                        old_val += num_bytes;
4421                        btrfs_set_block_group_used(&cache->item, old_val);
4422                        cache->reserved -= num_bytes;
4423                        cache->space_info->bytes_reserved -= num_bytes;
4424                        cache->space_info->bytes_used += num_bytes;
4425                        cache->space_info->disk_used += num_bytes * factor;
4426                        spin_unlock(&cache->lock);
4427                        spin_unlock(&cache->space_info->lock);
4428                } else {
4429                        old_val -= num_bytes;
4430                        btrfs_set_block_group_used(&cache->item, old_val);
4431                        cache->pinned += num_bytes;
4432                        cache->space_info->bytes_pinned += num_bytes;
4433                        cache->space_info->bytes_used -= num_bytes;
4434                        cache->space_info->disk_used -= num_bytes * factor;
4435                        spin_unlock(&cache->lock);
4436                        spin_unlock(&cache->space_info->lock);
4437
4438                        set_extent_dirty(info->pinned_extents,
4439                                         bytenr, bytenr + num_bytes - 1,
4440                                         GFP_NOFS | __GFP_NOFAIL);
4441                }
4442                btrfs_put_block_group(cache);
4443                total -= num_bytes;
4444                bytenr += num_bytes;
4445        }
4446        return 0;
4447}
4448
4449static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
4450{
4451        struct btrfs_block_group_cache *cache;
4452        u64 bytenr;
4453
4454        cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
4455        if (!cache)
4456                return 0;
4457
4458        bytenr = cache->key.objectid;
4459        btrfs_put_block_group(cache);
4460
4461        return bytenr;
4462}
4463
4464static int pin_down_extent(struct btrfs_root *root,
4465                           struct btrfs_block_group_cache *cache,
4466                           u64 bytenr, u64 num_bytes, int reserved)
4467{
4468        spin_lock(&cache->space_info->lock);
4469        spin_lock(&cache->lock);
4470        cache->pinned += num_bytes;
4471        cache->space_info->bytes_pinned += num_bytes;
4472        if (reserved) {
4473                cache->reserved -= num_bytes;
4474                cache->space_info->bytes_reserved -= num_bytes;
4475        }
4476        spin_unlock(&cache->lock);
4477        spin_unlock(&cache->space_info->lock);
4478
4479        set_extent_dirty(root->fs_info->pinned_extents, bytenr,
4480                         bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
4481        return 0;
4482}
4483
4484/*
4485 * this function must be called within transaction
4486 */
4487int btrfs_pin_extent(struct btrfs_root *root,
4488                     u64 bytenr, u64 num_bytes, int reserved)
4489{
4490        struct btrfs_block_group_cache *cache;
4491
4492        cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4493        BUG_ON(!cache);
4494
4495        pin_down_extent(root, cache, bytenr, num_bytes, reserved);
4496
4497        btrfs_put_block_group(cache);
4498        return 0;
4499}
4500
4501/*
4502 * this function must be called within transaction
4503 */
4504int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4505                                    struct btrfs_root *root,
4506                                    u64 bytenr, u64 num_bytes)
4507{
4508        struct btrfs_block_group_cache *cache;
4509
4510        cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4511        BUG_ON(!cache);
4512
4513        /*
4514         * pull in the free space cache (if any) so that our pin
4515         * removes the free space from the cache.  We have load_only set
4516         * to one because the slow code to read in the free extents does check
4517         * the pinned extents.
4518         */
4519        cache_block_group(cache, trans, root, 1);
4520
4521        pin_down_extent(root, cache, bytenr, num_bytes, 0);
4522
4523        /* remove us from the free space cache (if we're there at all) */
4524        btrfs_remove_free_space(cache, bytenr, num_bytes);
4525        btrfs_put_block_group(cache);
4526        return 0;
4527}
4528
4529/**
4530 * btrfs_update_reserved_bytes - update the block_group and space info counters
4531 * @cache:      The cache we are manipulating
4532 * @num_bytes:  The number of bytes in question
4533 * @reserve:    One of the reservation enums
4534 *
4535 * This is called by the allocator when it reserves space, or by somebody who is
4536 * freeing space that was never actually used on disk.  For example if you
4537 * reserve some space for a new leaf in transaction A and before transaction A
4538 * commits you free that leaf, you call this with reserve set to 0 in order to
4539 * clear the reservation.
4540 *
4541 * Metadata reservations should be made with RESERVE_ALLOC so we do the proper
4542 * ENOSPC accounting.  For data we handle the reservation through clearing the
4543 * delalloc bits in the io_tree.  We have to do this since we could end up
4544 * allocating less disk space for the amount of data we have reserved in the
4545 * case of compression.
4546 *
4547 * If this is a reservation and the block group has become read only we cannot
4548 * make the reservation and return -EAGAIN, otherwise this function always
4549 * succeeds.
4550 */
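/*
 * Summary of how the counters below move for each reserve value
 * (n = num_bytes):
 *   RESERVE_ALLOC             reserved += n, bytes_reserved += n,
 *                             bytes_may_use -= n (or -EAGAIN if ro)
 *   RESERVE_ALLOC_NO_ACCOUNT  reserved += n, bytes_reserved += n
 *                             (or -EAGAIN if ro)
 *   RESERVE_FREE              reserved -= n, bytes_reserved -= n,
 *                             plus bytes_readonly += n if the group is ro
 */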
4551static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4552                                       u64 num_bytes, int reserve)
4553{
4554        struct btrfs_space_info *space_info = cache->space_info;
4555        int ret = 0;
4556        spin_lock(&space_info->lock);
4557        spin_lock(&cache->lock);
4558        if (reserve != RESERVE_FREE) {
4559                if (cache->ro) {
4560                        ret = -EAGAIN;
4561                } else {
4562                        cache->reserved += num_bytes;
4563                        space_info->bytes_reserved += num_bytes;
4564                        if (reserve == RESERVE_ALLOC) {
4565                                BUG_ON(space_info->bytes_may_use < num_bytes);
4566                                space_info->bytes_may_use -= num_bytes;
4567                        }
4568                }
4569        } else {
4570                if (cache->ro)
4571                        space_info->bytes_readonly += num_bytes;
4572                cache->reserved -= num_bytes;
4573                space_info->bytes_reserved -= num_bytes;
4574                space_info->reservation_progress++;
4575        }
4576        spin_unlock(&cache->lock);
4577        spin_unlock(&space_info->lock);
4578        return ret;
4579}
4580
4581int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
4582                                struct btrfs_root *root)
4583{
4584        struct btrfs_fs_info *fs_info = root->fs_info;
4585        struct btrfs_caching_control *next;
4586        struct btrfs_caching_control *caching_ctl;
4587        struct btrfs_block_group_cache *cache;
4588
4589        down_write(&fs_info->extent_commit_sem);
4590
4591        list_for_each_entry_safe(caching_ctl, next,
4592                                 &fs_info->caching_block_groups, list) {
4593                cache = caching_ctl->block_group;
4594                if (block_group_cache_done(cache)) {
4595                        cache->last_byte_to_unpin = (u64)-1;
4596                        list_del_init(&caching_ctl->list);
4597                        put_caching_control(caching_ctl);
4598                } else {
4599                        cache->last_byte_to_unpin = caching_ctl->progress;
4600                }
4601        }
4602
4603        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4604                fs_info->pinned_extents = &fs_info->freed_extents[1];
4605        else
4606                fs_info->pinned_extents = &fs_info->freed_extents[0];
4607
4608        up_write(&fs_info->extent_commit_sem);
4609
4610        update_global_block_rsv(fs_info);
4611        return 0;
4612}
4613
4614static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4615{
4616        struct btrfs_fs_info *fs_info = root->fs_info;
4617        struct btrfs_block_group_cache *cache = NULL;
4618        u64 len;
4619
4620        while (start <= end) {
4621                if (!cache ||
4622                    start >= cache->key.objectid + cache->key.offset) {
4623                        if (cache)
4624                                btrfs_put_block_group(cache);
4625                        cache = btrfs_lookup_block_group(fs_info, start);
4626                        BUG_ON(!cache);
4627                }
4628
4629                len = cache->key.objectid + cache->key.offset - start;
4630                len = min(len, end + 1 - start);
4631
4632                if (start < cache->last_byte_to_unpin) {
4633                        len = min(len, cache->last_byte_to_unpin - start);
4634                        btrfs_add_free_space(cache, start, len);
4635                }
4636
4637                start += len;
4638
4639                spin_lock(&cache->space_info->lock);
4640                spin_lock(&cache->lock);
4641                cache->pinned -= len;
4642                cache->space_info->bytes_pinned -= len;
4643                if (cache->ro)
4644                        cache->space_info->bytes_readonly += len;
4645                spin_unlock(&cache->lock);
4646                spin_unlock(&cache->space_info->lock);
4647        }
4648
4649        if (cache)
4650                btrfs_put_block_group(cache);
4651        return 0;
4652}
4653
4654int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4655                               struct btrfs_root *root)
4656{
4657        struct btrfs_fs_info *fs_info = root->fs_info;
4658        struct extent_io_tree *unpin;
4659        u64 start;
4660        u64 end;
4661        int ret;
4662
4663        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
4664                unpin = &fs_info->freed_extents[1];
4665        else
4666                unpin = &fs_info->freed_extents[0];
4667
4668        while (1) {
4669                ret = find_first_extent_bit(unpin, 0, &start, &end,
4670                                            EXTENT_DIRTY);
4671                if (ret)
4672                        break;
4673
4674                if (btrfs_test_opt(root, DISCARD))
4675                        ret = btrfs_discard_extent(root, start,
4676                                                   end + 1 - start, NULL);
4677
4678                clear_extent_dirty(unpin, start, end, GFP_NOFS);
4679                unpin_extent_range(root, start, end);
4680                cond_resched();
4681        }
4682
4683        return 0;
4684}
4685
4686static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
4687                                struct btrfs_root *root,
4688                                u64 bytenr, u64 num_bytes, u64 parent,
4689                                u64 root_objectid, u64 owner_objectid,
4690                                u64 owner_offset, int refs_to_drop,
4691                                struct btrfs_delayed_extent_op *extent_op)
4692{
4693        struct btrfs_key key;
4694        struct btrfs_path *path;
4695        struct btrfs_fs_info *info = root->fs_info;
4696        struct btrfs_root *extent_root = info->extent_root;
4697        struct extent_buffer *leaf;
4698        struct btrfs_extent_item *ei;
4699        struct btrfs_extent_inline_ref *iref;
4700        int ret;
4701        int is_data;
4702        int extent_slot = 0;
4703        int found_extent = 0;
4704        int num_to_del = 1;
4705        u32 item_size;
4706        u64 refs;
4707
4708        path = btrfs_alloc_path();
4709        if (!path)
4710                return -ENOMEM;
4711
4712        path->reada = 1;
4713        path->leave_spinning = 1;
4714
4715        is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
4716        BUG_ON(!is_data && refs_to_drop != 1);
4717
4718        ret = lookup_extent_backref(trans, extent_root, path, &iref,
4719                                    bytenr, num_bytes, parent,
4720                                    root_objectid, owner_objectid,
4721                                    owner_offset);
4722        if (ret == 0) {
4723                extent_slot = path->slots[0];
4724                while (extent_slot >= 0) {
4725                        btrfs_item_key_to_cpu(path->nodes[0], &key,
4726                                              extent_slot);
4727                        if (key.objectid != bytenr)
4728                                break;
4729                        if (key.type == BTRFS_EXTENT_ITEM_KEY &&
4730                            key.offset == num_bytes) {
4731                                found_extent = 1;
4732                                break;
4733                        }
4734                        if (path->slots[0] - extent_slot > 5)
4735                                break;
4736                        extent_slot--;
4737                }
4738#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4739                item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
4740                if (found_extent && item_size < sizeof(*ei))
4741                        found_extent = 0;
4742#endif
4743                if (!found_extent) {
4744                        BUG_ON(iref);
4745                        ret = remove_extent_backref(trans, extent_root, path,
4746                                                    NULL, refs_to_drop,
4747                                                    is_data);
4748                        BUG_ON(ret);
4749                        btrfs_release_path(path);
4750                        path->leave_spinning = 1;
4751
4752                        key.objectid = bytenr;
4753                        key.type = BTRFS_EXTENT_ITEM_KEY;
4754                        key.offset = num_bytes;
4755
4756                        ret = btrfs_search_slot(trans, extent_root,
4757                                                &key, path, -1, 1);
4758                        if (ret) {
4759                                printk(KERN_ERR "umm, got %d back from search"
4760                                       ", was looking for %llu\n", ret,
4761                                       (unsigned long long)bytenr);
4762                                if (ret > 0)
4763                                        btrfs_print_leaf(extent_root,
4764                                                         path->nodes[0]);
4765                        }
4766                        BUG_ON(ret);
4767                        extent_slot = path->slots[0];
4768                }
4769        } else {
4770                btrfs_print_leaf(extent_root, path->nodes[0]);
4771                WARN_ON(1);
4772                printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
4773                       "parent %llu root %llu  owner %llu offset %llu\n",
4774                       (unsigned long long)bytenr,
4775                       (unsigned long long)parent,
4776                       (unsigned long long)root_objectid,
4777                       (unsigned long long)owner_objectid,
4778                       (unsigned long long)owner_offset);
4779        }
4780
4781        leaf = path->nodes[0];
4782        item_size = btrfs_item_size_nr(leaf, extent_slot);
4783#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
4784        if (item_size < sizeof(*ei)) {
4785                BUG_ON(found_extent || extent_slot != path->slots[0]);
4786                ret = convert_extent_item_v0(trans, extent_root, path,
4787                                             owner_objectid, 0);
4788                BUG_ON(ret < 0);
4789
4790                btrfs_release_path(path);
4791                path->leave_spinning = 1;
4792
4793                key.objectid = bytenr;
4794                key.type = BTRFS_EXTENT_ITEM_KEY;
4795                key.offset = num_bytes;
4796
4797                ret = btrfs_search_slot(trans, extent_root, &key, path,
4798                                        -1, 1);
4799                if (ret) {
4800                        printk(KERN_ERR "umm, got %d back from search"
4801                               ", was looking for %llu\n", ret,
4802                               (unsigned long long)bytenr);
4803                        btrfs_print_leaf(extent_root, path->nodes[0]);
4804                }
4805                BUG_ON(ret);
4806                extent_slot = path->slots[0];
4807                leaf = path->nodes[0];
4808                item_size = btrfs_item_size_nr(leaf, extent_slot);
4809        }
4810#endif
4811        BUG_ON(item_size < sizeof(*ei));
4812        ei = btrfs_item_ptr(leaf, extent_slot,
4813                            struct btrfs_extent_item);
4814        if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4815                struct btrfs_tree_block_info *bi;
4816                BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
4817                bi = (struct btrfs_tree_block_info *)(ei + 1);
4818                WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
4819        }
4820
4821        refs = btrfs_extent_refs(leaf, ei);
4822        BUG_ON(refs < refs_to_drop);
4823        refs -= refs_to_drop;
4824
4825        if (refs > 0) {
4826                if (extent_op)
4827                        __run_delayed_extent_op(extent_op, leaf, ei);
4828                /*
4829                 * In the case of inline back ref, reference count will
4830                 * be updated by remove_extent_backref
4831                 */
4832                if (iref) {
4833                        BUG_ON(!found_extent);
4834                } else {
4835                        btrfs_set_extent_refs(leaf, ei, refs);
4836                        btrfs_mark_buffer_dirty(leaf);
4837                }
4838                if (found_extent) {
4839                        ret = remove_extent_backref(trans, extent_root, path,
4840                                                    iref, refs_to_drop,
4841                                                    is_data);
4842                        BUG_ON(ret);
4843                }
4844        } else {
4845                if (found_extent) {
4846                        BUG_ON(is_data && refs_to_drop !=
4847                               extent_data_ref_count(root, path, iref));
4848                        if (iref) {
4849                                BUG_ON(path->slots[0] != extent_slot);
4850                        } else {
4851                                BUG_ON(path->slots[0] != extent_slot + 1);
4852                                path->slots[0] = extent_slot;
4853                                num_to_del = 2;
4854                        }
4855                }
4856
4857                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
4858                                      num_to_del);
4859                BUG_ON(ret);
4860                btrfs_release_path(path);
4861
4862                if (is_data) {
4863                        ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
4864                        BUG_ON(ret);
4865                } else {
4866                        invalidate_mapping_pages(info->btree_inode->i_mapping,
4867                             bytenr >> PAGE_CACHE_SHIFT,
4868                             (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
4869                }
4870
4871                ret = update_block_group(trans, root, bytenr, num_bytes, 0);
4872                BUG_ON(ret);
4873        }
4874        btrfs_free_path(path);
4875        return ret;
4876}
4877
4878/*
4879 * when we free a block, it is possible (and likely) that we free the last
4880 * delayed ref for that extent as well.  This searches the delayed ref tree for
4881 * a given extent, and if there are no other delayed refs to be processed, it
4882 * removes it from the tree.
4883 */
4884static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4885                                      struct btrfs_root *root, u64 bytenr)
4886{
4887        struct btrfs_delayed_ref_head *head;
4888        struct btrfs_delayed_ref_root *delayed_refs;
4889        struct btrfs_delayed_ref_node *ref;
4890        struct rb_node *node;
4891        int ret = 0;
4892
4893        delayed_refs = &trans->transaction->delayed_refs;
4894        spin_lock(&delayed_refs->lock);
4895        head = btrfs_find_delayed_ref_head(trans, bytenr);
4896        if (!head)
4897                goto out;
4898
4899        node = rb_prev(&head->node.rb_node);
4900        if (!node)
4901                goto out;
4902
4903        ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
4904
4905        /* there are still entries for this ref, we can't drop it */
4906        if (ref->bytenr == bytenr)
4907                goto out;
4908
4909        if (head->extent_op) {
4910                if (!head->must_insert_reserved)
4911                        goto out;
4912                kfree(head->extent_op);
4913                head->extent_op = NULL;
4914        }
4915
4916        /*
4917         * waiting for the lock here would deadlock.  If someone else has it
4918         * locked they are already in the process of dropping it anyway
4919         */
4920        if (!mutex_trylock(&head->mutex))
4921                goto out;
4922
4923        /*
4924         * at this point we have a head with no other entries.  Go
4925         * ahead and process it.
4926         */
4927        head->node.in_tree = 0;
4928        rb_erase(&head->node.rb_node, &delayed_refs->root);
4929
4930        delayed_refs->num_entries--;
4931
4932        /*
4933         * we don't take a ref on the node because we're removing it from the
4934         * tree, so we just steal the ref the tree was holding.
4935         */
4936        delayed_refs->num_heads--;
4937        if (list_empty(&head->cluster))
4938                delayed_refs->num_heads_ready--;
4939
4940        list_del_init(&head->cluster);
4941        spin_unlock(&delayed_refs->lock);
4942
4943        BUG_ON(head->extent_op);
4944        if (head->must_insert_reserved)
4945                ret = 1;
4946
4947        mutex_unlock(&head->mutex);
4948        btrfs_put_delayed_ref(&head->node);
4949        return ret;
4950out:
4951        spin_unlock(&delayed_refs->lock);
4952        return 0;
4953}
4954
4955void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4956                           struct btrfs_root *root,
4957                           struct extent_buffer *buf,
4958                           u64 parent, int last_ref)
4959{
4960        struct btrfs_block_group_cache *cache = NULL;
4961        int ret;
4962
4963        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4964                ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
4965                                                parent, root->root_key.objectid,
4966                                                btrfs_header_level(buf),
4967                                                BTRFS_DROP_DELAYED_REF, NULL);
4968                BUG_ON(ret);
4969        }
4970
4971        if (!last_ref)
4972                return;
4973
4974        cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4975
4976        if (btrfs_header_generation(buf) == trans->transid) {
4977                if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4978                        ret = check_ref_cleanup(trans, root, buf->start);
4979                        if (!ret)
4980                                goto out;
4981                }
4982
4983                if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4984                        pin_down_extent(root, cache, buf->start, buf->len, 1);
4985                        goto out;
4986                }
4987
4988                WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4989
4990                btrfs_add_free_space(cache, buf->start, buf->len);
4991                btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
4992        }
4993out:
4994        /*
4995         * Deleting the buffer, clear the corrupt flag since it doesn't matter
4996         * anymore.
4997         */
4998        clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
4999        btrfs_put_block_group(cache);
5000}
5001
5002int btrfs_free_extent(struct btrfs_trans_handle *trans,
5003                      struct btrfs_root *root,
5004                      u64 bytenr, u64 num_bytes, u64 parent,
5005                      u64 root_objectid, u64 owner, u64 offset)
5006{
5007        int ret;
5008
5009        /*
5010         * tree log blocks never actually go into the extent allocation
5011         * tree, just update pinning info and exit early.
5012         */
5013        if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
5014                WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
5015                /* unlocks the pinned mutex */
5016                btrfs_pin_extent(root, bytenr, num_bytes, 1);
5017                ret = 0;
5018        } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5019                ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
5020                                        parent, root_objectid, (int)owner,
5021                                        BTRFS_DROP_DELAYED_REF, NULL);
5022                BUG_ON(ret);
5023        } else {
5024                ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
5025                                        parent, root_objectid, owner,
5026                                        offset, BTRFS_DROP_DELAYED_REF, NULL);
5027                BUG_ON(ret);
5028        }
5029        return ret;
5030}
5031
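/*
 * Round val up to the next stripe boundary.  The mask arithmetic
 * assumes root->stripesize is a power of two; e.g. with a 4096-byte
 * stripesize, val = 5000 is rounded up to 8192.
 */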
5032static u64 stripe_align(struct btrfs_root *root, u64 val)
5033{
5034        u64 mask = ((u64)root->stripesize - 1);
5035        u64 ret = (val + mask) & ~mask;
5036        return ret;
5037}
5038
5039/*
5040 * when we wait for progress in the block group caching, it's because
5041 * our allocation attempt failed at least once.  So, we must sleep
5042 * and let some progress happen before we try again.
5043 *
5044 * This function will sleep at least once waiting for new free space to
5045 * show up, and then it will check the block group free space numbers
5046 * for our min num_bytes.  Another option is to have it go ahead
5047 * and look in the rbtree for a free extent of a given size, but this
5048 * is a good start.
5049 */
5050static noinline int
5051wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
5052                                u64 num_bytes)
5053{
5054        struct btrfs_caching_control *caching_ctl;
5055        DEFINE_WAIT(wait);
5056
5057        caching_ctl = get_caching_control(cache);
5058        if (!caching_ctl)
5059                return 0;
5060
5061        wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
5062                   (cache->free_space_ctl->free_space >= num_bytes));
5063
5064        put_caching_control(caching_ctl);
5065        return 0;
5066}
5067
5068static noinline int
5069wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5070{
5071        struct btrfs_caching_control *caching_ctl;
5072        DEFINE_WAIT(wait);
5073
5074        caching_ctl = get_caching_control(cache);
5075        if (!caching_ctl)
5076                return 0;
5077
5078        wait_event(caching_ctl->wait, block_group_cache_done(cache));
5079
5080        put_caching_control(caching_ctl);
5081        return 0;
5082}
5083
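/*
 * Maps a block group's raid flags to the index used for
 * space_info->block_groups[]: RAID10 -> 0, RAID1 -> 1, DUP -> 2,
 * RAID0 -> 3, single -> 4.
 */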
5084static int get_block_group_index(struct btrfs_block_group_cache *cache)
5085{
5086        int index;
5087        if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
5088                index = 0;
5089        else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
5090                index = 1;
5091        else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
5092                index = 2;
5093        else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
5094                index = 3;
5095        else
5096                index = 4;
5097        return index;
5098}
5099
5100enum btrfs_loop_type {
5101        LOOP_FIND_IDEAL = 0,
5102        LOOP_CACHING_NOWAIT = 1,
5103        LOOP_CACHING_WAIT = 2,
5104        LOOP_ALLOC_CHUNK = 3,
5105        LOOP_NO_EMPTY_SIZE = 4,
5106};
5107
5108/*
5109 * walks the btree of allocated extents and finds a hole of a given size.
5110 * The key ins is changed to record the hole:
5111 * ins->objectid == block start
5112 * ins->type == BTRFS_EXTENT_ITEM_KEY
5113 * ins->offset == number of blocks
5114 * Any available blocks before search_start are skipped.
5115 */
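/*
 * Example of a successful result: a 1MB allocation found at byte
 * 1073741824 comes back as ins->objectid == 1073741824,
 * ins->type == BTRFS_EXTENT_ITEM_KEY, ins->offset == 1048576.
 */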
5116static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5117                                     struct btrfs_root *orig_root,
5118                                     u64 num_bytes, u64 empty_size,
5119                                     u64 search_start, u64 search_end,
5120                                     u64 hint_byte, struct btrfs_key *ins,
5121                                     u64 data)
5122{
5123        int ret = 0;
5124        struct btrfs_root *root = orig_root->fs_info->extent_root;
5125        struct btrfs_free_cluster *last_ptr = NULL;
5126        struct btrfs_block_group_cache *block_group = NULL;
5127        struct btrfs_block_group_cache *used_block_group;
5128        int empty_cluster = 2 * 1024 * 1024;
5129        int allowed_chunk_alloc = 0;
5130        int done_chunk_alloc = 0;
5131        struct btrfs_space_info *space_info;
5132        int loop = 0;
5133        int index = 0;
5134        int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5135                RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
5136        bool found_uncached_bg = false;
5137        bool failed_cluster_refill = false;
5138        bool failed_alloc = false;
5139        bool use_cluster = true;
5140        bool have_caching_bg = false;
5141        u64 ideal_cache_percent = 0;
5142        u64 ideal_cache_offset = 0;
5143
5144        WARN_ON(num_bytes < root->sectorsize);
5145        btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
5146        ins->objectid = 0;
5147        ins->offset = 0;
5148
5149        space_info = __find_space_info(root->fs_info, data);
5150        if (!space_info) {
5151                printk(KERN_ERR "No space info for %llu\n", data);
5152                return -ENOSPC;
5153        }
5154
5155        /*
5156         * If the space info is for both data and metadata it means we have a
5157         * small filesystem and we can't use the clustering stuff.
5158         */
5159        if (btrfs_mixed_space_info(space_info))
5160                use_cluster = false;
5161
5162        if (orig_root->ref_cows || empty_size)
5163                allowed_chunk_alloc = 1;
5164
5165        if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
5166                last_ptr = &root->fs_info->meta_alloc_cluster;
5167                if (!btrfs_test_opt(root, SSD))
5168                        empty_cluster = 64 * 1024;
5169        }
5170
5171        if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
5172            btrfs_test_opt(root, SSD)) {
5173                last_ptr = &root->fs_info->data_alloc_cluster;
5174        }
5175
5176        if (last_ptr) {
5177                spin_lock(&last_ptr->lock);
5178                if (last_ptr->block_group)
5179                        hint_byte = last_ptr->window_start;
5180                spin_unlock(&last_ptr->lock);
5181        }
5182
5183        search_start = max(search_start, first_logical_byte(root, 0));
5184        search_start = max(search_start, hint_byte);
5185
5186        if (!last_ptr)
5187                empty_cluster = 0;
5188
5189        if (search_start == hint_byte) {
5190ideal_cache:
5191                block_group = btrfs_lookup_block_group(root->fs_info,
5192                                                       search_start);
5193                used_block_group = block_group;
5194                /*
5195                 * we don't want to use the block group if it doesn't match our
5196                 * allocation bits, or if it's not cached.
5197                 *
5198                 * However if we are re-searching with an ideal block group
5199                 * picked out then we don't care whether the block group is cached.
5200                 */
5201                if (block_group && block_group_bits(block_group, data) &&
5202                    (block_group->cached != BTRFS_CACHE_NO ||
5203                     search_start == ideal_cache_offset)) {
5204                        down_read(&space_info->groups_sem);
5205                        if (list_empty(&block_group->list) ||
5206                            block_group->ro) {
5207                                /*
5208                                 * someone is removing this block group,
5209                                 * we can't jump into the have_block_group
5210                                 * target because our list pointers are not
5211                                 * valid
5212                                 */
5213                                btrfs_put_block_group(block_group);
5214                                up_read(&space_info->groups_sem);
5215                        } else {
5216                                index = get_block_group_index(block_group);
5217                                goto have_block_group;
5218                        }
5219                } else if (block_group) {
5220                        btrfs_put_block_group(block_group);
5221                }
5222        }
5223search:
5224        have_caching_bg = false;
5225        down_read(&space_info->groups_sem);
5226        list_for_each_entry(block_group, &space_info->block_groups[index],
5227                            list) {
5228                u64 offset;
5229                int cached;
5230
5231                used_block_group = block_group;
5232                btrfs_get_block_group(block_group);
5233                search_start = block_group->key.objectid;
5234
5235                /*
5236                 * this can happen if we end up cycling through all the
5237                 * raid types, but we want to make sure we only allocate
5238                 * for the proper type.
5239                 */
5240                if (!block_group_bits(block_group, data)) {
5241                        u64 extra = BTRFS_BLOCK_GROUP_DUP |
5242                                    BTRFS_BLOCK_GROUP_RAID1 |
5243                                    BTRFS_BLOCK_GROUP_RAID10;
5244
5245                        /*
5246                         * if they asked for extra copies and this block group
5247                         * doesn't provide them, bail.  This does allow us to
5248                         * fill raid0 from raid1.
5249                         */
5250                        if ((data & extra) && !(block_group->flags & extra))
5251                                goto loop;
5252                }
5253
5254have_block_group:
5255                cached = block_group_cache_done(block_group);
5256                if (unlikely(!cached)) {
5257                        u64 free_percent;
5258
5259                        found_uncached_bg = true;
5260                        ret = cache_block_group(block_group, trans,
5261                                                orig_root, 1);
5262                        if (block_group->cached == BTRFS_CACHE_FINISHED)
5263                                goto alloc;
5264
5265                        free_percent = btrfs_block_group_used(&block_group->item);
5266                        free_percent *= 100;
5267                        free_percent = div64_u64(free_percent,
5268                                                 block_group->key.offset);
5269                        free_percent = 100 - free_percent;
5270                        if (free_percent > ideal_cache_percent &&
5271                            likely(!block_group->ro)) {
5272                                ideal_cache_offset = block_group->key.objectid;
5273                                ideal_cache_percent = free_percent;
5274                        }
5275
5276                        /*
5277                         * The caching workers are limited to 2 threads, so we
5278                         * can queue as much work as we care to.
5279                         */
5280                        if (loop > LOOP_FIND_IDEAL) {
5281                                ret = cache_block_group(block_group, trans,
5282                                                        orig_root, 0);
5283                                BUG_ON(ret);
5284                        }
5285
5286                        /*
5287                         * If loop is set for cached only, try the next block
5288                         * group.
5289                         */
5290                        if (loop == LOOP_FIND_IDEAL)
5291                                goto loop;
5292                }
5293
5294alloc:
5295                if (unlikely(block_group->ro))
5296                        goto loop;
5297
5298                spin_lock(&block_group->free_space_ctl->tree_lock);
5299                if (cached &&
5300                    block_group->free_space_ctl->free_space <
5301                    num_bytes + empty_cluster + empty_size) {
5302                        spin_unlock(&block_group->free_space_ctl->tree_lock);
5303                        goto loop;
5304                }
5305                spin_unlock(&block_group->free_space_ctl->tree_lock);
5306
5307                /*
5308                 * Ok we want to try and use the cluster allocator, so
5309                 * let's look there
5310                 */
5311                if (last_ptr) {
5312                        /*
5313                         * the refill lock keeps out other
5314                         * people trying to start a new cluster
5315                         */
5316                        spin_lock(&last_ptr->refill_lock);
5317                        used_block_group = last_ptr->block_group;
5318                        if (used_block_group != block_group &&
5319                            (!used_block_group ||
5320                             used_block_group->ro ||
5321                             !block_group_bits(used_block_group, data))) {
5322                                used_block_group = block_group;
5323                                goto refill_cluster;
5324                        }
5325
5326                        if (used_block_group != block_group)
5327                                btrfs_get_block_group(used_block_group);
5328
5329                        offset = btrfs_alloc_from_cluster(used_block_group,
5330                          last_ptr, num_bytes, used_block_group->key.objectid);
5331                        if (offset) {
5332                                /* we have a block, we're done */
5333                                spin_unlock(&last_ptr->refill_lock);
5334                                goto checks;
5335                        }
5336
5337                        WARN_ON(last_ptr->block_group != used_block_group);
5338                        if (used_block_group != block_group) {
5339                                btrfs_put_block_group(used_block_group);
5340                                used_block_group = block_group;
5341                        }
5342refill_cluster:
5343                        BUG_ON(used_block_group != block_group);
5344                        /* If we are on LOOP_NO_EMPTY_SIZE, we can't
5345                         * set up a new cluster, so let's just skip it
5346                         * and let the allocator find whatever block
5347                         * it can find.  If we reach this point, we
5348                         * will have tried the cluster allocator
5349                         * plenty of times and not have found
5350                         * anything, so we are likely way too
5351                         * fragmented for the clustering stuff to find
5352                         * anything.  */
5353                        if (loop >= LOOP_NO_EMPTY_SIZE) {
5354                                spin_unlock(&last_ptr->refill_lock);
5355                                goto unclustered_alloc;
5356                        }
5357
5358                        /*
5359                         * this cluster didn't work out, free it and
5360                         * start over
5361                         */
5362                        btrfs_return_cluster_to_free_space(NULL, last_ptr);
5363
5364                        /* allocate a cluster in this block group */
5365                        ret = btrfs_find_space_cluster(trans, root,
5366                                               block_group, last_ptr,
5367                                               search_start, num_bytes,
5368                                               empty_cluster + empty_size);
5369                        if (ret == 0) {
5370                                /*
5371                                 * now pull our allocation out of this
5372                                 * cluster
5373                                 */
5374                                offset = btrfs_alloc_from_cluster(block_group,
5375                                                  last_ptr, num_bytes,
5376                                                  search_start);
5377                                if (offset) {
5378                                        /* we found one, proceed */
5379                                        spin_unlock(&last_ptr->refill_lock);
5380                                        goto checks;
5381                                }
5382                        } else if (!cached && loop > LOOP_CACHING_NOWAIT
5383                                   && !failed_cluster_refill) {
5384                                spin_unlock(&last_ptr->refill_lock);
5385
5386                                failed_cluster_refill = true;
5387                                wait_block_group_cache_progress(block_group,
5388                                       num_bytes + empty_cluster + empty_size);
5389                                goto have_block_group;
5390                        }
5391
5392                        /*
5393                         * at this point we either didn't find a cluster
5394                         * or we weren't able to allocate a block from our
5395                         * cluster.  Free the cluster we've been trying
5396                         * to use, and go to the next block group
5397                         */
5398                        btrfs_return_cluster_to_free_space(NULL, last_ptr);
5399                        spin_unlock(&last_ptr->refill_lock);
5400                        goto loop;
5401                }
5402
5403unclustered_alloc:
5404                offset = btrfs_find_space_for_alloc(block_group, search_start,
5405                                                    num_bytes, empty_size);
5406                /*
5407                 * If we didn't find a chunk, and we haven't failed on this
5408                 * block group before, and this block group is in the middle of
5409                 * caching and we are ok with waiting, then go ahead and wait
5410                 * for progress to be made, and set failed_alloc to true.
5411                 *
5412                 * If failed_alloc is true then we've already waited on this
5413                 * block group once and should move on to the next block group.
5414                 */
5415                if (!offset && !failed_alloc && !cached &&
5416                    loop > LOOP_CACHING_NOWAIT) {
5417                        wait_block_group_cache_progress(block_group,
5418                                                num_bytes + empty_size);
5419                        failed_alloc = true;
5420                        goto have_block_group;
5421                } else if (!offset) {
5422                        if (!cached)
5423                                have_caching_bg = true;
5424                        goto loop;
5425                }
5426checks:
5427                search_start = stripe_align(root, offset);
5428                /* move on to the next group */
5429                if (search_start + num_bytes >= search_end) {
5430                        btrfs_add_free_space(used_block_group, offset, num_bytes);
5431                        goto loop;
5432                }
5433
5434                /* move on to the next group */
5435                if (search_start + num_bytes >
5436                    used_block_group->key.objectid + used_block_group->key.offset) {
5437                        btrfs_add_free_space(used_block_group, offset, num_bytes);
5438                        goto loop;
5439                }
5440
5441                ins->objectid = search_start;
5442                ins->offset = num_bytes;
5443
5444                if (offset < search_start)
5445                        btrfs_add_free_space(used_block_group, offset,
5446                                             search_start - offset);
5447                BUG_ON(offset > search_start);
5448
5449                ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
5450                                                  alloc_type);
5451                if (ret == -EAGAIN) {
5452                        btrfs_add_free_space(used_block_group, offset, num_bytes);
5453                        goto loop;
5454                }
5455
5456                /* we are all good, let's return */
5457                ins->objectid = search_start;
5458                ins->offset = num_bytes;
5459
5460                if (offset < search_start)
5461                        btrfs_add_free_space(used_block_group, offset,
5462                                             search_start - offset);
5463                BUG_ON(offset > search_start);
5464                if (used_block_group != block_group)
5465                        btrfs_put_block_group(used_block_group);
5466                btrfs_put_block_group(block_group);
5467                break;
5468loop:
5469                failed_cluster_refill = false;
5470                failed_alloc = false;
5471                BUG_ON(index != get_block_group_index(block_group));
5472                if (used_block_group != block_group)
5473                        btrfs_put_block_group(used_block_group);
5474                btrfs_put_block_group(block_group);
5475        }
5476        up_read(&space_info->groups_sem);
5477
5478        if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5479                goto search;
5480
5481        if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5482                goto search;
5483
5484        /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait
5485         *                      for them to make caching progress.  Also
5486         *                      determine the best possible bg to cache
5487         * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
5488         *                      caching kthreads as we move along
5489         * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
5490         * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
5491         * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
5492         *                      again
5493         */
5494        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
5495                index = 0;
5496                if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
5497                        found_uncached_bg = false;
5498                        loop++;
5499                        if (!ideal_cache_percent)
5500                                goto search;
5501
5502                        /*
5503                         * 1 of the following 2 things has happened so far
5504                         *
5505                         * 1) We found an ideal block group for caching that
5506                         * is mostly full and will cache quickly, so we might
5507                         * as well wait for it.
5508                         *
5509                         * 2) We searched for cached only and we didn't find
5510                         * anything, and we didn't start any caching kthreads
5511                         * either, so chances are we will loop through and
5512                         * start a couple caching kthreads, and then come back
5513                         * around and just wait for them.  This will be slower
5514                         * because we will have 2 caching kthreads reading at
5515                         * the same time when we could have just started one
5516                         * and waited for it to get far enough to give us an
5517                         * allocation, so go ahead and go to the wait caching
5518                         * loop.
5519                         */
5520                        loop = LOOP_CACHING_WAIT;
5521                        search_start = ideal_cache_offset;
5522                        ideal_cache_percent = 0;
5523                        goto ideal_cache;
5524                } else if (loop == LOOP_FIND_IDEAL) {
5525                        /*
5526                         * Didn't find an uncached bg, wait on anything we find
5527                         * next.
5528                         */
5529                        loop = LOOP_CACHING_WAIT;
5530                        goto search;
5531                }
5532
5533                loop++;
5534
5535                if (loop == LOOP_ALLOC_CHUNK) {
5536                        if (allowed_chunk_alloc) {
5537                                ret = do_chunk_alloc(trans, root, num_bytes +
5538                                                     2 * 1024 * 1024, data,
5539                                                     CHUNK_ALLOC_LIMITED);
5540                                allowed_chunk_alloc = 0;
5541                                if (ret == 1)
5542                                        done_chunk_alloc = 1;
5543                        } else if (!done_chunk_alloc &&
5544                                   space_info->force_alloc ==
5545                                   CHUNK_ALLOC_NO_FORCE) {
5546                                space_info->force_alloc = CHUNK_ALLOC_LIMITED;
5547                        }
5548
5549                        /*
5550                         * We didn't allocate a chunk, go ahead and drop the
5551                         * empty size and loop again.
5552                         */
5553                        if (!done_chunk_alloc)
5554                                loop = LOOP_NO_EMPTY_SIZE;
5555                }
5556
5557                if (loop == LOOP_NO_EMPTY_SIZE) {
5558                        empty_size = 0;
5559                        empty_cluster = 0;
5560                }
5561
5562                goto search;
5563        } else if (!ins->objectid) {
5564                ret = -ENOSPC;
5565        } else if (ins->objectid) {
5566                ret = 0;
5567        }
5568
5569        return ret;
5570}
5571
5572static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5573                            int dump_block_groups)
5574{
5575        struct btrfs_block_group_cache *cache;
5576        int index = 0;
5577
5578        spin_lock(&info->lock);
5579        printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5580               (unsigned long long)info->flags,
5581               (unsigned long long)(info->total_bytes - info->bytes_used -
5582                                    info->bytes_pinned - info->bytes_reserved -
5583                                    info->bytes_readonly),
5584               (info->full) ? "" : "not ");
5585        printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
5586               "reserved=%llu, may_use=%llu, readonly=%llu\n",
5587               (unsigned long long)info->total_bytes,
5588               (unsigned long long)info->bytes_used,
5589               (unsigned long long)info->bytes_pinned,
5590               (unsigned long long)info->bytes_reserved,
5591               (unsigned long long)info->bytes_may_use,
5592               (unsigned long long)info->bytes_readonly);
5593        spin_unlock(&info->lock);
5594
5595        if (!dump_block_groups)
5596                return;
5597
5598        down_read(&info->groups_sem);
5599again:
5600        list_for_each_entry(cache, &info->block_groups[index], list) {
5601                spin_lock(&cache->lock);
5602                printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
5603                       "%llu pinned %llu reserved\n",
5604                       (unsigned long long)cache->key.objectid,
5605                       (unsigned long long)cache->key.offset,
5606                       (unsigned long long)btrfs_block_group_used(&cache->item),
5607                       (unsigned long long)cache->pinned,
5608                       (unsigned long long)cache->reserved);
5609                btrfs_dump_free_space(cache, bytes);
5610                spin_unlock(&cache->lock);
5611        }
5612        if (++index < BTRFS_NR_RAID_TYPES)
5613                goto again;
5614        up_read(&info->groups_sem);
5615}
5616
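/*
 * Reserve an extent of at least min_alloc_size bytes (and at most num_bytes)
 * for the given allocation profile.  The chosen start/length pair is returned
 * through ins->objectid and ins->offset.  On -ENOSPC the request is retried
 * with progressively smaller sizes, forcing a chunk allocation before each
 * retry, until min_alloc_size is reached.
 */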
5617int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
5618                         struct btrfs_root *root,
5619                         u64 num_bytes, u64 min_alloc_size,
5620                         u64 empty_size, u64 hint_byte,
5621                         u64 search_end, struct btrfs_key *ins,
5622                         u64 data)
5623{
5624        int ret;
5625        u64 search_start = 0;
5626
5627        data = btrfs_get_alloc_profile(root, data);
5628again:
5629        /*
5630         * the only place that sets empty_size is btrfs_realloc_node, which
5631         * is not called recursively on allocations
5632         */
5633        if (empty_size || root->ref_cows)
5634                ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5635                                     num_bytes + 2 * 1024 * 1024, data,
5636                                     CHUNK_ALLOC_NO_FORCE);
5637
5638        WARN_ON(num_bytes < root->sectorsize);
5639        ret = find_free_extent(trans, root, num_bytes, empty_size,
5640                               search_start, search_end, hint_byte,
5641                               ins, data);
5642
5643        if (ret == -ENOSPC && num_bytes > min_alloc_size) {
5644                num_bytes = num_bytes >> 1;
5645                num_bytes = num_bytes & ~(root->sectorsize - 1);
5646                num_bytes = max(num_bytes, min_alloc_size);
5647                do_chunk_alloc(trans, root->fs_info->extent_root,
5648                               num_bytes, data, CHUNK_ALLOC_FORCE);
5649                goto again;
5650        }
5651        if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
5652                struct btrfs_space_info *sinfo;
5653
5654                sinfo = __find_space_info(root->fs_info, data);
5655                printk(KERN_ERR "btrfs allocation failed flags %llu, "
5656                       "wanted %llu\n", (unsigned long long)data,
5657                       (unsigned long long)num_bytes);
5658                dump_space_info(sinfo, num_bytes, 1);
5659        }
5660
5661        trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
5662
5663        return ret;
5664}
5665
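/*
 * Give a reserved but unused extent back to the allocator.  If @pin is set
 * the range is pinned and only becomes usable again once the running
 * transaction commits; otherwise it goes straight back into the block
 * group's free space cache and the reservation is released.  A discard is
 * issued first when the DISCARD mount option is enabled.
 */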
5666static int __btrfs_free_reserved_extent(struct btrfs_root *root,
5667                                        u64 start, u64 len, int pin)
5668{
5669        struct btrfs_block_group_cache *cache;
5670        int ret = 0;
5671
5672        cache = btrfs_lookup_block_group(root->fs_info, start);
5673        if (!cache) {
5674                printk(KERN_ERR "Unable to find block group for %llu\n",
5675                       (unsigned long long)start);
5676                return -ENOSPC;
5677        }
5678
5679        if (btrfs_test_opt(root, DISCARD))
5680                ret = btrfs_discard_extent(root, start, len, NULL);
5681
5682        if (pin)
5683                pin_down_extent(root, cache, start, len, 1);
5684        else {
5685                btrfs_add_free_space(cache, start, len);
5686                btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
5687        }
5688        btrfs_put_block_group(cache);
5689
5690        trace_btrfs_reserved_extent_free(root, start, len);
5691
5692        return ret;
5693}
5694
5695int btrfs_free_reserved_extent(struct btrfs_root *root,
5696                                        u64 start, u64 len)
5697{
5698        return __btrfs_free_reserved_extent(root, start, len, 0);
5699}
5700
5701int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
5702                                       u64 start, u64 len)
5703{
5704        return __btrfs_free_reserved_extent(root, start, len, 1);
5705}
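/*
 * Illustrative sketch, not part of the original file: a caller that reserved
 * an extent but then fails before inserting it would typically undo the
 * reservation roughly like this (hypothetical caller, real signatures above):
 *
 *        struct btrfs_key ins;
 *        int ret;
 *
 *        ret = btrfs_reserve_extent(trans, root, num_bytes, root->sectorsize,
 *                                   0, 0, (u64)-1, &ins, 1);
 *        if (ret)
 *                return ret;
 *        if (insertion_failed)
 *                btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
 */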
5706
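/*
 * Insert the extent item for a freshly allocated data extent into the extent
 * tree.  An inline backref is embedded right after the item: a shared data
 * ref when @parent is set, otherwise a keyed extent data ref for
 * (root_objectid, owner, offset).  Block group accounting is updated via
 * update_block_group() once the item is in place.
 */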
5707static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5708                                      struct btrfs_root *root,
5709                                      u64 parent, u64 root_objectid,
5710                                      u64 flags, u64 owner, u64 offset,
5711                                      struct btrfs_key *ins, int ref_mod)
5712{
5713        int ret;
5714        struct btrfs_fs_info *fs_info = root->fs_info;
5715        struct btrfs_extent_item *extent_item;
5716        struct btrfs_extent_inline_ref *iref;
5717        struct btrfs_path *path;
5718        struct extent_buffer *leaf;
5719        int type;
5720        u32 size;
5721
5722        if (parent > 0)
5723                type = BTRFS_SHARED_DATA_REF_KEY;
5724        else
5725                type = BTRFS_EXTENT_DATA_REF_KEY;
5726
5727        size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
5728
5729        path = btrfs_alloc_path();
5730        if (!path)
5731                return -ENOMEM;
5732
5733        path->leave_spinning = 1;
5734        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
5735                                      ins, size);
5736        BUG_ON(ret);
5737
5738        leaf = path->nodes[0];
5739        extent_item = btrfs_item_ptr(leaf, path->slots[0],
5740                                     struct btrfs_extent_item);
5741        btrfs_set_extent_refs(leaf, extent_item, ref_mod);
5742        btrfs_set_extent_generation(leaf, extent_item, trans->transid);
5743        btrfs_set_extent_flags(leaf, extent_item,
5744                               flags | BTRFS_EXTENT_FLAG_DATA);
5745
5746        iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
5747        btrfs_set_extent_inline_ref_type(leaf, iref, type);
5748        if (parent > 0) {
5749                struct btrfs_shared_data_ref *ref;
5750                ref = (struct btrfs_shared_data_ref *)(iref + 1);
5751                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
5752                btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
5753        } else {
5754                struct btrfs_extent_data_ref *ref;
5755                ref = (struct btrfs_extent_data_ref *)(&iref->offset);
5756                btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
5757                btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
5758                btrfs_set_extent_data_ref_offset(leaf, ref, offset);
5759                btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
5760        }
5761
5762        btrfs_mark_buffer_dirty(path->nodes[0]);
5763        btrfs_free_path(path);
5764
5765        ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
5766        if (ret) {
5767                printk(KERN_ERR "btrfs update block group failed for %llu "
5768                       "%llu\n", (unsigned long long)ins->objectid,
5769                       (unsigned long long)ins->offset);
5770                BUG();
5771        }
5772        return ret;
5773}
5774
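/*
 * Insert the extent item for a newly allocated tree block.  The item is
 * followed by a btrfs_tree_block_info holding the first key and the level,
 * and by an inline backref: a shared block ref when @parent is set,
 * otherwise a tree block ref pointing back at @root_objectid.
 */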
5775static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
5776                                     struct btrfs_root *root,
5777                                     u64 parent, u64 root_objectid,
5778                                     u64 flags, struct btrfs_disk_key *key,
5779                                     int level, struct btrfs_key *ins)
5780{
5781        int ret;
5782        struct btrfs_fs_info *fs_info = root->fs_info;
5783        struct btrfs_extent_item *extent_item;
5784        struct btrfs_tree_block_info *block_info;
5785        struct btrfs_extent_inline_ref *iref;
5786        struct btrfs_path *path;
5787        struct extent_buffer *leaf;
5788        u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
5789
5790        path = btrfs_alloc_path();
5791        if (!path)
5792                return -ENOMEM;
5793
5794        path->leave_spinning = 1;
5795        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
5796                                      ins, size);
5797        BUG_ON(ret);
5798
5799        leaf = path->nodes[0];
5800        extent_item = btrfs_item_ptr(leaf, path->slots[0],
5801                                     struct btrfs_extent_item);
5802        btrfs_set_extent_refs(leaf, extent_item, 1);
5803        btrfs_set_extent_generation(leaf, extent_item, trans->transid);
5804        btrfs_set_extent_flags(leaf, extent_item,
5805                               flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
5806        block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
5807
5808        btrfs_set_tree_block_key(leaf, block_info, key);
5809        btrfs_set_tree_block_level(leaf, block_info, level);
5810
5811        iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
5812        if (parent > 0) {
5813                BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
5814                btrfs_set_extent_inline_ref_type(leaf, iref,
5815                                                 BTRFS_SHARED_BLOCK_REF_KEY);
5816                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
5817        } else {
5818                btrfs_set_extent_inline_ref_type(leaf, iref,
5819                                                 BTRFS_TREE_BLOCK_REF_KEY);
5820                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
5821        }
5822
5823        btrfs_mark_buffer_dirty(leaf);
5824        btrfs_free_path(path);
5825
5826        ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
5827        if (ret) {
5828                printk(KERN_ERR "btrfs update block group failed for %llu "
5829                       "%llu\n", (unsigned long long)ins->objectid,
5830                       (unsigned long long)ins->offset);
5831                BUG();
5832        }
5833        return ret;
5834}
5835
5836int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5837                                     struct btrfs_root *root,
5838                                     u64 root_objectid, u64 owner,
5839                                     u64 offset, struct btrfs_key *ins)
5840{
5841        int ret;
5842
5843        BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
5844
5845        ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
5846                                         0, root_objectid, owner, offset,
5847                                         BTRFS_ADD_DELAYED_EXTENT, NULL);
5848        return ret;
5849}
5850
5851/*
5852 * this is used by the tree logging recovery code.  It records that
5853 * an extent has been allocated and makes sure to clear the free
5854 * space cache bits as well
5855 */
5856int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5857                                   struct btrfs_root *root,
5858                                   u64 root_objectid, u64 owner, u64 offset,
5859                                   struct btrfs_key *ins)
5860{
5861        int ret;
5862        struct btrfs_block_group_cache *block_group;
5863        struct btrfs_caching_control *caching_ctl;
5864        u64 start = ins->objectid;
5865        u64 num_bytes = ins->offset;
5866
5867        block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
5868        cache_block_group(block_group, trans, NULL, 0);
5869        caching_ctl = get_caching_control(block_group);
5870
5871        if (!caching_ctl) {
5872                BUG_ON(!block_group_cache_done(block_group));
5873                ret = btrfs_remove_free_space(block_group, start, num_bytes);
5874                BUG_ON(ret);
5875        } else {
5876                mutex_lock(&caching_ctl->mutex);
5877
5878                if (start >= caching_ctl->progress) {
5879                        ret = add_excluded_extent(root, start, num_bytes);
5880                        BUG_ON(ret);
5881                } else if (start + num_bytes <= caching_ctl->progress) {
5882                        ret = btrfs_remove_free_space(block_group,
5883                                                      start, num_bytes);
5884                        BUG_ON(ret);
5885                } else {
5886                        num_bytes = caching_ctl->progress - start;
5887                        ret = btrfs_remove_free_space(block_group,
5888                                                      start, num_bytes);
5889                        BUG_ON(ret);
5890
5891                        start = caching_ctl->progress;
5892                        num_bytes = ins->objectid + ins->offset -
5893                                    caching_ctl->progress;
5894                        ret = add_excluded_extent(root, start, num_bytes);
5895                        BUG_ON(ret);
5896                }
5897
5898                mutex_unlock(&caching_ctl->mutex);
5899                put_caching_control(caching_ctl);
5900        }
5901
5902        ret = btrfs_update_reserved_bytes(block_group, ins->offset,
5903                                          RESERVE_ALLOC_NO_ACCOUNT);
5904        BUG_ON(ret);
5905        btrfs_put_block_group(block_group);
5906        ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
5907                                         0, owner, offset, ins, 1);
5908        return ret;
5909}
5910
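/*
 * Set up the in-memory buffer for a freshly allocated tree block: stamp the
 * transaction id, take the tree lock, and mark the buffer dirty in the right
 * io tree (dirty_log_pages for log tree blocks, the transaction's dirty_pages
 * otherwise).  The buffer is returned locked for blocking.
 */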
5911struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
5912                                            struct btrfs_root *root,
5913                                            u64 bytenr, u32 blocksize,
5914                                            int level)
5915{
5916        struct extent_buffer *buf;
5917
5918        buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
5919        if (!buf)
5920                return ERR_PTR(-ENOMEM);
5921        btrfs_set_header_generation(buf, trans->transid);
5922        btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
5923        btrfs_tree_lock(buf);
5924        clean_tree_block(trans, root, buf);
5925
5926        btrfs_set_lock_blocking(buf);
5927        btrfs_set_buffer_uptodate(buf);
5928
5929        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
5930                /*
5931                 * we allow two log transactions at a time, use different
5932                 * EXTENT bits to differentiate dirty pages.
5933                 */
5934                if (root->log_transid % 2 == 0)
5935                        set_extent_dirty(&root->dirty_log_pages, buf->start,
5936                                        buf->start + buf->len - 1, GFP_NOFS);
5937                else
5938                        set_extent_new(&root->dirty_log_pages, buf->start,
5939                                        buf->start + buf->len - 1, GFP_NOFS);
5940        } else {
5941                set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
5942                         buf->start + buf->len - 1, GFP_NOFS);
5943        }
5944        trans->blocks_used++;
5945        /* this returns a buffer locked for blocking */
5946        return buf;
5947}
5948
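/*
 * Pick the block reserve backing this allocation and carve @blocksize bytes
 * out of it.  If the reserve is empty or exhausted, fall back to a fresh
 * metadata reservation and finally to the global reserve; ERR_PTR(-ENOSPC)
 * is returned when nothing can satisfy the request.
 */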
5949static struct btrfs_block_rsv *
5950use_block_rsv(struct btrfs_trans_handle *trans,
5951              struct btrfs_root *root, u32 blocksize)
5952{
5953        struct btrfs_block_rsv *block_rsv;
5954        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5955        int ret;
5956
5957        block_rsv = get_block_rsv(trans, root);
5958
5959        if (block_rsv->size == 0) {
5960                ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5961                /*
5962                 * If we couldn't reserve metadata bytes try and use some from
5963                 * the global reserve.
5964                 */
5965                if (ret && block_rsv != global_rsv) {
5966                        ret = block_rsv_use_bytes(global_rsv, blocksize);
5967                        if (!ret)
5968                                return global_rsv;
5969                        return ERR_PTR(ret);
5970                } else if (ret) {
5971                        return ERR_PTR(ret);
5972                }
5973                return block_rsv;
5974        }
5975
5976        ret = block_rsv_use_bytes(block_rsv, blocksize);
5977        if (!ret)
5978                return block_rsv;
5979        if (ret) {
5980                static DEFINE_RATELIMIT_STATE(_rs,
5981                                DEFAULT_RATELIMIT_INTERVAL,
5982                                /*DEFAULT_RATELIMIT_BURST*/ 2);
5983                if (__ratelimit(&_rs)) {
5984                        printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
5985                        WARN_ON(1);
5986                }
5987                ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5988                if (!ret) {
5989                        return block_rsv;
5990                } else if (ret && block_rsv != global_rsv) {
5991                        ret = block_rsv_use_bytes(global_rsv, blocksize);
5992                        if (!ret)
5993                                return global_rsv;
5994                }
5995        }
5996
5997        return ERR_PTR(-ENOSPC);
5998}
5999
6000static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
6001{
6002        block_rsv_add_bytes(block_rsv, blocksize, 0);
6003        block_rsv_release_bytes(block_rsv, NULL, 0);
6004}
6005
6006/*
6007 * finds a free extent and does all the dirty work required for allocation.
6008 * returns the key for the extent through ins, and a locked tree buffer for
6009 * the first block of the extent.
6010 *
6011 * returns the tree buffer or an ERR_PTR on failure.
6012 */
6013struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6014                                        struct btrfs_root *root, u32 blocksize,
6015                                        u64 parent, u64 root_objectid,
6016                                        struct btrfs_disk_key *key, int level,
6017                                        u64 hint, u64 empty_size)
6018{
6019        struct btrfs_key ins;
6020        struct btrfs_block_rsv *block_rsv;
6021        struct extent_buffer *buf;
6022        u64 flags = 0;
6023        int ret;
6024
6025
6026        block_rsv = use_block_rsv(trans, root, blocksize);
6027        if (IS_ERR(block_rsv))
6028                return ERR_CAST(block_rsv);
6029
6030        ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
6031                                   empty_size, hint, (u64)-1, &ins, 0);
6032        if (ret) {
6033                unuse_block_rsv(block_rsv, blocksize);
6034                return ERR_PTR(ret);
6035        }
6036
6037        buf = btrfs_init_new_buffer(trans, root, ins.objectid,
6038                                    blocksize, level);
6039        BUG_ON(IS_ERR(buf));
6040
6041        if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
6042                if (parent == 0)
6043                        parent = ins.objectid;
6044                flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
6045        } else
6046                BUG_ON(parent > 0);
6047
6048        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
6049                struct btrfs_delayed_extent_op *extent_op;
6050                extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
6051                BUG_ON(!extent_op);
6052                if (key)
6053                        memcpy(&extent_op->key, key, sizeof(extent_op->key));
6054                else
6055                        memset(&extent_op->key, 0, sizeof(extent_op->key));
6056                extent_op->flags_to_set = flags;
6057                extent_op->update_key = 1;
6058                extent_op->update_flags = 1;
6059                extent_op->is_data = 0;
6060
6061                ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
6062                                        ins.offset, parent, root_objectid,
6063                                        level, BTRFS_ADD_DELAYED_EXTENT,
6064                                        extent_op);
6065                BUG_ON(ret);
6066        }
6067        return buf;
6068}
6069
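/*
 * Book-keeping for walking a subtree while dropping a snapshot or subtree.
 * refs[] and flags[] cache the extent ref count and flags of the node held
 * at each level, stage is either DROP_REFERENCE or UPDATE_BACKREF, and
 * update_progress records the key up to which backrefs have already been
 * brought up to date so the walk can resume after switching stages.
 */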
6070struct walk_control {
6071        u64 refs[BTRFS_MAX_LEVEL];
6072        u64 flags[BTRFS_MAX_LEVEL];
6073        struct btrfs_key update_progress;
6074        int stage;
6075        int level;
6076        int shared_level;
6077        int update_ref;
6078        int keep_locks;
6079        int reada_slot;
6080        int reada_count;
6081};
6082
6083#define DROP_REFERENCE  1
6084#define UPDATE_BACKREF  2
6085
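/*
 * Issue readahead for the child blocks that are about to be visited below
 * the current node.  The readahead window (wc->reada_count) shrinks while
 * previously issued readahead is still ahead of the walk and grows again
 * once the walk catches up, bounded by the number of pointers per block.
 */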
6086static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
6087                                     struct btrfs_root *root,
6088                                     struct walk_control *wc,
6089                                     struct btrfs_path *path)
6090{
6091        u64 bytenr;
6092        u64 generation;
6093        u64 refs;
6094        u64 flags;
6095        u32 nritems;
6096        u32 blocksize;
6097        struct btrfs_key key;
6098        struct extent_buffer *eb;
6099        int ret;
6100        int slot;
6101        int nread = 0;
6102
6103        if (path->slots[wc->level] < wc->reada_slot) {
6104                wc->reada_count = wc->reada_count * 2 / 3;
6105                wc->reada_count = max(wc->reada_count, 2);
6106        } else {
6107                wc->reada_count = wc->reada_count * 3 / 2;
6108                wc->reada_count = min_t(int, wc->reada_count,
6109                                        BTRFS_NODEPTRS_PER_BLOCK(root));
6110        }
6111
6112        eb = path->nodes[wc->level];
6113        nritems = btrfs_header_nritems(eb);
6114        blocksize = btrfs_level_size(root, wc->level - 1);
6115
6116        for (slot = path->slots[wc->level]; slot < nritems; slot++) {
6117                if (nread >= wc->reada_count)
6118                        break;
6119
6120                cond_resched();
6121                bytenr = btrfs_node_blockptr(eb, slot);
6122                generation = btrfs_node_ptr_generation(eb, slot);
6123
6124                if (slot == path->slots[wc->level])
6125                        goto reada;
6126
6127                if (wc->stage == UPDATE_BACKREF &&
6128                    generation <= root->root_key.offset)
6129                        continue;
6130
6131                /* We don't lock the tree block, it's OK to be racy here */
6132                ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6133                                               &refs, &flags);
6134                BUG_ON(ret);
6135                BUG_ON(refs == 0);
6136
6137                if (wc->stage == DROP_REFERENCE) {
6138                        if (refs == 1)
6139                                goto reada;
6140
6141                        if (wc->level == 1 &&
6142                            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6143                                continue;
6144                        if (!wc->update_ref ||
6145                            generation <= root->root_key.offset)
6146                                continue;
6147                        btrfs_node_key_to_cpu(eb, &key, slot);
6148                        ret = btrfs_comp_cpu_keys(&key,
6149                                                  &wc->update_progress);
6150                        if (ret < 0)
6151                                continue;
6152                } else {
6153                        if (wc->level == 1 &&
6154                            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6155                                continue;
6156                }
6157reada:
6158                ret = readahead_tree_block(root, bytenr, blocksize,
6159                                           generation);
6160                if (ret)
6161                        break;
6162                nread++;
6163        }
6164        wc->reada_slot = slot;
6165}
6166
6167/*
6168 * helper to process tree block while walking down the tree.
6169 *
6170 * when wc->stage == UPDATE_BACKREF, this function updates
6171 * back refs for pointers in the block.
6172 *
6173 * NOTE: return value 1 means we should stop walking down.
6174 */
6175static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6176                                   struct btrfs_root *root,
6177                                   struct btrfs_path *path,
6178                                   struct walk_control *wc, int lookup_info)
6179{
6180        int level = wc->level;
6181        struct extent_buffer *eb = path->nodes[level];
6182        u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6183        int ret;
6184
6185        if (wc->stage == UPDATE_BACKREF &&
6186            btrfs_header_owner(eb) != root->root_key.objectid)
6187                return 1;
6188
6189        /*
6190         * when the reference count of a tree block is 1, it won't increase
6191         * again. once the full backref flag is set, we never clear it.
6192         */
6193        if (lookup_info &&
6194            ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
6195             (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
6196                BUG_ON(!path->locks[level]);
6197                ret = btrfs_lookup_extent_info(trans, root,
6198                                               eb->start, eb->len,
6199                                               &wc->refs[level],
6200                                               &wc->flags[level]);
6201                BUG_ON(ret);
6202                BUG_ON(wc->refs[level] == 0);
6203        }
6204
6205        if (wc->stage == DROP_REFERENCE) {
6206                if (wc->refs[level] > 1)
6207                        return 1;
6208
6209                if (path->locks[level] && !wc->keep_locks) {
6210                        btrfs_tree_unlock_rw(eb, path->locks[level]);
6211                        path->locks[level] = 0;
6212                }
6213                return 0;
6214        }
6215
6216        /* wc->stage == UPDATE_BACKREF */
6217        if (!(wc->flags[level] & flag)) {
6218                BUG_ON(!path->locks[level]);
6219                ret = btrfs_inc_ref(trans, root, eb, 1);
6220                BUG_ON(ret);
6221                ret = btrfs_dec_ref(trans, root, eb, 0);
6222                BUG_ON(ret);
6223                ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
6224                                                  eb->len, flag, 0);
6225                BUG_ON(ret);
6226                wc->flags[level] |= flag;
6227        }
6228
6229        /*
6230         * the block is shared by multiple trees, so it's not good to
6231         * keep the tree lock
6232         */
6233        if (path->locks[level] && level > 0) {
6234                btrfs_tree_unlock_rw(eb, path->locks[level]);
6235                path->locks[level] = 0;
6236        }
6237        return 0;
6238}
6239
6240/*
6241 * helper to process tree block pointer.
6242 *
6243 * when wc->stage == DROP_REFERENCE, this function checks
6244 * reference count of the block pointed to. if the block
6245 * is shared and we need update back refs for the subtree
6246 * rooted at the block, this function changes wc->stage to
6247 * UPDATE_BACKREF. if the block is shared and there is no
6248 * need to update back, this function drops the reference
6249 * to the block.
6250 *
6251 * NOTE: return value 1 means we should stop walking down.
6252 */
6253static noinline int do_walk_down(struct btrfs_trans_handle *trans,
6254                                 struct btrfs_root *root,
6255                                 struct btrfs_path *path,
6256                                 struct walk_control *wc, int *lookup_info)
6257{
6258        u64 bytenr;
6259        u64 generation;
6260        u64 parent;
6261        u32 blocksize;
6262        struct btrfs_key key;
6263        struct extent_buffer *next;
6264        int level = wc->level;
6265        int reada = 0;
6266        int ret = 0;
6267
6268        generation = btrfs_node_ptr_generation(path->nodes[level],
6269                                               path->slots[level]);
6270        /*
6271         * if the lower level block was created before the snapshot
6272         * was created, we know there is no need to update back refs
6273         * for the subtree
6274         */
6275        if (wc->stage == UPDATE_BACKREF &&
6276            generation <= root->root_key.offset) {
6277                *lookup_info = 1;
6278                return 1;
6279        }
6280
6281        bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
6282        blocksize = btrfs_level_size(root, level - 1);
6283
6284        next = btrfs_find_tree_block(root, bytenr, blocksize);
6285        if (!next) {
6286                next = btrfs_find_create_tree_block(root, bytenr, blocksize);
6287                if (!next)
6288                        return -ENOMEM;
6289                reada = 1;
6290        }
6291        btrfs_tree_lock(next);
6292        btrfs_set_lock_blocking(next);
6293
6294        ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6295                                       &wc->refs[level - 1],
6296                                       &wc->flags[level - 1]);
6297        BUG_ON(ret);
6298        BUG_ON(wc->refs[level - 1] == 0);
6299        *lookup_info = 0;
6300
6301        if (wc->stage == DROP_REFERENCE) {
6302                if (wc->refs[level - 1] > 1) {
6303                        if (level == 1 &&
6304                            (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6305                                goto skip;
6306
6307                        if (!wc->update_ref ||
6308                            generation <= root->root_key.offset)
6309                                goto skip;
6310
6311                        btrfs_node_key_to_cpu(path->nodes[level], &key,
6312                                              path->slots[level]);
6313                        ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
6314                        if (ret < 0)
6315                                goto skip;
6316
6317                        wc->stage = UPDATE_BACKREF;
6318                        wc->shared_level = level - 1;
6319                }
6320        } else {
6321                if (level == 1 &&
6322                    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6323                        goto skip;
6324        }
6325
6326        if (!btrfs_buffer_uptodate(next, generation)) {
6327                btrfs_tree_unlock(next);
6328                free_extent_buffer(next);
6329                next = NULL;
6330                *lookup_info = 1;
6331        }
6332
6333        if (!next) {
6334                if (reada && level == 1)
6335                        reada_walk_down(trans, root, wc, path);
6336                next = read_tree_block(root, bytenr, blocksize, generation);
6337                if (!next)
6338                        return -EIO;
6339                btrfs_tree_lock(next);
6340                btrfs_set_lock_blocking(next);
6341        }
6342
6343        level--;
6344        BUG_ON(level != btrfs_header_level(next));
6345        path->nodes[level] = next;
6346        path->slots[level] = 0;
6347        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6348        wc->level = level;
6349        if (wc->level == 1)
6350                wc->reada_slot = 0;
6351        return 0;
6352skip:
6353        wc->refs[level - 1] = 0;
6354        wc->flags[level - 1] = 0;
6355        if (wc->stage == DROP_REFERENCE) {
6356                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
6357                        parent = path->nodes[level]->start;
6358                } else {
6359                        BUG_ON(root->root_key.objectid !=
6360                               btrfs_header_owner(path->nodes[level]));
6361                        parent = 0;
6362                }
6363
6364                ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
6365                                        root->root_key.objectid, level - 1, 0);
6366                BUG_ON(ret);
6367        }
6368        btrfs_tree_unlock(next);
6369        free_extent_buffer(next);
6370        *lookup_info = 1;
6371        return 1;
6372}
6373
6374/*
6375 * helper to process tree block while walking up the tree.
6376 *
6377 * when wc->stage == DROP_REFERENCE, this function drops
6378 * reference count on the block.
6379 *
6380 * when wc->stage == UPDATE_BACKREF, this function changes
6381 * wc->stage back to DROP_REFERENCE if we changed wc->stage
6382 * to UPDATE_BACKREF previously while processing the block.
6383 *
6384 * NOTE: return value 1 means we should stop walking up.
6385 */
6386static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6387                                 struct btrfs_root *root,
6388                                 struct btrfs_path *path,
6389                                 struct walk_control *wc)
6390{
6391        int ret;
6392        int level = wc->level;
6393        struct extent_buffer *eb = path->nodes[level];
6394        u64 parent = 0;
6395
6396        if (wc->stage == UPDATE_BACKREF) {
6397                BUG_ON(wc->shared_level < level);
6398                if (level < wc->shared_level)
6399                        goto out;
6400
6401                ret = find_next_key(path, level + 1, &wc->update_progress);
6402                if (ret > 0)
6403                        wc->update_ref = 0;
6404
6405                wc->stage = DROP_REFERENCE;
6406                wc->shared_level = -1;
6407                path->slots[level] = 0;
6408
6409                /*
6410                 * check reference count again if the block isn't locked.
6411                 * we should start walking down the tree again if reference
6412                 * count is one.
6413                 */
6414                if (!path->locks[level]) {
6415                        BUG_ON(level == 0);
6416                        btrfs_tree_lock(eb);
6417                        btrfs_set_lock_blocking(eb);
6418                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6419
6420                        ret = btrfs_lookup_extent_info(trans, root,
6421                                                       eb->start, eb->len,
6422                                                       &wc->refs[level],
6423                                                       &wc->flags[level]);
6424                        BUG_ON(ret);
6425                        BUG_ON(wc->refs[level] == 0);
6426                        if (wc->refs[level] == 1) {
6427                                btrfs_tree_unlock_rw(eb, path->locks[level]);
6428                                return 1;
6429                        }
6430                }
6431        }
6432
6433        /* wc->stage == DROP_REFERENCE */
6434        BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
6435
6436        if (wc->refs[level] == 1) {
6437                if (level == 0) {
6438                        if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6439                                ret = btrfs_dec_ref(trans, root, eb, 1);
6440                        else
6441                                ret = btrfs_dec_ref(trans, root, eb, 0);
6442                        BUG_ON(ret);
6443                }
6444                /* make block locked assertion in clean_tree_block happy */
6445                if (!path->locks[level] &&
6446                    btrfs_header_generation(eb) == trans->transid) {
6447                        btrfs_tree_lock(eb);
6448                        btrfs_set_lock_blocking(eb);
6449                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6450                }
6451                clean_tree_block(trans, root, eb);
6452        }
6453
6454        if (eb == root->node) {
6455                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6456                        parent = eb->start;
6457                else
6458                        BUG_ON(root->root_key.objectid !=
6459                               btrfs_header_owner(eb));
6460        } else {
6461                if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6462                        parent = path->nodes[level + 1]->start;
6463                else
6464                        BUG_ON(root->root_key.objectid !=
6465                               btrfs_header_owner(path->nodes[level + 1]));
6466        }
6467
6468        btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
6469out:
6470        wc->refs[level] = 0;
6471        wc->flags[level] = 0;
6472        return 0;
6473}
6474
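/*
 * Walk from wc->level down towards the leaves, processing each node with
 * walk_down_proc() and descending through do_walk_down() until one of them
 * says to stop or a leaf is reached.
 */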
6475static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
6476                                   struct btrfs_root *root,
6477                                   struct btrfs_path *path,
6478                                   struct walk_control *wc)
6479{
6480        int level = wc->level;
6481        int lookup_info = 1;
6482        int ret;
6483
6484        while (level >= 0) {
6485                ret = walk_down_proc(trans, root, path, wc, lookup_info);
6486                if (ret > 0)
6487                        break;
6488
6489                if (level == 0)
6490                        break;
6491
6492                if (path->slots[level] >=
6493                    btrfs_header_nritems(path->nodes[level]))
6494                        break;
6495
6496                ret = do_walk_down(trans, root, path, wc, &lookup_info);
6497                if (ret > 0) {
6498                        path->slots[level]++;
6499                        continue;
6500                } else if (ret < 0)
6501                        return ret;
6502                level = wc->level;
6503        }
6504        return 0;
6505}
6506
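/*
 * Walk back up the tree, dropping our references on finished blocks via
 * walk_up_proc().  Returns 0 when there is a sibling left to descend into
 * and 1 once all levels below max_level have been finished.
 */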
6507static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6508                                 struct btrfs_root *root,
6509                                 struct btrfs_path *path,
6510                                 struct walk_control *wc, int max_level)
6511{
6512        int level = wc->level;
6513        int ret;
6514
6515        path->slots[level] = btrfs_header_nritems(path->nodes[level]);
6516        while (level < max_level && path->nodes[level]) {
6517                wc->level = level;
6518                if (path->slots[level] + 1 <
6519                    btrfs_header_nritems(path->nodes[level])) {
6520                        path->slots[level]++;
6521                        return 0;
6522                } else {
6523                        ret = walk_up_proc(trans, root, path, wc);
6524                        if (ret > 0)
6525                                return 0;
6526
6527                        if (path->locks[level]) {
6528                                btrfs_tree_unlock_rw(path->nodes[level],
6529                                                     path->locks[level]);
6530                                path->locks[level] = 0;
6531                        }
6532                        free_extent_buffer(path->nodes[level]);
6533                        path->nodes[level] = NULL;
6534                        level++;
6535                }
6536        }
6537        return 1;
6538}
6539
6540/*
6541 * drop a subvolume tree.
6542 *
6543 * this function traverses the tree freeing any blocks that are only
6544 * referenced by the tree.
6545 *
6546 * when a shared tree block is found, this function decreases its
6547 * reference count by one. if update_ref is true, this function
6548 * also makes sure backrefs for the shared block and all lower level
6549 * blocks are properly updated.
6550 */
6551void btrfs_drop_snapshot(struct btrfs_root *root,
6552                         struct btrfs_block_rsv *block_rsv, int update_ref)
6553{
6554        struct btrfs_path *path;
6555        struct btrfs_trans_handle *trans;
6556        struct btrfs_root *tree_root = root->fs_info->tree_root;
6557        struct btrfs_root_item *root_item = &root->root_item;
6558        struct walk_control *wc;
6559        struct btrfs_key key;
6560        int err = 0;
6561        int ret;
6562        int level;
6563
6564        path = btrfs_alloc_path();
6565        if (!path) {
6566                err = -ENOMEM;
6567                goto out;
6568        }
6569
6570        wc = kzalloc(sizeof(*wc), GFP_NOFS);
6571        if (!wc) {
6572                btrfs_free_path(path);
6573                err = -ENOMEM;
6574                goto out;
6575        }
6576
6577        trans = btrfs_start_transaction(tree_root, 0);
6578        BUG_ON(IS_ERR(trans));
6579
6580        if (block_rsv)
6581                trans->block_rsv = block_rsv;
6582
6583        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
6584                level = btrfs_header_level(root->node);
6585                path->nodes[level] = btrfs_lock_root_node(root);
6586                btrfs_set_lock_blocking(path->nodes[level]);
6587                path->slots[level] = 0;
6588                path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6589                memset(&wc->update_progress, 0,
6590                       sizeof(wc->update_progress));
6591        } else {
6592                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
6593                memcpy(&wc->update_progress, &key,
6594                       sizeof(wc->update_progress));
6595
6596                level = root_item->drop_level;
6597                BUG_ON(level == 0);
6598                path->lowest_level = level;
6599                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6600                path->lowest_level = 0;
6601                if (ret < 0) {
6602                        err = ret;
6603                        goto out_free;
6604                }
6605                WARN_ON(ret > 0);
6606
6607                /*
6608                 * unlock our path, this is safe because only this
6609                 * function is allowed to delete this snapshot
6610                 */
6611                btrfs_unlock_up_safe(path, 0);
6612
6613                level = btrfs_header_level(root->node);
6614                while (1) {
6615                        btrfs_tree_lock(path->nodes[level]);
6616                        btrfs_set_lock_blocking(path->nodes[level]);
6617
6618                        ret = btrfs_lookup_extent_info(trans, root,
6619                                                path->nodes[level]->start,
6620                                                path->nodes[level]->len,
6621                                                &wc->refs[level],
6622                                                &wc->flags[level]);
6623                        BUG_ON(ret);
6624                        BUG_ON(wc->refs[level] == 0);
6625
6626                        if (level == root_item->drop_level)
6627                                break;
6628
6629                        btrfs_tree_unlock(path->nodes[level]);
6630                        WARN_ON(wc->refs[level] != 1);
6631                        level--;
6632                }
6633        }
6634
6635        wc->level = level;
6636        wc->shared_level = -1;
6637        wc->stage = DROP_REFERENCE;
6638        wc->update_ref = update_ref;
6639        wc->keep_locks = 0;
6640        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6641
6642        while (1) {
6643                ret = walk_down_tree(trans, root, path, wc);
6644                if (ret < 0) {
6645                        err = ret;
6646                        break;
6647                }
6648
6649                ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
6650                if (ret < 0) {
6651                        err = ret;
6652                        break;
6653                }
6654
6655                if (ret > 0) {
6656                        BUG_ON(wc->stage != DROP_REFERENCE);
6657                        break;
6658                }
6659
6660                if (wc->stage == DROP_REFERENCE) {
6661                        level = wc->level;
6662                        btrfs_node_key(path->nodes[level],
6663                                       &root_item->drop_progress,
6664                                       path->slots[level]);
6665                        root_item->drop_level = level;
6666                }
6667
6668                BUG_ON(wc->level == 0);
6669                if (btrfs_should_end_transaction(trans, tree_root)) {
6670                        ret = btrfs_update_root(trans, tree_root,
6671                                                &root->root_key,
6672                                                root_item);
6673                        BUG_ON(ret);
6674
6675                        btrfs_end_transaction_throttle(trans, tree_root);
6676                        trans = btrfs_start_transaction(tree_root, 0);
6677                        BUG_ON(IS_ERR(trans));
6678                        if (block_rsv)
6679                                trans->block_rsv = block_rsv;
6680                }
6681        }
6682        btrfs_release_path(path);
6683        BUG_ON(err);
6684
6685        ret = btrfs_del_root(trans, tree_root, &root->root_key);
6686        BUG_ON(ret);
6687
6688        if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
6689                ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
6690                                           NULL, NULL);
6691                BUG_ON(ret < 0);
6692                if (ret > 0) {
6693                        /* if we fail to delete the orphan item this time
6694                         * around, it'll get picked up the next time.
6695                         *
6696                         * The most common failure here is just -ENOENT.
6697                         */
6698                        btrfs_del_orphan_item(trans, tree_root,
6699                                              root->root_key.objectid);
6700                }
6701        }
6702
6703        if (root->in_radix) {
6704                btrfs_free_fs_root(tree_root->fs_info, root);
6705        } else {
6706                free_extent_buffer(root->node);
6707                free_extent_buffer(root->commit_root);
6708                kfree(root);
6709        }
6710out_free:
6711        btrfs_end_transaction_throttle(trans, tree_root);
6712        kfree(wc);
6713        btrfs_free_path(path);
6714out:
6715        if (err)
6716                btrfs_std_error(root->fs_info, err);
6717        return;
6718}
6719
6720/*
6721 * drop subtree rooted at tree block 'node'.
6722 *
6723 * NOTE: this function will unlock and release tree block 'node'
6724 */
6725int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6726                        struct btrfs_root *root,
6727                        struct extent_buffer *node,
6728                        struct extent_buffer *parent)
6729{
6730        struct btrfs_path *path;
6731        struct walk_control *wc;
6732        int level;
6733        int parent_level;
6734        int ret = 0;
6735        int wret;
6736
6737        BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6738
6739        path = btrfs_alloc_path();
6740        if (!path)
6741                return -ENOMEM;
6742
6743        wc = kzalloc(sizeof(*wc), GFP_NOFS);
6744        if (!wc) {
6745                btrfs_free_path(path);
6746                return -ENOMEM;
6747        }
6748
6749        btrfs_assert_tree_locked(parent);
6750        parent_level = btrfs_header_level(parent);
6751        extent_buffer_get(parent);
6752        path->nodes[parent_level] = parent;
6753        path->slots[parent_level] = btrfs_header_nritems(parent);
6754
6755        btrfs_assert_tree_locked(node);
6756        level = btrfs_header_level(node);
6757        path->nodes[level] = node;
6758        path->slots[level] = 0;
6759        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6760
6761        wc->refs[parent_level] = 1;
6762        wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6763        wc->level = level;
6764        wc->shared_level = -1;
6765        wc->stage = DROP_REFERENCE;
6766        wc->update_ref = 0;
6767        wc->keep_locks = 1;
6768        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6769
6770        while (1) {
6771                wret = walk_down_tree(trans, root, path, wc);
6772                if (wret < 0) {
6773                        ret = wret;
6774                        break;
6775                }
6776
6777                wret = walk_up_tree(trans, root, path, wc, parent_level);
6778                if (wret < 0)
6779                        ret = wret;
6780                if (wret != 0)
6781                        break;
6782        }
6783
6784        kfree(wc);
6785        btrfs_free_path(path);
6786        return ret;
6787}
6788
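/*
 * Work out the redundancy profile a block group should be rewritten with,
 * e.g. when it is set read-only for relocation.  On a single-device
 * filesystem RAID0 degrades to the single profile and RAID1/RAID10 degrade
 * to DUP; with multiple devices DUP is upgraded to RAID1 and single-device
 * chunks to RAID0.
 */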
6789static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
6790{
6791        u64 num_devices;
6792        u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
6793                BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
6794
6795        /*
6796         * we add in the count of missing devices because we want
6797         * to make sure that any RAID levels on a degraded FS
6798         * continue to be honored.
6799         */
6800        num_devices = root->fs_info->fs_devices->rw_devices +
6801                root->fs_info->fs_devices->missing_devices;
6802
6803        if (num_devices == 1) {
6804                stripped |= BTRFS_BLOCK_GROUP_DUP;
6805                stripped = flags & ~stripped;
6806
6807                /* turn raid0 into single device chunks */
6808                if (flags & BTRFS_BLOCK_GROUP_RAID0)
6809                        return stripped;
6810
6811                /* turn mirroring into duplication */
6812                if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
6813                             BTRFS_BLOCK_GROUP_RAID10))
6814                        return stripped | BTRFS_BLOCK_GROUP_DUP;
6815                return flags;
6816        } else {
6817                /* they already had raid on here, just return */
6818                if (flags & stripped)
6819                        return flags;
6820
6821                stripped |= BTRFS_BLOCK_GROUP_DUP;
6822                stripped = flags & ~stripped;
6823
6824                /* switch duplicated blocks with raid1 */
6825                if (flags & BTRFS_BLOCK_GROUP_DUP)
6826                        return stripped | BTRFS_BLOCK_GROUP_RAID1;
6827
6828                /* turn single device chunks into raid0 */
6829                return stripped | BTRFS_BLOCK_GROUP_RAID0;
6830        }
6831        return flags;
6832}
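/*
 * Editorial sketch (not part of the original source): the conversions
 * performed by update_block_group_flags() can be summarized as
 *
 *      one rw + missing device:  raid0        -> single
 *                                raid1/raid10 -> dup
 *      several devices:          dup          -> raid1
 *                                single       -> raid0
 *                                existing raid profiles are kept as-is
 *
 * The single-device branch, written out with plain control flow over the
 * same flag masks, is roughly:
 *
 *      stripped |= BTRFS_BLOCK_GROUP_DUP;
 *      if (flags & BTRFS_BLOCK_GROUP_RAID0)
 *              return flags & ~stripped;
 *      if (flags & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
 *              return (flags & ~stripped) | BTRFS_BLOCK_GROUP_DUP;
 */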
6833
6834static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
6835{
6836        struct btrfs_space_info *sinfo = cache->space_info;
6837        u64 num_bytes;
6838        u64 min_allocable_bytes;
6839        int ret = -ENOSPC;
6840
6841
6842        /*
6843         * We need to keep some free metadata and system space for
6844         * allocating chunks in some corner cases, so unless we are
6845         * forced, require a small reserve before going read-only.
6846         */
6847        if ((sinfo->flags &
6848             (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
6849            !force)
6850                min_allocable_bytes = 1 * 1024 * 1024;
6851        else
6852                min_allocable_bytes = 0;
6853
6854        spin_lock(&sinfo->lock);
6855        spin_lock(&cache->lock);
6856
6857        if (cache->ro) {
6858                ret = 0;
6859                goto out;
6860        }
6861
6862        num_bytes = cache->key.offset - cache->reserved - cache->pinned -
6863                    cache->bytes_super - btrfs_block_group_used(&cache->item);
6864
6865        if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
6866            sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
6867            min_allocable_bytes <= sinfo->total_bytes) {
6868                sinfo->bytes_readonly += num_bytes;
6869                cache->ro = 1;
6870                ret = 0;
6871        }
6872out:
6873        spin_unlock(&cache->lock);
6874        spin_unlock(&sinfo->lock);
6875        return ret;
6876}
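/*
 * Editorial sketch (not in the original file): set_block_group_ro() boils
 * down to simple arithmetic on the space_info counters.  With hypothetical
 * numbers for a 1G block group:
 *
 *      num_bytes = key.offset - reserved - pinned - bytes_super - used
 *                = 1G - 0 - 0 - 4M - 600M ~= 420M of unused space
 *
 * and the group may go read-only only if
 *
 *      used + reserved + pinned + may_use + readonly
 *              + num_bytes + min_allocable_bytes <= total_bytes
 *
 * i.e. re-counting the group's unused space as bytes_readonly must not
 * overcommit the space_info.
 */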
6877
6878int btrfs_set_block_group_ro(struct btrfs_root *root,
6879                             struct btrfs_block_group_cache *cache)
6880
6881{
6882        struct btrfs_trans_handle *trans;
6883        u64 alloc_flags;
6884        int ret;
6885
6886        BUG_ON(cache->ro);
6887
6888        trans = btrfs_join_transaction(root);
6889        BUG_ON(IS_ERR(trans));
6890
6891        alloc_flags = update_block_group_flags(root, cache->flags);
6892        if (alloc_flags != cache->flags)
6893                do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6894                               CHUNK_ALLOC_FORCE);
6895
6896        ret = set_block_group_ro(cache, 0);
6897        if (!ret)
6898                goto out;
6899        alloc_flags = get_alloc_profile(root, cache->space_info->flags);
6900        ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6901                             CHUNK_ALLOC_FORCE);
6902        if (ret < 0)
6903                goto out;
6904        ret = set_block_group_ro(cache, 0);
6905out:
6906        btrfs_end_transaction(trans, root);
6907        return ret;
6908}
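/*
 * Editorial sketch: the usual caller of this pair of helpers is block
 * group relocation (balance), which lives outside this file.  A hedged
 * outline of that usage:
 *
 *      ret = btrfs_set_block_group_ro(root, cache);
 *      if (ret)
 *              goto fail;
 *      ... move all extents out of the block group ...
 *      if (relocation_failed)
 *              btrfs_set_block_group_rw(root, cache);
 *
 * The CHUNK_ALLOC_FORCE retry above exists so that, when the group cannot
 * go read-only without overcommitting its space_info, a fresh chunk is
 * allocated first and set_block_group_ro() is tried again.
 */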
6909
6910int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
6911                            struct btrfs_root *root, u64 type)
6912{
6913        u64 alloc_flags = get_alloc_profile(root, type);
6914        return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6915                              CHUNK_ALLOC_FORCE);
6916}
6917
6918/*
6919 * helper to account the unused space of all the read-only block groups in
6920 * the list.  Takes mirrors into account.
6921 */
6922static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
6923{
6924        struct btrfs_block_group_cache *block_group;
6925        u64 free_bytes = 0;
6926        int factor;
6927
6928        list_for_each_entry(block_group, groups_list, list) {
6929                spin_lock(&block_group->lock);
6930
6931                if (!block_group->ro) {
6932                        spin_unlock(&block_group->lock);
6933                        continue;
6934                }
6935
6936                if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
6937                                          BTRFS_BLOCK_GROUP_RAID10 |
6938                                          BTRFS_BLOCK_GROUP_DUP))
6939                        factor = 2;
6940                else
6941                        factor = 1;
6942
6943                free_bytes += (block_group->key.offset -
6944                               btrfs_block_group_used(&block_group->item)) *
6945                               factor;
6946
6947                spin_unlock(&block_group->lock);
6948        }
6949
6950        return free_bytes;
6951}
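/*
 * Editorial example (hypothetical numbers): a read-only RAID1 block group
 * with key.offset == 1G and 256M used contributes
 * (1G - 256M) * 2 = 1.5G of raw free space, since both mirrors of the
 * unused range become available again; a single or RAID0 group of the same
 * size contributes only 768M (factor == 1).
 */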
6952
6953/*
6954 * helper to account the unused space of all the read-only block groups in
6955 * the space_info.  Takes mirrors into account.
6956 */
6957u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
6958{
6959        int i;
6960        u64 free_bytes = 0;
6961
6962        spin_lock(&sinfo->lock);
6963
6964        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
6965                if (!list_empty(&sinfo->block_groups[i]))
6966                        free_bytes += __btrfs_get_ro_block_group_free_space(
6967                                                &sinfo->block_groups[i]);
6968
6969        spin_unlock(&sinfo->lock);
6970
6971        return free_bytes;
6972}
6973
6974int btrfs_set_block_group_rw(struct btrfs_root *root,
6975                              struct btrfs_block_group_cache *cache)
6976{
6977        struct btrfs_space_info *sinfo = cache->space_info;
6978        u64 num_bytes;
6979
6980        BUG_ON(!cache->ro);
6981
6982        spin_lock(&sinfo->lock);
6983        spin_lock(&cache->lock);
6984        num_bytes = cache->key.offset - cache->reserved - cache->pinned -
6985                    cache->bytes_super - btrfs_block_group_used(&cache->item);
6986        sinfo->bytes_readonly -= num_bytes;
6987        cache->ro = 0;
6988        spin_unlock(&cache->lock);
6989        spin_unlock(&sinfo->lock);
6990        return 0;
6991}
6992
6993/*
6994 * checks to see if it's even possible to relocate this block group.
6995 *
6996 * @return - -1 if it's not a good idea to relocate this block group, 0 if
6997 * it's ok to go ahead and try.
6998 */
6999int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7000{
7001        struct btrfs_block_group_cache *block_group;
7002        struct btrfs_space_info *space_info;
7003        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7004        struct btrfs_device *device;
7005        u64 min_free;
7006        u64 dev_min = 1;
7007        u64 dev_nr = 0;
7008        int index;
7009        int full = 0;
7010        int ret = 0;
7011
7012        block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
7013
7014        /* odd, couldn't find the block group, leave it alone */
7015        if (!block_group)
7016                return -1;
7017
7018        min_free = btrfs_block_group_used(&block_group->item);
7019
7020        /* no bytes used, we're good */
7021        if (!min_free)
7022                goto out;
7023
7024        space_info = block_group->space_info;
7025        spin_lock(&space_info->lock);
7026
7027        full = space_info->full;
7028
7029        /*
7030         * if this is the last block group we have in this space, we can't
7031         * relocate it unless we're able to allocate a new chunk below.
7032         *
7033         * Otherwise, we need to make sure we have room in the space to handle
7034         * all of the extents from this block group.  If we can, we're good
7035         */
7036        if ((space_info->total_bytes != block_group->key.offset) &&
7037            (space_info->bytes_used + space_info->bytes_reserved +
7038             space_info->bytes_pinned + space_info->bytes_readonly +
7039             min_free < space_info->total_bytes)) {
7040                spin_unlock(&space_info->lock);
7041                goto out;
7042        }
7043        spin_unlock(&space_info->lock);
7044
7045        /*
7046         * ok we don't have enough space, but maybe we have free space on our
7047         * devices to allocate new chunks for relocation, so loop through our
7048         * alloc devices and guess if we have enough space.  However, if we
7049         * were marked as full, then we know there aren't enough chunks, and we
7050         * can just return.
7051         */
7052        ret = -1;
7053        if (full)
7054                goto out;
7055
7056        /*
7057         * index:
7058         *      0: raid10
7059         *      1: raid1
7060         *      2: dup
7061         *      3: raid0
7062         *      4: single
7063         */
7064        index = get_block_group_index(block_group);
7065        if (index == 0) {
7066                dev_min = 4;
7067                /* Divide by 2 */
7068                min_free >>= 1;
7069        } else if (index == 1) {
7070                dev_min = 2;
7071        } else if (index == 2) {
7072                /* Multiply by 2 */
7073                min_free <<= 1;
7074        } else if (index == 3) {
7075                dev_min = fs_devices->rw_devices;
7076                do_div(min_free, dev_min);
7077        }
7078
7079        mutex_lock(&root->fs_info->chunk_mutex);
7080        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7081                u64 dev_offset;
7082
7083                /*
7084                 * check to make sure we can actually find a chunk with enough
7085                 * space to fit our block group in.
7086                 */
7087                if (device->total_bytes > device->bytes_used + min_free) {
7088                        ret = find_free_dev_extent(NULL, device, min_free,
7089                                                   &dev_offset, NULL);
7090                        if (!ret)
7091                                dev_nr++;
7092
7093                        if (dev_nr >= dev_min)
7094                                break;
7095
7096                        ret = -1;
7097                }
7098        }
7099        mutex_unlock(&root->fs_info->chunk_mutex);
7100out:
7101        btrfs_put_block_group(block_group);
7102        return ret;
7103}
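/*
 * Editorial sketch: the per-profile adjustment above estimates how big a
 * hole each device must offer before relocation looks feasible.  For a
 * block group with 1G of used bytes:
 *
 *      raid10: 512M (min_free >> 1) on at least 4 devices
 *      raid1:  1G on at least 2 devices
 *      dup:    2G (min_free << 1) on a single device
 *      raid0:  1G / rw_devices on every rw device
 *      single: 1G on one device
 *
 * find_free_dev_extent() is only used as a rough "is there a hole this
 * big" probe here; the real chunk allocation happens later, during the
 * relocation itself.
 */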
7104
7105static int find_first_block_group(struct btrfs_root *root,
7106                struct btrfs_path *path, struct btrfs_key *key)
7107{
7108        int ret = 0;
7109        struct btrfs_key found_key;
7110        struct extent_buffer *leaf;
7111        int slot;
7112
7113        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7114        if (ret < 0)
7115                goto out;
7116
7117        while (1) {
7118                slot = path->slots[0];
7119                leaf = path->nodes[0];
7120                if (slot >= btrfs_header_nritems(leaf)) {
7121                        ret = btrfs_next_leaf(root, path);
7122                        if (ret == 0)
7123                                continue;
7124                        if (ret < 0)
7125                                goto out;
7126                        break;
7127                }
7128                btrfs_item_key_to_cpu(leaf, &found_key, slot);
7129
7130                if (found_key.objectid >= key->objectid &&
7131                    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
7132                        ret = 0;
7133                        goto out;
7134                }
7135                path->slots[0]++;
7136        }
7137out:
7138        return ret;
7139}
7140
7141void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
7142{
7143        struct btrfs_block_group_cache *block_group;
7144        u64 last = 0;
7145
7146        while (1) {
7147                struct inode *inode;
7148
7149                block_group = btrfs_lookup_first_block_group(info, last);
7150                while (block_group) {
7151                        spin_lock(&block_group->lock);
7152                        if (block_group->iref)
7153                                break;
7154                        spin_unlock(&block_group->lock);
7155                        block_group = next_block_group(info->tree_root,
7156                                                       block_group);
7157                }
7158                if (!block_group) {
7159                        if (last == 0)
7160                                break;
7161                        last = 0;
7162                        continue;
7163                }
7164
7165                inode = block_group->inode;
7166                block_group->iref = 0;
7167                block_group->inode = NULL;
7168                spin_unlock(&block_group->lock);
7169                iput(inode);
7170                last = block_group->key.objectid + block_group->key.offset;
7171                btrfs_put_block_group(block_group);
7172        }
7173}
7174
7175int btrfs_free_block_groups(struct btrfs_fs_info *info)
7176{
7177        struct btrfs_block_group_cache *block_group;
7178        struct btrfs_space_info *space_info;
7179        struct btrfs_caching_control *caching_ctl;
7180        struct rb_node *n;
7181
7182        down_write(&info->extent_commit_sem);
7183        while (!list_empty(&info->caching_block_groups)) {
7184                caching_ctl = list_entry(info->caching_block_groups.next,
7185                                         struct btrfs_caching_control, list);
7186                list_del(&caching_ctl->list);
7187                put_caching_control(caching_ctl);
7188        }
7189        up_write(&info->extent_commit_sem);
7190
7191        spin_lock(&info->block_group_cache_lock);
7192        while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7193                block_group = rb_entry(n, struct btrfs_block_group_cache,
7194                                       cache_node);
7195                rb_erase(&block_group->cache_node,
7196                         &info->block_group_cache_tree);
7197                spin_unlock(&info->block_group_cache_lock);
7198
7199                down_write(&block_group->space_info->groups_sem);
7200                list_del(&block_group->list);
7201                up_write(&block_group->space_info->groups_sem);
7202
7203                if (block_group->cached == BTRFS_CACHE_STARTED)
7204                        wait_block_group_cache_done(block_group);
7205
7206                /*
7207                 * We haven't cached this block group, which means we could
7208                 * possibly have excluded extents on this block group.
7209                 */
7210                if (block_group->cached == BTRFS_CACHE_NO)
7211                        free_excluded_extents(info->extent_root, block_group);
7212
7213                btrfs_remove_free_space_cache(block_group);
7214                btrfs_put_block_group(block_group);
7215
7216                spin_lock(&info->block_group_cache_lock);
7217        }
7218        spin_unlock(&info->block_group_cache_lock);
7219
7220        /* now that all the block groups are freed, go through and
7221         * free all the space_info structs.  This is only called during
7222         * the final stages of unmount, and so we know nobody is
7223         * using them.  We call synchronize_rcu() once before we start,
7224         * just to be on the safe side.
7225         */
7226        synchronize_rcu();
7227
7228        release_global_block_rsv(info);
7229
7230        while (!list_empty(&info->space_info)) {
7231                space_info = list_entry(info->space_info.next,
7232                                        struct btrfs_space_info,
7233                                        list);
7234                if (space_info->bytes_pinned > 0 ||
7235                    space_info->bytes_reserved > 0 ||
7236                    space_info->bytes_may_use > 0) {
7237                        WARN_ON(1);
7238                        dump_space_info(space_info, 0, 0);
7239                }
7240                list_del(&space_info->list);
7241                kfree(space_info);
7242        }
7243        return 0;
7244}
7245
7246static void __link_block_group(struct btrfs_space_info *space_info,
7247                               struct btrfs_block_group_cache *cache)
7248{
7249        int index = get_block_group_index(cache);
7250
7251        down_write(&space_info->groups_sem);
7252        list_add_tail(&cache->list, &space_info->block_groups[index]);
7253        up_write(&space_info->groups_sem);
7254}
7255
7256int btrfs_read_block_groups(struct btrfs_root *root)
7257{
7258        struct btrfs_path *path;
7259        int ret;
7260        struct btrfs_block_group_cache *cache;
7261        struct btrfs_fs_info *info = root->fs_info;
7262        struct btrfs_space_info *space_info;
7263        struct btrfs_key key;
7264        struct btrfs_key found_key;
7265        struct extent_buffer *leaf;
7266        int need_clear = 0;
7267        u64 cache_gen;
7268
7269        root = info->extent_root;
7270        key.objectid = 0;
7271        key.offset = 0;
7272        btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
7273        path = btrfs_alloc_path();
7274        if (!path)
7275                return -ENOMEM;
7276        path->reada = 1;
7277
7278        cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7279        if (btrfs_test_opt(root, SPACE_CACHE) &&
7280            btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7281                need_clear = 1;
7282        if (btrfs_test_opt(root, CLEAR_CACHE))
7283                need_clear = 1;
7284
7285        while (1) {
7286                ret = find_first_block_group(root, path, &key);
7287                if (ret > 0)
7288                        break;
7289                if (ret != 0)
7290                        goto error;
7291                leaf = path->nodes[0];
7292                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7293                cache = kzalloc(sizeof(*cache), GFP_NOFS);
7294                if (!cache) {
7295                        ret = -ENOMEM;
7296                        goto error;
7297                }
7298                cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7299                                                GFP_NOFS);
7300                if (!cache->free_space_ctl) {
7301                        kfree(cache);
7302                        ret = -ENOMEM;
7303                        goto error;
7304                }
7305
7306                atomic_set(&cache->count, 1);
7307                spin_lock_init(&cache->lock);
7308                cache->fs_info = info;
7309                INIT_LIST_HEAD(&cache->list);
7310                INIT_LIST_HEAD(&cache->cluster_list);
7311
7312                if (need_clear)
7313                        cache->disk_cache_state = BTRFS_DC_CLEAR;
7314
7315                read_extent_buffer(leaf, &cache->item,
7316                                   btrfs_item_ptr_offset(leaf, path->slots[0]),
7317                                   sizeof(cache->item));
7318                memcpy(&cache->key, &found_key, sizeof(found_key));
7319
7320                key.objectid = found_key.objectid + found_key.offset;
7321                btrfs_release_path(path);
7322                cache->flags = btrfs_block_group_flags(&cache->item);
7323                cache->sectorsize = root->sectorsize;
7324
7325                btrfs_init_free_space_ctl(cache);
7326
7327                /*
7328                 * We need to exclude the super stripes now so that the space
7329                 * info has super bytes accounted for, otherwise we'll think
7330                 * we have more space than we actually do.
7331                 */
7332                exclude_super_stripes(root, cache);
7333
7334                /*
7335                 * check for two cases, either we are full, and therefore
7336                 * don't need to bother with the caching work since we won't
7337                 * find any space, or we are empty, and we can just add all
7338                 * the space in and be done with it.  This saves us a lot of
7339                 * time, particularly in the full case.
7340                 */
7341                if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7342                        cache->last_byte_to_unpin = (u64)-1;
7343                        cache->cached = BTRFS_CACHE_FINISHED;
7344                        free_excluded_extents(root, cache);
7345                } else if (btrfs_block_group_used(&cache->item) == 0) {
7346                        cache->last_byte_to_unpin = (u64)-1;
7347                        cache->cached = BTRFS_CACHE_FINISHED;
7348                        add_new_free_space(cache, root->fs_info,
7349                                           found_key.objectid,
7350                                           found_key.objectid +
7351                                           found_key.offset);
7352                        free_excluded_extents(root, cache);
7353                }
7354
7355                ret = update_space_info(info, cache->flags, found_key.offset,
7356                                        btrfs_block_group_used(&cache->item),
7357                                        &space_info);
7358                BUG_ON(ret);
7359                cache->space_info = space_info;
7360                spin_lock(&cache->space_info->lock);
7361                cache->space_info->bytes_readonly += cache->bytes_super;
7362                spin_unlock(&cache->space_info->lock);
7363
7364                __link_block_group(space_info, cache);
7365
7366                ret = btrfs_add_block_group_cache(root->fs_info, cache);
7367                BUG_ON(ret);
7368
7369                set_avail_alloc_bits(root->fs_info, cache->flags);
7370                if (btrfs_chunk_readonly(root, cache->key.objectid))
7371                        set_block_group_ro(cache, 1);
7372        }
7373
7374        list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7375                if (!(get_alloc_profile(root, space_info->flags) &
7376                      (BTRFS_BLOCK_GROUP_RAID10 |
7377                       BTRFS_BLOCK_GROUP_RAID1 |
7378                       BTRFS_BLOCK_GROUP_DUP)))
7379                        continue;
7380                /*
7381                 * avoid allocating from un-mirrored block groups (raid0 and
7382                 * single) if there are mirrored block groups.
7383                 */
7384                list_for_each_entry(cache, &space_info->block_groups[3], list)
7385                        set_block_group_ro(cache, 1);
7386                list_for_each_entry(cache, &space_info->block_groups[4], list)
7387                        set_block_group_ro(cache, 1);
7388        }
7389
7390        init_global_block_rsv(info);
7391        ret = 0;
7392error:
7393        btrfs_free_path(path);
7394        return ret;
7395}
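/*
 * Editorial note: btrfs_read_block_groups() runs once per mount and walks
 * every BLOCK_GROUP_ITEM in the extent tree, so the shortcuts above
 * matter: a completely full or completely empty group is marked
 * BTRFS_CACHE_FINISHED right away and never needs the caching kthread.
 * need_clear / BTRFS_DC_CLEAR forces the on-disk free space cache to be
 * rebuilt when its generation does not match the superblock or when the
 * "clear_cache" mount option was given.
 */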
7396
7397int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7398                           struct btrfs_root *root, u64 bytes_used,
7399                           u64 type, u64 chunk_objectid, u64 chunk_offset,
7400                           u64 size)
7401{
7402        int ret;
7403        struct btrfs_root *extent_root;
7404        struct btrfs_block_group_cache *cache;
7405
7406        extent_root = root->fs_info->extent_root;
7407
7408        root->fs_info->last_trans_log_full_commit = trans->transid;
7409
7410        cache = kzalloc(sizeof(*cache), GFP_NOFS);
7411        if (!cache)
7412                return -ENOMEM;
7413        cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7414                                        GFP_NOFS);
7415        if (!cache->free_space_ctl) {
7416                kfree(cache);
7417                return -ENOMEM;
7418        }
7419
7420        cache->key.objectid = chunk_offset;
7421        cache->key.offset = size;
7422        cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7423        cache->sectorsize = root->sectorsize;
7424        cache->fs_info = root->fs_info;
7425
7426        atomic_set(&cache->count, 1);
7427        spin_lock_init(&cache->lock);
7428        INIT_LIST_HEAD(&cache->list);
7429        INIT_LIST_HEAD(&cache->cluster_list);
7430
7431        btrfs_init_free_space_ctl(cache);
7432
7433        btrfs_set_block_group_used(&cache->item, bytes_used);
7434        btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
7435        cache->flags = type;
7436        btrfs_set_block_group_flags(&cache->item, type);
7437
7438        cache->last_byte_to_unpin = (u64)-1;
7439        cache->cached = BTRFS_CACHE_FINISHED;
7440        exclude_super_stripes(root, cache);
7441
7442        add_new_free_space(cache, root->fs_info, chunk_offset,
7443                           chunk_offset + size);
7444
7445        free_excluded_extents(root, cache);
7446
7447        ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7448                                &cache->space_info);
7449        BUG_ON(ret);
7450
7451        spin_lock(&cache->space_info->lock);
7452        cache->space_info->bytes_readonly += cache->bytes_super;
7453        spin_unlock(&cache->space_info->lock);
7454
7455        __link_block_group(cache->space_info, cache);
7456
7457        ret = btrfs_add_block_group_cache(root->fs_info, cache);
7458        BUG_ON(ret);
7459
7460        ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
7461                                sizeof(cache->item));
7462        BUG_ON(ret);
7463
7464        set_avail_alloc_bits(extent_root->fs_info, type);
7465
7466        return 0;
7467}
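/*
 * Editorial note (sketch, not verbatim): the usual caller is the chunk
 * allocation path in volumes.c, roughly
 *
 *      ret = btrfs_make_block_group(trans, extent_root, 0, type,
 *                                   BTRFS_FIRST_CHUNK_TREE_OBJECTID,
 *                                   chunk_offset, chunk_size);
 *
 * i.e. a brand new block group starts with bytes_used == 0 and covers
 * exactly the newly allocated chunk.
 */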
7468
7469int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7470                             struct btrfs_root *root, u64 group_start)
7471{
7472        struct btrfs_path *path;
7473        struct btrfs_block_group_cache *block_group;
7474        struct btrfs_free_cluster *cluster;
7475        struct btrfs_root *tree_root = root->fs_info->tree_root;
7476        struct btrfs_key key;
7477        struct inode *inode;
7478        int ret;
7479        int factor;
7480
7481        root = root->fs_info->extent_root;
7482
7483        block_group = btrfs_lookup_block_group(root->fs_info, group_start);
7484        BUG_ON(!block_group);
7485        BUG_ON(!block_group->ro);
7486
7487        /*
7488         * Free the reserved super bytes from this block group before
7489         * removing it.
7490         */
7491        free_excluded_extents(root, block_group);
7492
7493        memcpy(&key, &block_group->key, sizeof(key));
7494        if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
7495                                  BTRFS_BLOCK_GROUP_RAID1 |
7496                                  BTRFS_BLOCK_GROUP_RAID10))
7497                factor = 2;
7498        else
7499                factor = 1;
7500
7501        /* make sure this block group isn't part of an allocation cluster */
7502        cluster = &root->fs_info->data_alloc_cluster;
7503        spin_lock(&cluster->refill_lock);
7504        btrfs_return_cluster_to_free_space(block_group, cluster);
7505        spin_unlock(&cluster->refill_lock);
7506
7507        /*
7508         * make sure this block group isn't part of a metadata
7509         * allocation cluster
7510         */
7511        cluster = &root->fs_info->meta_alloc_cluster;
7512        spin_lock(&cluster->refill_lock);
7513        btrfs_return_cluster_to_free_space(block_group, cluster);
7514        spin_unlock(&cluster->refill_lock);
7515
7516        path = btrfs_alloc_path();
7517        if (!path) {
7518                ret = -ENOMEM;
7519                goto out;
7520        }
7521
7522        inode = lookup_free_space_inode(tree_root, block_group, path);
7523        if (!IS_ERR(inode)) {
7524                ret = btrfs_orphan_add(trans, inode);
7525                BUG_ON(ret);
7526                clear_nlink(inode);
7527                /* One for the block group's ref */
7528                spin_lock(&block_group->lock);
7529                if (block_group->iref) {
7530                        block_group->iref = 0;
7531                        block_group->inode = NULL;
7532                        spin_unlock(&block_group->lock);
7533                        iput(inode);
7534                } else {
7535                        spin_unlock(&block_group->lock);
7536                }
7537                /* One for our lookup ref */
7538                btrfs_add_delayed_iput(inode);
7539        }
7540
7541        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
7542        key.offset = block_group->key.objectid;
7543        key.type = 0;
7544
7545        ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
7546        if (ret < 0)
7547                goto out;
7548        if (ret > 0)
7549                btrfs_release_path(path);
7550        if (ret == 0) {
7551                ret = btrfs_del_item(trans, tree_root, path);
7552                if (ret)
7553                        goto out;
7554                btrfs_release_path(path);
7555        }
7556
7557        spin_lock(&root->fs_info->block_group_cache_lock);
7558        rb_erase(&block_group->cache_node,
7559                 &root->fs_info->block_group_cache_tree);
7560        spin_unlock(&root->fs_info->block_group_cache_lock);
7561
7562        down_write(&block_group->space_info->groups_sem);
7563        /*
7564         * we must use list_del_init so people can check to see if they
7565         * are still on the list after taking the semaphore
7566         */
7567        list_del_init(&block_group->list);
7568        up_write(&block_group->space_info->groups_sem);
7569
7570        if (block_group->cached == BTRFS_CACHE_STARTED)
7571                wait_block_group_cache_done(block_group);
7572
7573        btrfs_remove_free_space_cache(block_group);
7574
7575        spin_lock(&block_group->space_info->lock);
7576        block_group->space_info->total_bytes -= block_group->key.offset;
7577        block_group->space_info->bytes_readonly -= block_group->key.offset;
7578        block_group->space_info->disk_total -= block_group->key.offset * factor;
7579        spin_unlock(&block_group->space_info->lock);
7580
7581        memcpy(&key, &block_group->key, sizeof(key));
7582
7583        btrfs_clear_space_info_full(root->fs_info);
7584
7585        btrfs_put_block_group(block_group);
7586        btrfs_put_block_group(block_group);
7587
7588        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
7589        if (ret > 0)
7590                ret = -EIO;
7591        if (ret < 0)
7592                goto out;
7593
7594        ret = btrfs_del_item(trans, root, path);
7595out:
7596        btrfs_free_path(path);
7597        return ret;
7598}
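/*
 * Editorial summary of the removal sequence above (no new behaviour):
 *  1. detach the group from the data and metadata allocation clusters,
 *  2. orphan and drop the free space cache inode plus its
 *     FREE_SPACE_OBJECTID item,
 *  3. unlink the group from the block group cache rb-tree and from its
 *     space_info list, waiting for any in-flight caching to finish,
 *  4. fix up the space_info counters and finally delete the
 *     BLOCK_GROUP_ITEM from the extent tree.
 */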
7599
7600int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
7601{
7602        struct btrfs_space_info *space_info;
7603        struct btrfs_super_block *disk_super;
7604        u64 features;
7605        u64 flags;
7606        int mixed = 0;
7607        int ret;
7608
7609        disk_super = fs_info->super_copy;
7610        if (!btrfs_super_root(disk_super))
7611                return 1;
7612
7613        features = btrfs_super_incompat_flags(disk_super);
7614        if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
7615                mixed = 1;
7616
7617        flags = BTRFS_BLOCK_GROUP_SYSTEM;
7618        ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7619        if (ret)
7620                goto out;
7621
7622        if (mixed) {
7623                flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
7624                ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7625        } else {
7626                flags = BTRFS_BLOCK_GROUP_METADATA;
7627                ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7628                if (ret)
7629                        goto out;
7630
7631                flags = BTRFS_BLOCK_GROUP_DATA;
7632                ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7633        }
7634out:
7635        return ret;
7636}
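/*
 * Editorial note: with the MIXED_GROUPS incompat bit only two space_infos
 * are created (SYSTEM and METADATA|DATA); without it there are three.
 * Sketch of the resulting flags:
 *
 *      mixed:      SYSTEM, METADATA | DATA
 *      non-mixed:  SYSTEM, METADATA, DATA
 */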
7637
7638int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
7639{
7640        return unpin_extent_range(root, start, end);
7641}
7642
7643int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
7644                               u64 num_bytes, u64 *actual_bytes)
7645{
7646        return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
7647}
7648
7649int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
7650{
7651        struct btrfs_fs_info *fs_info = root->fs_info;
7652        struct btrfs_block_group_cache *cache = NULL;
7653        u64 group_trimmed;
7654        u64 start;
7655        u64 end;
7656        u64 trimmed = 0;
7657        int ret = 0;
7658
7659        cache = btrfs_lookup_block_group(fs_info, range->start);
7660
7661        while (cache) {
7662                if (cache->key.objectid >= (range->start + range->len)) {
7663                        btrfs_put_block_group(cache);
7664                        break;
7665                }
7666
7667                start = max(range->start, cache->key.objectid);
7668                end = min(range->start + range->len,
7669                                cache->key.objectid + cache->key.offset);
7670
7671                if (end - start >= range->minlen) {
7672                        if (!block_group_cache_done(cache)) {
7673                                ret = cache_block_group(cache, NULL, root, 0);
7674                                if (!ret)
7675                                        wait_block_group_cache_done(cache);
7676                        }
7677                        ret = btrfs_trim_block_group(cache,
7678                                                     &group_trimmed,
7679                                                     start,
7680                                                     end,
7681                                                     range->minlen);
7682
7683                        trimmed += group_trimmed;
7684                        if (ret) {
7685                                btrfs_put_block_group(cache);
7686                                break;
7687                        }
7688                }
7689
7690                cache = next_block_group(fs_info->tree_root, cache);
7691        }
7692
7693        range->len = trimmed;
7694        return ret;
7695}
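/*
 * Editorial sketch: btrfs_trim_fs() clamps the caller's range to each
 * block group it overlaps, e.g. for a group at [2G, 3G) and an
 * fstrim_range with start=2.5G, len=4G, only [2.5G, 3G) is handed to
 * btrfs_trim_block_group().  A hypothetical caller (the FITRIM ioctl
 * handler is the real one) looks roughly like:
 *
 *      struct fstrim_range range = {
 *              .start  = 0,
 *              .len    = (u64)-1,
 *              .minlen = 4096,
 *      };
 *      ret = btrfs_trim_fs(root, &range);
 *
 * On success range.len is rewritten with the number of bytes trimmed.
 */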
7696