linux/fs/btrfs/extent-tree.c
   1/*
   2 * Copyright (C) 2007 Oracle.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   16 * Boston, MA 02111-1307, USA.
  17 */
  18#include <linux/sched.h>
  19#include <linux/pagemap.h>
  20#include <linux/writeback.h>
  21#include <linux/blkdev.h>
  22#include <linux/sort.h>
  23#include <linux/rcupdate.h>
  24#include <linux/kthread.h>
  25#include <linux/slab.h>
  26#include <linux/ratelimit.h>
  27#include <linux/percpu_counter.h>
  28#include "hash.h"
  29#include "tree-log.h"
  30#include "disk-io.h"
  31#include "print-tree.h"
  32#include "volumes.h"
  33#include "raid56.h"
  34#include "locking.h"
  35#include "free-space-cache.h"
  36#include "math.h"
  37#include "sysfs.h"
  38#include "qgroup.h"
  39
  40#undef SCRAMBLE_DELAYED_REFS
  41
  42/*
  43 * control flags for do_chunk_alloc's force field
  44 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
  45 * if we really need one.
  46 *
  47 * CHUNK_ALLOC_LIMITED means to only try and allocate one
  48 * if we have very few chunks already allocated.  This is
  49 * used as part of the clustering code to help make sure
  50 * we have a good pool of storage to cluster in, without
   51 * filling the FS with empty chunks.
  52 *
  53 * CHUNK_ALLOC_FORCE means it must try to allocate one
  54 *
  55 */
  56enum {
  57        CHUNK_ALLOC_NO_FORCE = 0,
  58        CHUNK_ALLOC_LIMITED = 1,
  59        CHUNK_ALLOC_FORCE = 2,
  60};
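
/*
 * Editor's example (a hedged sketch, not part of the original file): how
 * a caller picks one of the force levels above when asking for a new
 * chunk.  do_chunk_alloc() is forward-declared below with exactly this
 * signature; BTRFS_BLOCK_GROUP_DATA is one of the block group type flags.
 */
static inline int example_force_data_chunk(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *extent_root)
{
        /* Must allocate a chunk, e.g. after a caller has hit ENOSPC. */
        return do_chunk_alloc(trans, extent_root, BTRFS_BLOCK_GROUP_DATA,
                              CHUNK_ALLOC_FORCE);
}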
  61
  62/*
  63 * Control how reservations are dealt with.
  64 *
  65 * RESERVE_FREE - freeing a reservation.
  66 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
  67 *   ENOSPC accounting
  68 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
  69 *   bytes_may_use as the ENOSPC accounting is done elsewhere
  70 */
  71enum {
  72        RESERVE_FREE = 0,
  73        RESERVE_ALLOC = 1,
  74        RESERVE_ALLOC_NO_ACCOUNT = 2,
  75};
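
/*
 * Editor's example (hedged sketch): pairing a reservation with its
 * release via btrfs_update_reserved_bytes(), forward-declared below.
 * RESERVE_ALLOC updates bytes_may_use for ENOSPC accounting;
 * RESERVE_FREE undoes the reservation.
 */
static inline int example_reserve_then_free(struct btrfs_block_group_cache *cache,
                                            u64 num_bytes)
{
        int ret;

        ret = btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_ALLOC, 0);
        if (ret)
                return ret;
        /* The allocation was abandoned: give the reservation back. */
        return btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_FREE, 0);
}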
  76
  77static int update_block_group(struct btrfs_trans_handle *trans,
  78                              struct btrfs_root *root, u64 bytenr,
  79                              u64 num_bytes, int alloc);
  80static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
  81                                struct btrfs_root *root,
  82                                u64 bytenr, u64 num_bytes, u64 parent,
  83                                u64 root_objectid, u64 owner_objectid,
  84                                u64 owner_offset, int refs_to_drop,
  85                                struct btrfs_delayed_extent_op *extra_op,
  86                                int no_quota);
  87static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
  88                                    struct extent_buffer *leaf,
  89                                    struct btrfs_extent_item *ei);
  90static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
  91                                      struct btrfs_root *root,
  92                                      u64 parent, u64 root_objectid,
  93                                      u64 flags, u64 owner, u64 offset,
  94                                      struct btrfs_key *ins, int ref_mod);
  95static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
  96                                     struct btrfs_root *root,
  97                                     u64 parent, u64 root_objectid,
  98                                     u64 flags, struct btrfs_disk_key *key,
  99                                     int level, struct btrfs_key *ins,
 100                                     int no_quota);
 101static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 102                          struct btrfs_root *extent_root, u64 flags,
 103                          int force);
 104static int find_next_key(struct btrfs_path *path, int level,
 105                         struct btrfs_key *key);
 106static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 107                            int dump_block_groups);
 108static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
 109                                       u64 num_bytes, int reserve,
 110                                       int delalloc);
 111static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
 112                               u64 num_bytes);
 113int btrfs_pin_extent(struct btrfs_root *root,
 114                     u64 bytenr, u64 num_bytes, int reserved);
 115
 116static noinline int
 117block_group_cache_done(struct btrfs_block_group_cache *cache)
 118{
 119        smp_mb();
 120        return cache->cached == BTRFS_CACHE_FINISHED ||
 121                cache->cached == BTRFS_CACHE_ERROR;
 122}
 123
 124static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 125{
 126        return (cache->flags & bits) == bits;
 127}
 128
 129static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 130{
 131        atomic_inc(&cache->count);
 132}
 133
 134void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 135{
 136        if (atomic_dec_and_test(&cache->count)) {
 137                WARN_ON(cache->pinned > 0);
 138                WARN_ON(cache->reserved > 0);
 139                kfree(cache->free_space_ctl);
 140                kfree(cache);
 141        }
 142}
 143
 144/*
 145 * this adds the block group to the fs_info rb tree for the block group
 146 * cache
 147 */
 148static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 149                                struct btrfs_block_group_cache *block_group)
 150{
 151        struct rb_node **p;
 152        struct rb_node *parent = NULL;
 153        struct btrfs_block_group_cache *cache;
 154
 155        spin_lock(&info->block_group_cache_lock);
 156        p = &info->block_group_cache_tree.rb_node;
 157
 158        while (*p) {
 159                parent = *p;
 160                cache = rb_entry(parent, struct btrfs_block_group_cache,
 161                                 cache_node);
 162                if (block_group->key.objectid < cache->key.objectid) {
 163                        p = &(*p)->rb_left;
 164                } else if (block_group->key.objectid > cache->key.objectid) {
 165                        p = &(*p)->rb_right;
 166                } else {
 167                        spin_unlock(&info->block_group_cache_lock);
 168                        return -EEXIST;
 169                }
 170        }
 171
 172        rb_link_node(&block_group->cache_node, parent, p);
 173        rb_insert_color(&block_group->cache_node,
 174                        &info->block_group_cache_tree);
 175
 176        if (info->first_logical_byte > block_group->key.objectid)
 177                info->first_logical_byte = block_group->key.objectid;
 178
 179        spin_unlock(&info->block_group_cache_lock);
 180
 181        return 0;
 182}
 183
 184/*
 185 * This will return the block group at or after bytenr if contains is 0, else
 186 * it will return the block group that contains the bytenr
 187 */
 188static struct btrfs_block_group_cache *
 189block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 190                              int contains)
 191{
 192        struct btrfs_block_group_cache *cache, *ret = NULL;
 193        struct rb_node *n;
 194        u64 end, start;
 195
 196        spin_lock(&info->block_group_cache_lock);
 197        n = info->block_group_cache_tree.rb_node;
 198
 199        while (n) {
 200                cache = rb_entry(n, struct btrfs_block_group_cache,
 201                                 cache_node);
 202                end = cache->key.objectid + cache->key.offset - 1;
 203                start = cache->key.objectid;
 204
 205                if (bytenr < start) {
 206                        if (!contains && (!ret || start < ret->key.objectid))
 207                                ret = cache;
 208                        n = n->rb_left;
 209                } else if (bytenr > start) {
 210                        if (contains && bytenr <= end) {
 211                                ret = cache;
 212                                break;
 213                        }
 214                        n = n->rb_right;
 215                } else {
 216                        ret = cache;
 217                        break;
 218                }
 219        }
 220        if (ret) {
 221                btrfs_get_block_group(ret);
 222                if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
 223                        info->first_logical_byte = ret->key.objectid;
 224        }
 225        spin_unlock(&info->block_group_cache_lock);
 226
 227        return ret;
 228}
 229
 230static int add_excluded_extent(struct btrfs_root *root,
 231                               u64 start, u64 num_bytes)
 232{
 233        u64 end = start + num_bytes - 1;
 234        set_extent_bits(&root->fs_info->freed_extents[0],
 235                        start, end, EXTENT_UPTODATE, GFP_NOFS);
 236        set_extent_bits(&root->fs_info->freed_extents[1],
 237                        start, end, EXTENT_UPTODATE, GFP_NOFS);
 238        return 0;
 239}
 240
 241static void free_excluded_extents(struct btrfs_root *root,
 242                                  struct btrfs_block_group_cache *cache)
 243{
 244        u64 start, end;
 245
 246        start = cache->key.objectid;
 247        end = start + cache->key.offset - 1;
 248
 249        clear_extent_bits(&root->fs_info->freed_extents[0],
 250                          start, end, EXTENT_UPTODATE, GFP_NOFS);
 251        clear_extent_bits(&root->fs_info->freed_extents[1],
 252                          start, end, EXTENT_UPTODATE, GFP_NOFS);
 253}
 254
 255static int exclude_super_stripes(struct btrfs_root *root,
 256                                 struct btrfs_block_group_cache *cache)
 257{
 258        u64 bytenr;
 259        u64 *logical;
 260        int stripe_len;
 261        int i, nr, ret;
 262
 263        if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
 264                stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
 265                cache->bytes_super += stripe_len;
 266                ret = add_excluded_extent(root, cache->key.objectid,
 267                                          stripe_len);
 268                if (ret)
 269                        return ret;
 270        }
 271
 272        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 273                bytenr = btrfs_sb_offset(i);
 274                ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
 275                                       cache->key.objectid, bytenr,
 276                                       0, &logical, &nr, &stripe_len);
 277                if (ret)
 278                        return ret;
 279
 280                while (nr--) {
 281                        u64 start, len;
 282
 283                        if (logical[nr] > cache->key.objectid +
 284                            cache->key.offset)
 285                                continue;
 286
 287                        if (logical[nr] + stripe_len <= cache->key.objectid)
 288                                continue;
 289
 290                        start = logical[nr];
 291                        if (start < cache->key.objectid) {
 292                                start = cache->key.objectid;
 293                                len = (logical[nr] + stripe_len) - start;
 294                        } else {
 295                                len = min_t(u64, stripe_len,
 296                                            cache->key.objectid +
 297                                            cache->key.offset - start);
 298                        }
 299
 300                        cache->bytes_super += len;
 301                        ret = add_excluded_extent(root, start, len);
 302                        if (ret) {
 303                                kfree(logical);
 304                                return ret;
 305                        }
 306                }
 307
 308                kfree(logical);
 309        }
 310        return 0;
 311}
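
/*
 * Editor's note: btrfs keeps up to BTRFS_SUPER_MIRROR_MAX superblock
 * copies at fixed logical offsets returned by btrfs_sb_offset().  Any
 * part of a block group overlapping one of those stripes is added to
 * bytes_super and marked excluded above, so it is never handed out as
 * free space.
 */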
 312
 313static struct btrfs_caching_control *
 314get_caching_control(struct btrfs_block_group_cache *cache)
 315{
 316        struct btrfs_caching_control *ctl;
 317
 318        spin_lock(&cache->lock);
 319        if (!cache->caching_ctl) {
 320                spin_unlock(&cache->lock);
 321                return NULL;
 322        }
 323
 324        ctl = cache->caching_ctl;
 325        atomic_inc(&ctl->count);
 326        spin_unlock(&cache->lock);
 327        return ctl;
 328}
 329
 330static void put_caching_control(struct btrfs_caching_control *ctl)
 331{
 332        if (atomic_dec_and_test(&ctl->count))
 333                kfree(ctl);
 334}
 335
 336/*
  337 * This is only called by cache_block_group.  Since we could have freed
  338 * extents, we need to check pinned_extents for extents that can't be
  339 * used yet, as their free space is released when the transaction commits.
 340 */
 341static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 342                              struct btrfs_fs_info *info, u64 start, u64 end)
 343{
 344        u64 extent_start, extent_end, size, total_added = 0;
 345        int ret;
 346
 347        while (start < end) {
 348                ret = find_first_extent_bit(info->pinned_extents, start,
 349                                            &extent_start, &extent_end,
 350                                            EXTENT_DIRTY | EXTENT_UPTODATE,
 351                                            NULL);
 352                if (ret)
 353                        break;
 354
 355                if (extent_start <= start) {
 356                        start = extent_end + 1;
 357                } else if (extent_start > start && extent_start < end) {
 358                        size = extent_start - start;
 359                        total_added += size;
 360                        ret = btrfs_add_free_space(block_group, start,
 361                                                   size);
 362                        BUG_ON(ret); /* -ENOMEM or logic error */
 363                        start = extent_end + 1;
 364                } else {
 365                        break;
 366                }
 367        }
 368
 369        if (start < end) {
 370                size = end - start;
 371                total_added += size;
 372                ret = btrfs_add_free_space(block_group, start, size);
 373                BUG_ON(ret); /* -ENOMEM or logic error */
 374        }
 375
 376        return total_added;
 377}
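
/*
 * Worked example (editor's note): with start=0, end=100 (end is exclusive;
 * callers pass objectid + offset) and one pinned extent [30, 50], the loop
 * above adds [0, 30) as free space and advances start to 51, and the tail
 * then adds [51, 100), so total_added = 30 + 49 = 79 bytes.
 */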
 378
 379static noinline void caching_thread(struct btrfs_work *work)
 380{
 381        struct btrfs_block_group_cache *block_group;
 382        struct btrfs_fs_info *fs_info;
 383        struct btrfs_caching_control *caching_ctl;
 384        struct btrfs_root *extent_root;
 385        struct btrfs_path *path;
 386        struct extent_buffer *leaf;
 387        struct btrfs_key key;
 388        u64 total_found = 0;
 389        u64 last = 0;
 390        u32 nritems;
 391        int ret = -ENOMEM;
 392
 393        caching_ctl = container_of(work, struct btrfs_caching_control, work);
 394        block_group = caching_ctl->block_group;
 395        fs_info = block_group->fs_info;
 396        extent_root = fs_info->extent_root;
 397
 398        path = btrfs_alloc_path();
 399        if (!path)
 400                goto out;
 401
 402        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 403
 404        /*
 405         * We don't want to deadlock with somebody trying to allocate a new
 406         * extent for the extent root while also trying to search the extent
 407         * root to add free space.  So we skip locking and search the commit
  408         * root, since it's read-only.
 409         */
 410        path->skip_locking = 1;
 411        path->search_commit_root = 1;
 412        path->reada = 1;
 413
 414        key.objectid = last;
 415        key.offset = 0;
 416        key.type = BTRFS_EXTENT_ITEM_KEY;
 417again:
 418        mutex_lock(&caching_ctl->mutex);
 419        /* need to make sure the commit_root doesn't disappear */
 420        down_read(&fs_info->commit_root_sem);
 421
 422next:
 423        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 424        if (ret < 0)
 425                goto err;
 426
 427        leaf = path->nodes[0];
 428        nritems = btrfs_header_nritems(leaf);
 429
 430        while (1) {
 431                if (btrfs_fs_closing(fs_info) > 1) {
 432                        last = (u64)-1;
 433                        break;
 434                }
 435
 436                if (path->slots[0] < nritems) {
 437                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 438                } else {
 439                        ret = find_next_key(path, 0, &key);
 440                        if (ret)
 441                                break;
 442
 443                        if (need_resched() ||
 444                            rwsem_is_contended(&fs_info->commit_root_sem)) {
 445                                caching_ctl->progress = last;
 446                                btrfs_release_path(path);
 447                                up_read(&fs_info->commit_root_sem);
 448                                mutex_unlock(&caching_ctl->mutex);
 449                                cond_resched();
 450                                goto again;
 451                        }
 452
 453                        ret = btrfs_next_leaf(extent_root, path);
 454                        if (ret < 0)
 455                                goto err;
 456                        if (ret)
 457                                break;
 458                        leaf = path->nodes[0];
 459                        nritems = btrfs_header_nritems(leaf);
 460                        continue;
 461                }
 462
 463                if (key.objectid < last) {
 464                        key.objectid = last;
 465                        key.offset = 0;
 466                        key.type = BTRFS_EXTENT_ITEM_KEY;
 467
 468                        caching_ctl->progress = last;
 469                        btrfs_release_path(path);
 470                        goto next;
 471                }
 472
 473                if (key.objectid < block_group->key.objectid) {
 474                        path->slots[0]++;
 475                        continue;
 476                }
 477
 478                if (key.objectid >= block_group->key.objectid +
 479                    block_group->key.offset)
 480                        break;
 481
 482                if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 483                    key.type == BTRFS_METADATA_ITEM_KEY) {
 484                        total_found += add_new_free_space(block_group,
 485                                                          fs_info, last,
 486                                                          key.objectid);
 487                        if (key.type == BTRFS_METADATA_ITEM_KEY)
 488                                last = key.objectid +
 489                                        fs_info->tree_root->nodesize;
 490                        else
 491                                last = key.objectid + key.offset;
 492
 493                        if (total_found > (1024 * 1024 * 2)) {
 494                                total_found = 0;
 495                                wake_up(&caching_ctl->wait);
 496                        }
 497                }
 498                path->slots[0]++;
 499        }
 500        ret = 0;
 501
 502        total_found += add_new_free_space(block_group, fs_info, last,
 503                                          block_group->key.objectid +
 504                                          block_group->key.offset);
 505        caching_ctl->progress = (u64)-1;
 506
 507        spin_lock(&block_group->lock);
 508        block_group->caching_ctl = NULL;
 509        block_group->cached = BTRFS_CACHE_FINISHED;
 510        spin_unlock(&block_group->lock);
 511
 512err:
 513        btrfs_free_path(path);
 514        up_read(&fs_info->commit_root_sem);
 515
 516        free_excluded_extents(extent_root, block_group);
 517
 518        mutex_unlock(&caching_ctl->mutex);
 519out:
 520        if (ret) {
 521                spin_lock(&block_group->lock);
 522                block_group->caching_ctl = NULL;
 523                block_group->cached = BTRFS_CACHE_ERROR;
 524                spin_unlock(&block_group->lock);
 525        }
 526        wake_up(&caching_ctl->wait);
 527
 528        put_caching_control(caching_ctl);
 529        btrfs_put_block_group(block_group);
 530}
 531
 532static int cache_block_group(struct btrfs_block_group_cache *cache,
 533                             int load_cache_only)
 534{
 535        DEFINE_WAIT(wait);
 536        struct btrfs_fs_info *fs_info = cache->fs_info;
 537        struct btrfs_caching_control *caching_ctl;
 538        int ret = 0;
 539
 540        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 541        if (!caching_ctl)
 542                return -ENOMEM;
 543
 544        INIT_LIST_HEAD(&caching_ctl->list);
 545        mutex_init(&caching_ctl->mutex);
 546        init_waitqueue_head(&caching_ctl->wait);
 547        caching_ctl->block_group = cache;
 548        caching_ctl->progress = cache->key.objectid;
 549        atomic_set(&caching_ctl->count, 1);
 550        btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
 551                        caching_thread, NULL, NULL);
 552
 553        spin_lock(&cache->lock);
 554        /*
  555         * This should be a rare occasion, but it can happen in the case where
  556         * one thread starts to load the space cache info, and then some other
  557         * thread starts a transaction commit which tries to do an allocation
  558         * while the first thread is still loading the space cache info.  The
  559         * previous loop should have kept us from choosing this block group,
  560         * but if we've moved to the state where we will wait on caching block
  561         * groups, we need to first check whether we're doing a fast load here,
  562         * so we can wait for it to finish; otherwise we could end up allocating
  563         * from a block group whose cache gets evicted for one reason or
  564         * another.
 565         */
 566        while (cache->cached == BTRFS_CACHE_FAST) {
 567                struct btrfs_caching_control *ctl;
 568
 569                ctl = cache->caching_ctl;
 570                atomic_inc(&ctl->count);
 571                prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
 572                spin_unlock(&cache->lock);
 573
 574                schedule();
 575
 576                finish_wait(&ctl->wait, &wait);
 577                put_caching_control(ctl);
 578                spin_lock(&cache->lock);
 579        }
 580
 581        if (cache->cached != BTRFS_CACHE_NO) {
 582                spin_unlock(&cache->lock);
 583                kfree(caching_ctl);
 584                return 0;
 585        }
 586        WARN_ON(cache->caching_ctl);
 587        cache->caching_ctl = caching_ctl;
 588        cache->cached = BTRFS_CACHE_FAST;
 589        spin_unlock(&cache->lock);
 590
 591        if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
 592                mutex_lock(&caching_ctl->mutex);
 593                ret = load_free_space_cache(fs_info, cache);
 594
 595                spin_lock(&cache->lock);
 596                if (ret == 1) {
 597                        cache->caching_ctl = NULL;
 598                        cache->cached = BTRFS_CACHE_FINISHED;
 599                        cache->last_byte_to_unpin = (u64)-1;
 600                        caching_ctl->progress = (u64)-1;
 601                } else {
 602                        if (load_cache_only) {
 603                                cache->caching_ctl = NULL;
 604                                cache->cached = BTRFS_CACHE_NO;
 605                        } else {
 606                                cache->cached = BTRFS_CACHE_STARTED;
 607                                cache->has_caching_ctl = 1;
 608                        }
 609                }
 610                spin_unlock(&cache->lock);
 611                mutex_unlock(&caching_ctl->mutex);
 612
 613                wake_up(&caching_ctl->wait);
 614                if (ret == 1) {
 615                        put_caching_control(caching_ctl);
 616                        free_excluded_extents(fs_info->extent_root, cache);
 617                        return 0;
 618                }
 619        } else {
 620                /*
 621                 * We are not going to do the fast caching, set cached to the
 622                 * appropriate value and wakeup any waiters.
 623                 */
 624                spin_lock(&cache->lock);
 625                if (load_cache_only) {
 626                        cache->caching_ctl = NULL;
 627                        cache->cached = BTRFS_CACHE_NO;
 628                } else {
 629                        cache->cached = BTRFS_CACHE_STARTED;
 630                        cache->has_caching_ctl = 1;
 631                }
 632                spin_unlock(&cache->lock);
 633                wake_up(&caching_ctl->wait);
 634        }
 635
 636        if (load_cache_only) {
 637                put_caching_control(caching_ctl);
 638                return 0;
 639        }
 640
 641        down_write(&fs_info->commit_root_sem);
 642        atomic_inc(&caching_ctl->count);
 643        list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 644        up_write(&fs_info->commit_root_sem);
 645
 646        btrfs_get_block_group(cache);
 647
 648        btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 649
 650        return ret;
 651}
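
/*
 * Editor's note: a summary of the caching states driven above and in
 * caching_thread():
 *
 *   BTRFS_CACHE_NO       nothing cached yet (initial state)
 *   BTRFS_CACHE_FAST     trying to load the on-disk free space cache;
 *                        on success -> BTRFS_CACHE_FINISHED, on failure
 *                        -> BTRFS_CACHE_STARTED (or back to _NO when
 *                        load_cache_only is set)
 *   BTRFS_CACHE_STARTED  caching_thread() queued or running
 *   BTRFS_CACHE_FINISHED caching complete, free space usable
 *   BTRFS_CACHE_ERROR    caching_thread() failed
 */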
 652
 653/*
 654 * return the block group that starts at or after bytenr
 655 */
 656static struct btrfs_block_group_cache *
 657btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
 658{
 659        struct btrfs_block_group_cache *cache;
 660
 661        cache = block_group_cache_tree_search(info, bytenr, 0);
 662
 663        return cache;
 664}
 665
 666/*
 667 * return the block group that contains the given bytenr
 668 */
 669struct btrfs_block_group_cache *btrfs_lookup_block_group(
 670                                                 struct btrfs_fs_info *info,
 671                                                 u64 bytenr)
 672{
 673        struct btrfs_block_group_cache *cache;
 674
 675        cache = block_group_cache_tree_search(info, bytenr, 1);
 676
 677        return cache;
 678}
 679
 680static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 681                                                  u64 flags)
 682{
 683        struct list_head *head = &info->space_info;
 684        struct btrfs_space_info *found;
 685
 686        flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
 687
 688        rcu_read_lock();
 689        list_for_each_entry_rcu(found, head, list) {
 690                if (found->flags & flags) {
 691                        rcu_read_unlock();
 692                        return found;
 693                }
 694        }
 695        rcu_read_unlock();
 696        return NULL;
 697}
 698
 699/*
 700 * after adding space to the filesystem, we need to clear the full flags
 701 * on all the space infos.
 702 */
 703void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
 704{
 705        struct list_head *head = &info->space_info;
 706        struct btrfs_space_info *found;
 707
 708        rcu_read_lock();
 709        list_for_each_entry_rcu(found, head, list)
 710                found->full = 0;
 711        rcu_read_unlock();
 712}
 713
 714/* simple helper to search for an existing data extent at a given offset */
 715int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
 716{
 717        int ret;
 718        struct btrfs_key key;
 719        struct btrfs_path *path;
 720
 721        path = btrfs_alloc_path();
 722        if (!path)
 723                return -ENOMEM;
 724
 725        key.objectid = start;
 726        key.offset = len;
 727        key.type = BTRFS_EXTENT_ITEM_KEY;
 728        ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 729                                0, 0);
 730        btrfs_free_path(path);
 731        return ret;
 732}
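
/*
 * Editor's note: as with btrfs_search_slot(), the return value above is
 * 0 if an extent item exists at (start, len), greater than 0 if it does
 * not, and negative on error.
 */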
 733
 734/*
 735 * helper function to lookup reference count and flags of a tree block.
 736 *
 737 * the head node for delayed ref is used to store the sum of all the
 738 * reference count modifications queued up in the rbtree. the head
 739 * node may also store the extent flags to set. This way you can check
 740 * to see what the reference count and extent flags would be if all of
  741 * the delayed refs were processed, without actually running them.
 742 */
 743int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 744                             struct btrfs_root *root, u64 bytenr,
 745                             u64 offset, int metadata, u64 *refs, u64 *flags)
 746{
 747        struct btrfs_delayed_ref_head *head;
 748        struct btrfs_delayed_ref_root *delayed_refs;
 749        struct btrfs_path *path;
 750        struct btrfs_extent_item *ei;
 751        struct extent_buffer *leaf;
 752        struct btrfs_key key;
 753        u32 item_size;
 754        u64 num_refs;
 755        u64 extent_flags;
 756        int ret;
 757
 758        /*
 759         * If we don't have skinny metadata, don't bother doing anything
 760         * different
 761         */
 762        if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
 763                offset = root->nodesize;
 764                metadata = 0;
 765        }
 766
 767        path = btrfs_alloc_path();
 768        if (!path)
 769                return -ENOMEM;
 770
 771        if (!trans) {
 772                path->skip_locking = 1;
 773                path->search_commit_root = 1;
 774        }
 775
 776search_again:
 777        key.objectid = bytenr;
 778        key.offset = offset;
 779        if (metadata)
 780                key.type = BTRFS_METADATA_ITEM_KEY;
 781        else
 782                key.type = BTRFS_EXTENT_ITEM_KEY;
 783
 784        ret = btrfs_search_slot(trans, root->fs_info->extent_root,
 785                                &key, path, 0, 0);
 786        if (ret < 0)
 787                goto out_free;
 788
 789        if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
 790                if (path->slots[0]) {
 791                        path->slots[0]--;
 792                        btrfs_item_key_to_cpu(path->nodes[0], &key,
 793                                              path->slots[0]);
 794                        if (key.objectid == bytenr &&
 795                            key.type == BTRFS_EXTENT_ITEM_KEY &&
 796                            key.offset == root->nodesize)
 797                                ret = 0;
 798                }
 799        }
 800
 801        if (ret == 0) {
 802                leaf = path->nodes[0];
 803                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 804                if (item_size >= sizeof(*ei)) {
 805                        ei = btrfs_item_ptr(leaf, path->slots[0],
 806                                            struct btrfs_extent_item);
 807                        num_refs = btrfs_extent_refs(leaf, ei);
 808                        extent_flags = btrfs_extent_flags(leaf, ei);
 809                } else {
 810#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 811                        struct btrfs_extent_item_v0 *ei0;
 812                        BUG_ON(item_size != sizeof(*ei0));
 813                        ei0 = btrfs_item_ptr(leaf, path->slots[0],
 814                                             struct btrfs_extent_item_v0);
 815                        num_refs = btrfs_extent_refs_v0(leaf, ei0);
 816                        /* FIXME: this isn't correct for data */
 817                        extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
 818#else
 819                        BUG();
 820#endif
 821                }
 822                BUG_ON(num_refs == 0);
 823        } else {
 824                num_refs = 0;
 825                extent_flags = 0;
 826                ret = 0;
 827        }
 828
 829        if (!trans)
 830                goto out;
 831
 832        delayed_refs = &trans->transaction->delayed_refs;
 833        spin_lock(&delayed_refs->lock);
 834        head = btrfs_find_delayed_ref_head(trans, bytenr);
 835        if (head) {
 836                if (!mutex_trylock(&head->mutex)) {
 837                        atomic_inc(&head->node.refs);
 838                        spin_unlock(&delayed_refs->lock);
 839
 840                        btrfs_release_path(path);
 841
 842                        /*
 843                         * Mutex was contended, block until it's released and try
 844                         * again
 845                         */
 846                        mutex_lock(&head->mutex);
 847                        mutex_unlock(&head->mutex);
 848                        btrfs_put_delayed_ref(&head->node);
 849                        goto search_again;
 850                }
 851                spin_lock(&head->lock);
 852                if (head->extent_op && head->extent_op->update_flags)
 853                        extent_flags |= head->extent_op->flags_to_set;
 854                else
 855                        BUG_ON(num_refs == 0);
 856
 857                num_refs += head->node.ref_mod;
 858                spin_unlock(&head->lock);
 859                mutex_unlock(&head->mutex);
 860        }
 861        spin_unlock(&delayed_refs->lock);
 862out:
 863        WARN_ON(num_refs == 0);
 864        if (refs)
 865                *refs = num_refs;
 866        if (flags)
 867                *flags = extent_flags;
 868out_free:
 869        btrfs_free_path(path);
 870        return ret;
 871}
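
/*
 * Editor's example (hedged sketch): querying the effective reference
 * count and flags of a tree block, delayed refs included, the way the
 * real callers do.
 */
static inline int example_tree_block_refs(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct extent_buffer *buf,
                                          u64 *refs, u64 *flags)
{
        /* For metadata, offset carries the block's level (skinny keys). */
        return btrfs_lookup_extent_info(trans, root, buf->start,
                                        btrfs_header_level(buf), 1,
                                        refs, flags);
}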
 872
 873/*
 874 * Back reference rules.  Back refs have three main goals:
 875 *
  876 * 1) Differentiate between all holders of references to an extent so that
 877 *    when a reference is dropped we can make sure it was a valid reference
 878 *    before freeing the extent.
 879 *
 880 * 2) Provide enough information to quickly find the holders of an extent
 881 *    if we notice a given block is corrupted or bad.
 882 *
 883 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 884 *    maintenance.  This is actually the same as #2, but with a slightly
 885 *    different use case.
 886 *
  887 * There are two kinds of back refs. Implicit back refs are optimized
  888 * for pointers in non-shared tree blocks. For a given pointer in a block,
  889 * back refs of this kind provide information about the block's owner tree
  890 * and the pointer's key. This information allows us to find the block by
  891 * b-tree searching. Full back refs are for pointers in tree blocks not
  892 * referenced by their owner trees. The location of the tree block is
  893 * recorded in the back refs. Full back refs are actually generic and can
  894 * be used in all cases where implicit back refs are used. Their major
  895 * shortcoming is overhead: every time a tree block gets COWed, we have
  896 * to update the back ref entries for all pointers in it.
 897 *
 898 * For a newly allocated tree block, we use implicit back refs for
 899 * pointers in it. This means most tree related operations only involve
  900 * implicit back refs. For a tree block created in an old transaction,
  901 * the only way to drop a reference to it is to COW it. So we can detect
  902 * the event that a tree block loses its owner tree's reference and do the
 903 * back refs conversion.
 904 *
 905 * When a tree block is COW'd through a tree, there are four cases:
 906 *
 907 * The reference count of the block is one and the tree is the block's
 908 * owner tree. Nothing to do in this case.
 909 *
 910 * The reference count of the block is one and the tree is not the
  911 * block's owner tree. In this case, full back refs are used for pointers
  912 * in the block. Remove these full back refs and add implicit back refs
  913 * for every pointer in the new block.
 914 *
 915 * The reference count of the block is greater than one and the tree is
  916 * the block's owner tree. In this case, implicit back refs are used for
  917 * pointers in the block. Add full back refs for every pointer in the
  918 * block and increase the lower level extents' reference counts. The
  919 * original implicit back refs are inherited by the new block.
 920 *
 921 * The reference count of the block is greater than one and the tree is
 922 * not the block's owner tree. Add implicit back refs for every pointer in
  923 * the new block and increase the lower level extents' reference counts.
 924 *
  925 * Back reference key composition:
 926 *
 927 * The key objectid corresponds to the first byte in the extent,
 928 * The key type is used to differentiate between types of back refs.
 929 * There are different meanings of the key offset for different types
 930 * of back refs.
 931 *
 932 * File extents can be referenced by:
 933 *
 934 * - multiple snapshots, subvolumes, or different generations in one subvol
 935 * - different files inside a single subvolume
 936 * - different offsets inside a file (bookend extents in file.c)
 937 *
 938 * The extent ref structure for the implicit back refs has fields for:
 939 *
 940 * - Objectid of the subvolume root
 941 * - objectid of the file holding the reference
 942 * - original offset in the file
 943 * - how many bookend extents
 944 *
  945 * The key offset for the implicit back refs is a hash of the first
 946 * three fields.
 947 *
  948 * The extent ref structure for the full back refs has a field for:
  949 *
  950 * - number of pointers in the tree leaf
  951 *
  952 * The key offset for the full back refs is the first byte of
  953 * the parent tree leaf.
 954 *
  955 * When a file extent is allocated, the implicit back refs are used
  956 * and the fields are filled in:
 957 *
 958 *     (root_key.objectid, inode objectid, offset in file, 1)
 959 *
  960 * When a file extent is removed during file truncation, we find the
 961 * corresponding implicit back refs and check the following fields:
 962 *
 963 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 964 *
 965 * Btree extents can be referenced by:
 966 *
 967 * - Different subvolumes
 968 *
 969 * Both the implicit back refs and the full back refs for tree blocks
  970 * consist only of a key. The key offset for the implicit back refs is
  971 * the objectid of the block's owner tree. The key offset for the full
  972 * back refs is the first byte of the parent block.
  973 *
  974 * When implicit back refs are used, information about the lowest key
  975 * and the level of the tree block is required. This information is
  976 * stored in the tree block info structure.
 977 */
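
/*
 * Editor's example (hedged sketch): how the two kinds of data back ref
 * keys described above are composed.  This mirrors what
 * lookup_extent_data_ref() does; hash_extent_data_ref() is defined
 * further below.
 */
static inline void example_data_backref_keys(u64 bytenr, u64 parent,
                                             u64 root_objectid, u64 owner,
                                             u64 offset,
                                             struct btrfs_key *implicit,
                                             struct btrfs_key *shared)
{
        /* Implicit back ref: offset is a hash of (root, inode, offset). */
        implicit->objectid = bytenr;
        implicit->type = BTRFS_EXTENT_DATA_REF_KEY;
        implicit->offset = hash_extent_data_ref(root_objectid, owner, offset);

        /* Full (shared) back ref: offset is the parent leaf's bytenr. */
        shared->objectid = bytenr;
        shared->type = BTRFS_SHARED_DATA_REF_KEY;
        shared->offset = parent;
}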
 978
 979#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 980static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
 981                                  struct btrfs_root *root,
 982                                  struct btrfs_path *path,
 983                                  u64 owner, u32 extra_size)
 984{
 985        struct btrfs_extent_item *item;
 986        struct btrfs_extent_item_v0 *ei0;
 987        struct btrfs_extent_ref_v0 *ref0;
 988        struct btrfs_tree_block_info *bi;
 989        struct extent_buffer *leaf;
 990        struct btrfs_key key;
 991        struct btrfs_key found_key;
 992        u32 new_size = sizeof(*item);
 993        u64 refs;
 994        int ret;
 995
 996        leaf = path->nodes[0];
 997        BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
 998
 999        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1000        ei0 = btrfs_item_ptr(leaf, path->slots[0],
1001                             struct btrfs_extent_item_v0);
1002        refs = btrfs_extent_refs_v0(leaf, ei0);
1003
1004        if (owner == (u64)-1) {
1005                while (1) {
1006                        if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1007                                ret = btrfs_next_leaf(root, path);
1008                                if (ret < 0)
1009                                        return ret;
1010                                BUG_ON(ret > 0); /* Corruption */
1011                                leaf = path->nodes[0];
1012                        }
1013                        btrfs_item_key_to_cpu(leaf, &found_key,
1014                                              path->slots[0]);
1015                        BUG_ON(key.objectid != found_key.objectid);
1016                        if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
1017                                path->slots[0]++;
1018                                continue;
1019                        }
1020                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
1021                                              struct btrfs_extent_ref_v0);
1022                        owner = btrfs_ref_objectid_v0(leaf, ref0);
1023                        break;
1024                }
1025        }
1026        btrfs_release_path(path);
1027
1028        if (owner < BTRFS_FIRST_FREE_OBJECTID)
1029                new_size += sizeof(*bi);
1030
1031        new_size -= sizeof(*ei0);
1032        ret = btrfs_search_slot(trans, root, &key, path,
1033                                new_size + extra_size, 1);
1034        if (ret < 0)
1035                return ret;
1036        BUG_ON(ret); /* Corruption */
1037
1038        btrfs_extend_item(root, path, new_size);
1039
1040        leaf = path->nodes[0];
1041        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1042        btrfs_set_extent_refs(leaf, item, refs);
1043        /* FIXME: get real generation */
1044        btrfs_set_extent_generation(leaf, item, 0);
1045        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1046                btrfs_set_extent_flags(leaf, item,
1047                                       BTRFS_EXTENT_FLAG_TREE_BLOCK |
1048                                       BTRFS_BLOCK_FLAG_FULL_BACKREF);
1049                bi = (struct btrfs_tree_block_info *)(item + 1);
1050                /* FIXME: get first key of the block */
1051                memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
1052                btrfs_set_tree_block_level(leaf, bi, (int)owner);
1053        } else {
1054                btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1055        }
1056        btrfs_mark_buffer_dirty(leaf);
1057        return 0;
1058}
1059#endif
1060
1061static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1062{
1063        u32 high_crc = ~(u32)0;
1064        u32 low_crc = ~(u32)0;
1065        __le64 lenum;
1066
1067        lenum = cpu_to_le64(root_objectid);
1068        high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
1069        lenum = cpu_to_le64(owner);
1070        low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1071        lenum = cpu_to_le64(offset);
1072        low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
1073
1074        return ((u64)high_crc << 31) ^ (u64)low_crc;
1075}
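
/*
 * Editor's note: the high CRC is shifted by 31 bits, not 32, so the two
 * halves overlap in one bit.  This looks like an off-by-one, but the
 * result is part of the on-disk key format for extent data refs, so the
 * shift must be preserved for compatibility.
 */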
1076
1077static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1078                                     struct btrfs_extent_data_ref *ref)
1079{
1080        return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1081                                    btrfs_extent_data_ref_objectid(leaf, ref),
1082                                    btrfs_extent_data_ref_offset(leaf, ref));
1083}
1084
1085static int match_extent_data_ref(struct extent_buffer *leaf,
1086                                 struct btrfs_extent_data_ref *ref,
1087                                 u64 root_objectid, u64 owner, u64 offset)
1088{
1089        if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1090            btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1091            btrfs_extent_data_ref_offset(leaf, ref) != offset)
1092                return 0;
1093        return 1;
1094}
1095
1096static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1097                                           struct btrfs_root *root,
1098                                           struct btrfs_path *path,
1099                                           u64 bytenr, u64 parent,
1100                                           u64 root_objectid,
1101                                           u64 owner, u64 offset)
1102{
1103        struct btrfs_key key;
1104        struct btrfs_extent_data_ref *ref;
1105        struct extent_buffer *leaf;
1106        u32 nritems;
1107        int ret;
1108        int recow;
1109        int err = -ENOENT;
1110
1111        key.objectid = bytenr;
1112        if (parent) {
1113                key.type = BTRFS_SHARED_DATA_REF_KEY;
1114                key.offset = parent;
1115        } else {
1116                key.type = BTRFS_EXTENT_DATA_REF_KEY;
1117                key.offset = hash_extent_data_ref(root_objectid,
1118                                                  owner, offset);
1119        }
1120again:
1121        recow = 0;
1122        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1123        if (ret < 0) {
1124                err = ret;
1125                goto fail;
1126        }
1127
1128        if (parent) {
1129                if (!ret)
1130                        return 0;
1131#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1132                key.type = BTRFS_EXTENT_REF_V0_KEY;
1133                btrfs_release_path(path);
1134                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1135                if (ret < 0) {
1136                        err = ret;
1137                        goto fail;
1138                }
1139                if (!ret)
1140                        return 0;
1141#endif
1142                goto fail;
1143        }
1144
1145        leaf = path->nodes[0];
1146        nritems = btrfs_header_nritems(leaf);
1147        while (1) {
1148                if (path->slots[0] >= nritems) {
1149                        ret = btrfs_next_leaf(root, path);
1150                        if (ret < 0)
1151                                err = ret;
1152                        if (ret)
1153                                goto fail;
1154
1155                        leaf = path->nodes[0];
1156                        nritems = btrfs_header_nritems(leaf);
1157                        recow = 1;
1158                }
1159
1160                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1161                if (key.objectid != bytenr ||
1162                    key.type != BTRFS_EXTENT_DATA_REF_KEY)
1163                        goto fail;
1164
1165                ref = btrfs_item_ptr(leaf, path->slots[0],
1166                                     struct btrfs_extent_data_ref);
1167
1168                if (match_extent_data_ref(leaf, ref, root_objectid,
1169                                          owner, offset)) {
1170                        if (recow) {
1171                                btrfs_release_path(path);
1172                                goto again;
1173                        }
1174                        err = 0;
1175                        break;
1176                }
1177                path->slots[0]++;
1178        }
1179fail:
1180        return err;
1181}
1182
1183static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1184                                           struct btrfs_root *root,
1185                                           struct btrfs_path *path,
1186                                           u64 bytenr, u64 parent,
1187                                           u64 root_objectid, u64 owner,
1188                                           u64 offset, int refs_to_add)
1189{
1190        struct btrfs_key key;
1191        struct extent_buffer *leaf;
1192        u32 size;
1193        u32 num_refs;
1194        int ret;
1195
1196        key.objectid = bytenr;
1197        if (parent) {
1198                key.type = BTRFS_SHARED_DATA_REF_KEY;
1199                key.offset = parent;
1200                size = sizeof(struct btrfs_shared_data_ref);
1201        } else {
1202                key.type = BTRFS_EXTENT_DATA_REF_KEY;
1203                key.offset = hash_extent_data_ref(root_objectid,
1204                                                  owner, offset);
1205                size = sizeof(struct btrfs_extent_data_ref);
1206        }
1207
1208        ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1209        if (ret && ret != -EEXIST)
1210                goto fail;
1211
1212        leaf = path->nodes[0];
1213        if (parent) {
1214                struct btrfs_shared_data_ref *ref;
1215                ref = btrfs_item_ptr(leaf, path->slots[0],
1216                                     struct btrfs_shared_data_ref);
1217                if (ret == 0) {
1218                        btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1219                } else {
1220                        num_refs = btrfs_shared_data_ref_count(leaf, ref);
1221                        num_refs += refs_to_add;
1222                        btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1223                }
1224        } else {
1225                struct btrfs_extent_data_ref *ref;
1226                while (ret == -EEXIST) {
1227                        ref = btrfs_item_ptr(leaf, path->slots[0],
1228                                             struct btrfs_extent_data_ref);
1229                        if (match_extent_data_ref(leaf, ref, root_objectid,
1230                                                  owner, offset))
1231                                break;
1232                        btrfs_release_path(path);
1233                        key.offset++;
1234                        ret = btrfs_insert_empty_item(trans, root, path, &key,
1235                                                      size);
1236                        if (ret && ret != -EEXIST)
1237                                goto fail;
1238
1239                        leaf = path->nodes[0];
1240                }
1241                ref = btrfs_item_ptr(leaf, path->slots[0],
1242                                     struct btrfs_extent_data_ref);
1243                if (ret == 0) {
1244                        btrfs_set_extent_data_ref_root(leaf, ref,
1245                                                       root_objectid);
1246                        btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1247                        btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1248                        btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1249                } else {
1250                        num_refs = btrfs_extent_data_ref_count(leaf, ref);
1251                        num_refs += refs_to_add;
1252                        btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1253                }
1254        }
1255        btrfs_mark_buffer_dirty(leaf);
1256        ret = 0;
1257fail:
1258        btrfs_release_path(path);
1259        return ret;
1260}
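
/*
 * Editor's note: because key.offset is only a hash of (root, objectid,
 * offset), different refs can collide.  The -EEXIST loop above resolves
 * collisions by incrementing key.offset until it finds either the
 * matching ref item or a free offset to insert at.
 */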
1261
1262static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1263                                           struct btrfs_root *root,
1264                                           struct btrfs_path *path,
1265                                           int refs_to_drop, int *last_ref)
1266{
1267        struct btrfs_key key;
1268        struct btrfs_extent_data_ref *ref1 = NULL;
1269        struct btrfs_shared_data_ref *ref2 = NULL;
1270        struct extent_buffer *leaf;
1271        u32 num_refs = 0;
1272        int ret = 0;
1273
1274        leaf = path->nodes[0];
1275        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1276
1277        if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1278                ref1 = btrfs_item_ptr(leaf, path->slots[0],
1279                                      struct btrfs_extent_data_ref);
1280                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1281        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1282                ref2 = btrfs_item_ptr(leaf, path->slots[0],
1283                                      struct btrfs_shared_data_ref);
1284                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1285#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1286        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1287                struct btrfs_extent_ref_v0 *ref0;
1288                ref0 = btrfs_item_ptr(leaf, path->slots[0],
1289                                      struct btrfs_extent_ref_v0);
1290                num_refs = btrfs_ref_count_v0(leaf, ref0);
1291#endif
1292        } else {
1293                BUG();
1294        }
1295
1296        BUG_ON(num_refs < refs_to_drop);
1297        num_refs -= refs_to_drop;
1298
1299        if (num_refs == 0) {
1300                ret = btrfs_del_item(trans, root, path);
1301                *last_ref = 1;
1302        } else {
1303                if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1304                        btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1305                else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1306                        btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1307#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1308                else {
1309                        struct btrfs_extent_ref_v0 *ref0;
1310                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
1311                                        struct btrfs_extent_ref_v0);
1312                        btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1313                }
1314#endif
1315                btrfs_mark_buffer_dirty(leaf);
1316        }
1317        return ret;
1318}
1319
1320static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1321                                          struct btrfs_path *path,
1322                                          struct btrfs_extent_inline_ref *iref)
1323{
1324        struct btrfs_key key;
1325        struct extent_buffer *leaf;
1326        struct btrfs_extent_data_ref *ref1;
1327        struct btrfs_shared_data_ref *ref2;
1328        u32 num_refs = 0;
1329
1330        leaf = path->nodes[0];
1331        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1332        if (iref) {
1333                if (btrfs_extent_inline_ref_type(leaf, iref) ==
1334                    BTRFS_EXTENT_DATA_REF_KEY) {
1335                        ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1336                        num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1337                } else {
1338                        ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1339                        num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1340                }
1341        } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1342                ref1 = btrfs_item_ptr(leaf, path->slots[0],
1343                                      struct btrfs_extent_data_ref);
1344                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1345        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1346                ref2 = btrfs_item_ptr(leaf, path->slots[0],
1347                                      struct btrfs_shared_data_ref);
1348                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1349#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1350        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1351                struct btrfs_extent_ref_v0 *ref0;
1352                ref0 = btrfs_item_ptr(leaf, path->slots[0],
1353                                      struct btrfs_extent_ref_v0);
1354                num_refs = btrfs_ref_count_v0(leaf, ref0);
1355#endif
1356        } else {
1357                WARN_ON(1);
1358        }
1359        return num_refs;
1360}
1361
1362static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1363                                          struct btrfs_root *root,
1364                                          struct btrfs_path *path,
1365                                          u64 bytenr, u64 parent,
1366                                          u64 root_objectid)
1367{
1368        struct btrfs_key key;
1369        int ret;
1370
1371        key.objectid = bytenr;
1372        if (parent) {
1373                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1374                key.offset = parent;
1375        } else {
1376                key.type = BTRFS_TREE_BLOCK_REF_KEY;
1377                key.offset = root_objectid;
1378        }
1379
1380        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1381        if (ret > 0)
1382                ret = -ENOENT;
1383#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1384        if (ret == -ENOENT && parent) {
1385                btrfs_release_path(path);
1386                key.type = BTRFS_EXTENT_REF_V0_KEY;
1387                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1388                if (ret > 0)
1389                        ret = -ENOENT;
1390        }
1391#endif
1392        return ret;
1393}
1394
1395static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1396                                          struct btrfs_root *root,
1397                                          struct btrfs_path *path,
1398                                          u64 bytenr, u64 parent,
1399                                          u64 root_objectid)
1400{
1401        struct btrfs_key key;
1402        int ret;
1403
1404        key.objectid = bytenr;
1405        if (parent) {
1406                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1407                key.offset = parent;
1408        } else {
1409                key.type = BTRFS_TREE_BLOCK_REF_KEY;
1410                key.offset = root_objectid;
1411        }
1412
1413        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1414        btrfs_release_path(path);
1415        return ret;
1416}
1417
1418static inline int extent_ref_type(u64 parent, u64 owner)
1419{
1420        int type;
1421        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1422                if (parent > 0)
1423                        type = BTRFS_SHARED_BLOCK_REF_KEY;
1424                else
1425                        type = BTRFS_TREE_BLOCK_REF_KEY;
1426        } else {
1427                if (parent > 0)
1428                        type = BTRFS_SHARED_DATA_REF_KEY;
1429                else
1430                        type = BTRFS_EXTENT_DATA_REF_KEY;
1431        }
1432        return type;
1433}
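/*
 * In table form, the mapping above (owners below BTRFS_FIRST_FREE_OBJECTID
 * indicate a tree block, everything else is file data):
 *
 *   metadata, parent set  -> BTRFS_SHARED_BLOCK_REF_KEY
 *   metadata, no parent   -> BTRFS_TREE_BLOCK_REF_KEY
 *   data, parent set      -> BTRFS_SHARED_DATA_REF_KEY
 *   data, no parent       -> BTRFS_EXTENT_DATA_REF_KEY
 */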
1434
1435static int find_next_key(struct btrfs_path *path, int level,
1436                         struct btrfs_key *key)
1437{
1438        /* walk up from @level to find the next key; returns 1 if there is none */
1439        for (; level < BTRFS_MAX_LEVEL; level++) {
1440                if (!path->nodes[level])
1441                        break;
1442                if (path->slots[level] + 1 >=
1443                    btrfs_header_nritems(path->nodes[level]))
1444                        continue;
1445                if (level == 0)
1446                        btrfs_item_key_to_cpu(path->nodes[level], key,
1447                                              path->slots[level] + 1);
1448                else
1449                        btrfs_node_key_to_cpu(path->nodes[level], key,
1450                                              path->slots[level] + 1);
1451                return 0;
1452        }
1453        return 1;
1454}
1455
1456/*
1457 * look for inline back ref. if back ref is found, *ref_ret is set
1458 * to the address of inline back ref, and 0 is returned.
1459 *
1460 * if back ref isn't found, *ref_ret is set to the address where it
1461 * should be inserted, and -ENOENT is returned.
1462 *
1463 * if insert is true and there are too many inline back refs, the path
1464 * points to the extent item, and -EAGAIN is returned.
1465 *
1466 * NOTE: inline back refs are ordered in the same way that back ref
1467 *       items in the tree are ordered.
1468 */
1469static noinline_for_stack
1470int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1471                                 struct btrfs_root *root,
1472                                 struct btrfs_path *path,
1473                                 struct btrfs_extent_inline_ref **ref_ret,
1474                                 u64 bytenr, u64 num_bytes,
1475                                 u64 parent, u64 root_objectid,
1476                                 u64 owner, u64 offset, int insert)
1477{
1478        struct btrfs_key key;
1479        struct extent_buffer *leaf;
1480        struct btrfs_extent_item *ei;
1481        struct btrfs_extent_inline_ref *iref;
1482        u64 flags;
1483        u64 item_size;
1484        unsigned long ptr;
1485        unsigned long end;
1486        int extra_size;
1487        int type;
1488        int want;
1489        int ret;
1490        int err = 0;
1491        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
1492                                                 SKINNY_METADATA);
1493
1494        key.objectid = bytenr;
1495        key.type = BTRFS_EXTENT_ITEM_KEY;
1496        key.offset = num_bytes;
1497
1498        want = extent_ref_type(parent, owner);
1499        if (insert) {
1500                extra_size = btrfs_extent_inline_ref_size(want);
1501                path->keep_locks = 1;
1502        } else
1503                extra_size = -1;
1504
1505        /*
1506         * For tree blocks, owner is the level of the block, and with skinny
1507         * metadata the key offset stores that level instead of the extent size.
1508         */
1509        if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1510                key.type = BTRFS_METADATA_ITEM_KEY;
1511                key.offset = owner;
1512        }
1513
1514again:
1515        ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1516        if (ret < 0) {
1517                err = ret;
1518                goto out;
1519        }
1520
1521        /*
1522         * We may be a newly converted file system which still has the old fat
1523         * extent entries for metadata, so try and see if we have one of those.
1524         */
1525        if (ret > 0 && skinny_metadata) {
1526                skinny_metadata = false;
1527                if (path->slots[0]) {
1528                        path->slots[0]--;
1529                        btrfs_item_key_to_cpu(path->nodes[0], &key,
1530                                              path->slots[0]);
1531                        if (key.objectid == bytenr &&
1532                            key.type == BTRFS_EXTENT_ITEM_KEY &&
1533                            key.offset == num_bytes)
1534                                ret = 0;
1535                }
1536                if (ret) {
1537                        key.objectid = bytenr;
1538                        key.type = BTRFS_EXTENT_ITEM_KEY;
1539                        key.offset = num_bytes;
1540                        btrfs_release_path(path);
1541                        goto again;
1542                }
1543        }
1544
1545        if (ret && !insert) {
1546                err = -ENOENT;
1547                goto out;
1548        } else if (WARN_ON(ret)) {
1549                err = -EIO;
1550                goto out;
1551        }
1552
1553        leaf = path->nodes[0];
1554        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1555#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1556        if (item_size < sizeof(*ei)) {
1557                if (!insert) {
1558                        err = -ENOENT;
1559                        goto out;
1560                }
1561                ret = convert_extent_item_v0(trans, root, path, owner,
1562                                             extra_size);
1563                if (ret < 0) {
1564                        err = ret;
1565                        goto out;
1566                }
1567                leaf = path->nodes[0];
1568                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1569        }
1570#endif
1571        BUG_ON(item_size < sizeof(*ei));
1572
1573        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1574        flags = btrfs_extent_flags(leaf, ei);
1575
1576        ptr = (unsigned long)(ei + 1);
1577        end = (unsigned long)ei + item_size;
1578
1579        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1580                ptr += sizeof(struct btrfs_tree_block_info);
1581                BUG_ON(ptr > end);
1582        }
1583
1584        err = -ENOENT;
1585        while (1) {
1586                if (ptr >= end) {
1587                        WARN_ON(ptr > end);
1588                        break;
1589                }
1590                iref = (struct btrfs_extent_inline_ref *)ptr;
1591                type = btrfs_extent_inline_ref_type(leaf, iref);
1592                if (want < type)
1593                        break;
1594                if (want > type) {
1595                        ptr += btrfs_extent_inline_ref_size(type);
1596                        continue;
1597                }
1598
1599                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1600                        struct btrfs_extent_data_ref *dref;
1601                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1602                        if (match_extent_data_ref(leaf, dref, root_objectid,
1603                                                  owner, offset)) {
1604                                err = 0;
1605                                break;
1606                        }
1607                        if (hash_extent_data_ref_item(leaf, dref) <
1608                            hash_extent_data_ref(root_objectid, owner, offset))
1609                                break;
1610                } else {
1611                        u64 ref_offset;
1612                        ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1613                        if (parent > 0) {
1614                                if (parent == ref_offset) {
1615                                        err = 0;
1616                                        break;
1617                                }
1618                                if (ref_offset < parent)
1619                                        break;
1620                        } else {
1621                                if (root_objectid == ref_offset) {
1622                                        err = 0;
1623                                        break;
1624                                }
1625                                if (ref_offset < root_objectid)
1626                                        break;
1627                        }
1628                }
1629                ptr += btrfs_extent_inline_ref_size(type);
1630        }
1631        if (err == -ENOENT && insert) {
1632                if (item_size + extra_size >=
1633                    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1634                        err = -EAGAIN;
1635                        goto out;
1636                }
1637                /*
1638                 * To add new inline back ref, we have to make sure
1639                 * there is no corresponding back ref item.
1640                 * For simplicity, we just do not add new inline back
1641                 * ref if there is any kind of item for this block
1642                 */
1643                if (find_next_key(path, 0, &key) == 0 &&
1644                    key.objectid == bytenr &&
1645                    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1646                        err = -EAGAIN;
1647                        goto out;
1648                }
1649        }
1650        *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1651out:
1652        if (insert) {
1653                path->keep_locks = 0;
1654                btrfs_unlock_up_safe(path, 1);
1655        }
1656        return err;
1657}
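/*
 * Sketch of the calling convention above (insert_inline_extent_backref()
 * below is the canonical user):
 *
 *   ret = lookup_inline_extent_backref(..., &iref, ..., insert = 1);
 *   switch (ret) {
 *   case 0:       modify the existing inline ref that iref points to
 *   case -ENOENT: create a new inline ref at the spot iref points to
 *   case -EAGAIN: no room inline, fall back to a keyed backref item
 *   }
 */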
1658
1659/*
1660 * helper to add new inline back ref
1661 */
1662static noinline_for_stack
1663void setup_inline_extent_backref(struct btrfs_root *root,
1664                                 struct btrfs_path *path,
1665                                 struct btrfs_extent_inline_ref *iref,
1666                                 u64 parent, u64 root_objectid,
1667                                 u64 owner, u64 offset, int refs_to_add,
1668                                 struct btrfs_delayed_extent_op *extent_op)
1669{
1670        struct extent_buffer *leaf;
1671        struct btrfs_extent_item *ei;
1672        unsigned long ptr;
1673        unsigned long end;
1674        unsigned long item_offset;
1675        u64 refs;
1676        int size;
1677        int type;
1678
1679        leaf = path->nodes[0];
1680        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1681        item_offset = (unsigned long)iref - (unsigned long)ei;
1682
1683        type = extent_ref_type(parent, owner);
1684        size = btrfs_extent_inline_ref_size(type);
1685
1686        btrfs_extend_item(root, path, size);
1687
1688        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1689        refs = btrfs_extent_refs(leaf, ei);
1690        refs += refs_to_add;
1691        btrfs_set_extent_refs(leaf, ei, refs);
1692        if (extent_op)
1693                __run_delayed_extent_op(extent_op, leaf, ei);
1694
1695        ptr = (unsigned long)ei + item_offset;
1696        end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1697        if (ptr < end - size)
1698                memmove_extent_buffer(leaf, ptr + size, ptr,
1699                                      end - size - ptr);
1700
1701        iref = (struct btrfs_extent_inline_ref *)ptr;
1702        btrfs_set_extent_inline_ref_type(leaf, iref, type);
1703        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1704                struct btrfs_extent_data_ref *dref;
1705                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1706                btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1707                btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1708                btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1709                btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1710        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1711                struct btrfs_shared_data_ref *sref;
1712                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1713                btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1714                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1715        } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1716                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1717        } else {
1718                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1719        }
1720        btrfs_mark_buffer_dirty(leaf);
1721}
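/*
 * The memmove above opens a hole for the new ref inside the extent
 * item that btrfs_extend_item() just grew.  Inserting at ptr:
 *
 *   before:  [ extent item | ref A | ref B ]
 *   after:   [ extent item | ref A | hole  | ref B ]
 *
 * and the hole is then filled in based on the ref type.
 */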
1722
1723static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1724                                 struct btrfs_root *root,
1725                                 struct btrfs_path *path,
1726                                 struct btrfs_extent_inline_ref **ref_ret,
1727                                 u64 bytenr, u64 num_bytes, u64 parent,
1728                                 u64 root_objectid, u64 owner, u64 offset)
1729{
1730        int ret;
1731
1732        ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1733                                           bytenr, num_bytes, parent,
1734                                           root_objectid, owner, offset, 0);
1735        if (ret != -ENOENT)
1736                return ret;
1737
1738        btrfs_release_path(path);
1739        *ref_ret = NULL;
1740
1741        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1742                ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1743                                            root_objectid);
1744        } else {
1745                ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1746                                             root_objectid, owner, offset);
1747        }
1748        return ret;
1749}
1750
1751/*
1752 * helper to update/remove inline back ref
1753 */
1754static noinline_for_stack
1755void update_inline_extent_backref(struct btrfs_root *root,
1756                                  struct btrfs_path *path,
1757                                  struct btrfs_extent_inline_ref *iref,
1758                                  int refs_to_mod,
1759                                  struct btrfs_delayed_extent_op *extent_op,
1760                                  int *last_ref)
1761{
1762        struct extent_buffer *leaf;
1763        struct btrfs_extent_item *ei;
1764        struct btrfs_extent_data_ref *dref = NULL;
1765        struct btrfs_shared_data_ref *sref = NULL;
1766        unsigned long ptr;
1767        unsigned long end;
1768        u32 item_size;
1769        int size;
1770        int type;
1771        u64 refs;
1772
1773        leaf = path->nodes[0];
1774        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1775        refs = btrfs_extent_refs(leaf, ei);
1776        WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1777        refs += refs_to_mod;
1778        btrfs_set_extent_refs(leaf, ei, refs);
1779        if (extent_op)
1780                __run_delayed_extent_op(extent_op, leaf, ei);
1781
1782        type = btrfs_extent_inline_ref_type(leaf, iref);
1783
1784        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1785                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1786                refs = btrfs_extent_data_ref_count(leaf, dref);
1787        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1788                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1789                refs = btrfs_shared_data_ref_count(leaf, sref);
1790        } else {
1791                refs = 1;
1792                BUG_ON(refs_to_mod != -1);
1793        }
1794
1795        BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1796        refs += refs_to_mod;
1797
1798        if (refs > 0) {
1799                if (type == BTRFS_EXTENT_DATA_REF_KEY)
1800                        btrfs_set_extent_data_ref_count(leaf, dref, refs);
1801                else
1802                        btrfs_set_shared_data_ref_count(leaf, sref, refs);
1803        } else {
1804                *last_ref = 1;
1805                size = btrfs_extent_inline_ref_size(type);
1806                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1807                ptr = (unsigned long)iref;
1808                end = (unsigned long)ei + item_size;
1809                if (ptr + size < end)
1810                        memmove_extent_buffer(leaf, ptr, ptr + size,
1811                                              end - ptr - size);
1812                item_size -= size;
1813                btrfs_truncate_item(root, path, item_size, 1);
1814        }
1815        btrfs_mark_buffer_dirty(leaf);
1816}
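/*
 * Removing an inline ref is the mirror image of the insertion in
 * setup_inline_extent_backref(): the refs following the victim are
 * shifted down over it and the item is truncated by one ref size:
 *
 *   before:  [ extent item | ref A | ref B | ref C ]
 *   after:   [ extent item | ref A | ref C ]
 */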
1817
1818static noinline_for_stack
1819int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1820                                 struct btrfs_root *root,
1821                                 struct btrfs_path *path,
1822                                 u64 bytenr, u64 num_bytes, u64 parent,
1823                                 u64 root_objectid, u64 owner,
1824                                 u64 offset, int refs_to_add,
1825                                 struct btrfs_delayed_extent_op *extent_op)
1826{
1827        struct btrfs_extent_inline_ref *iref;
1828        int ret;
1829
1830        ret = lookup_inline_extent_backref(trans, root, path, &iref,
1831                                           bytenr, num_bytes, parent,
1832                                           root_objectid, owner, offset, 1);
1833        if (ret == 0) {
1834                BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1835                update_inline_extent_backref(root, path, iref,
1836                                             refs_to_add, extent_op, NULL);
1837        } else if (ret == -ENOENT) {
1838                setup_inline_extent_backref(root, path, iref, parent,
1839                                            root_objectid, owner, offset,
1840                                            refs_to_add, extent_op);
1841                ret = 0;
1842        }
1843        return ret;
1844}
1845
1846static int insert_extent_backref(struct btrfs_trans_handle *trans,
1847                                 struct btrfs_root *root,
1848                                 struct btrfs_path *path,
1849                                 u64 bytenr, u64 parent, u64 root_objectid,
1850                                 u64 owner, u64 offset, int refs_to_add)
1851{
1852        int ret;
1853        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1854                BUG_ON(refs_to_add != 1);
1855                ret = insert_tree_block_ref(trans, root, path, bytenr,
1856                                            parent, root_objectid);
1857        } else {
1858                ret = insert_extent_data_ref(trans, root, path, bytenr,
1859                                             parent, root_objectid,
1860                                             owner, offset, refs_to_add);
1861        }
1862        return ret;
1863}
1864
1865static int remove_extent_backref(struct btrfs_trans_handle *trans,
1866                                 struct btrfs_root *root,
1867                                 struct btrfs_path *path,
1868                                 struct btrfs_extent_inline_ref *iref,
1869                                 int refs_to_drop, int is_data, int *last_ref)
1870{
1871        int ret = 0;
1872
1873        BUG_ON(!is_data && refs_to_drop != 1);
1874        if (iref) {
1875                update_inline_extent_backref(root, path, iref,
1876                                             -refs_to_drop, NULL, last_ref);
1877        } else if (is_data) {
1878                ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
1879                                             last_ref);
1880        } else {
1881                *last_ref = 1;
1882                ret = btrfs_del_item(trans, root, path);
1883        }
1884        return ret;
1885}
1886
1887static int btrfs_issue_discard(struct block_device *bdev,
1888                                u64 start, u64 len)
1889{
1890        return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1891}
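/*
 * blkdev_issue_discard() takes 512-byte sectors, hence the >> 9 above:
 * a 1MiB discard starting at byte 2MiB becomes sector 4096, count 2048.
 */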
1892
1893int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1894                         u64 num_bytes, u64 *actual_bytes)
1895{
1896        int ret;
1897        u64 discarded_bytes = 0;
1898        struct btrfs_bio *bbio = NULL;
1899
1900
1901        /* Tell the block device(s) that the sectors can be discarded */
1902        ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1903                              bytenr, &num_bytes, &bbio, 0);
1904        /* Error condition is -ENOMEM */
1905        if (!ret) {
1906                struct btrfs_bio_stripe *stripe = bbio->stripes;
1907                int i;
1908
1909
1910                for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1911                        if (!stripe->dev->can_discard)
1912                                continue;
1913
1914                        ret = btrfs_issue_discard(stripe->dev->bdev,
1915                                                  stripe->physical,
1916                                                  stripe->length);
1917                        if (!ret)
1918                                discarded_bytes += stripe->length;
1919                        else if (ret != -EOPNOTSUPP)
1920                                break; /* logic errors or -ENOMEM; -EIO shouldn't happen here */
1921
1922                        /*
1923                         * Just in case we get back EOPNOTSUPP for some reason,
1924                         * just ignore the return value so we don't screw up
1925                         * people calling discard_extent.
1926                         */
1927                        ret = 0;
1928                }
1929                btrfs_put_bbio(bbio);
1930        }
1931
1932        if (actual_bytes)
1933                *actual_bytes = discarded_bytes;
1934
1935
1936        if (ret == -EOPNOTSUPP)
1937                ret = 0;
1938        return ret;
1939}
1940
1941/* Can return -ENOMEM */
1942int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1943                         struct btrfs_root *root,
1944                         u64 bytenr, u64 num_bytes, u64 parent,
1945                         u64 root_objectid, u64 owner, u64 offset,
1946                         int no_quota)
1947{
1948        int ret;
1949        struct btrfs_fs_info *fs_info = root->fs_info;
1950
1951        BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1952               root_objectid == BTRFS_TREE_LOG_OBJECTID);
1953
1954        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1955                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1956                                        num_bytes,
1957                                        parent, root_objectid, (int)owner,
1958                                        BTRFS_ADD_DELAYED_REF, NULL, no_quota);
1959        } else {
1960                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1961                                        num_bytes,
1962                                        parent, root_objectid, owner, offset,
1963                                        BTRFS_ADD_DELAYED_REF, NULL, no_quota);
1964        }
1965        return ret;
1966}
1967
1968static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1969                                  struct btrfs_root *root,
1970                                  u64 bytenr, u64 num_bytes,
1971                                  u64 parent, u64 root_objectid,
1972                                  u64 owner, u64 offset, int refs_to_add,
1973                                  int no_quota,
1974                                  struct btrfs_delayed_extent_op *extent_op)
1975{
1976        struct btrfs_fs_info *fs_info = root->fs_info;
1977        struct btrfs_path *path;
1978        struct extent_buffer *leaf;
1979        struct btrfs_extent_item *item;
1980        struct btrfs_key key;
1981        u64 refs;
1982        int ret;
1983        enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;
1984
1985        path = btrfs_alloc_path();
1986        if (!path)
1987                return -ENOMEM;
1988
1989        if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
1990                no_quota = 1;
1991
1992        path->reada = 1;
1993        path->leave_spinning = 1;
1994        /* this will set up the path even if it fails to insert the back ref */
1995        ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
1996                                           bytenr, num_bytes, parent,
1997                                           root_objectid, owner, offset,
1998                                           refs_to_add, extent_op);
1999        if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))
2000                goto out;
2001        /*
2002         * Ok we were able to insert an inline extent backref and it appears to
2003         * be a new reference, so deal with the qgroup accounting.
2004         */
2005        if (!ret && !no_quota) {
2006                ASSERT(root->fs_info->quota_enabled);
2007                leaf = path->nodes[0];
2008                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2009                item = btrfs_item_ptr(leaf, path->slots[0],
2010                                      struct btrfs_extent_item);
2011                if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add)
2012                        type = BTRFS_QGROUP_OPER_ADD_SHARED;
2013                btrfs_release_path(path);
2014
2015                ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
2016                                              bytenr, num_bytes, type, 0);
2017                goto out;
2018        }
2019
2020        /*
2021         * Ok we had -EAGAIN which means we didn't have space to insert an
2022         * inline extent ref, so just update the reference count and add a
2023         * normal backref.
2024         */
2025        leaf = path->nodes[0];
2026        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2027        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2028        refs = btrfs_extent_refs(leaf, item);
2029        if (refs)
2030                type = BTRFS_QGROUP_OPER_ADD_SHARED;
2031        btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2032        if (extent_op)
2033                __run_delayed_extent_op(extent_op, leaf, item);
2034
2035        btrfs_mark_buffer_dirty(leaf);
2036        btrfs_release_path(path);
2037
2038        if (!no_quota) {
2039                ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
2040                                              bytenr, num_bytes, type, 0);
2041                if (ret)
2042                        goto out;
2043        }
2044
2045        path->reada = 1;
2046        path->leave_spinning = 1;
2047        /* now insert the actual backref */
2048        ret = insert_extent_backref(trans, root->fs_info->extent_root,
2049                                    path, bytenr, parent, root_objectid,
2050                                    owner, offset, refs_to_add);
2051        if (ret)
2052                btrfs_abort_transaction(trans, root, ret);
2053out:
2054        btrfs_free_path(path);
2055        return ret;
2056}
2057
2058static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2059                                struct btrfs_root *root,
2060                                struct btrfs_delayed_ref_node *node,
2061                                struct btrfs_delayed_extent_op *extent_op,
2062                                int insert_reserved)
2063{
2064        int ret = 0;
2065        struct btrfs_delayed_data_ref *ref;
2066        struct btrfs_key ins;
2067        u64 parent = 0;
2068        u64 ref_root = 0;
2069        u64 flags = 0;
2070
2071        ins.objectid = node->bytenr;
2072        ins.offset = node->num_bytes;
2073        ins.type = BTRFS_EXTENT_ITEM_KEY;
2074
2075        ref = btrfs_delayed_node_to_data_ref(node);
2076        trace_run_delayed_data_ref(node, ref, node->action);
2077
2078        if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2079                parent = ref->parent;
2080        ref_root = ref->root;
2081
2082        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2083                if (extent_op)
2084                        flags |= extent_op->flags_to_set;
2085                ret = alloc_reserved_file_extent(trans, root,
2086                                                 parent, ref_root, flags,
2087                                                 ref->objectid, ref->offset,
2088                                                 &ins, node->ref_mod);
2089        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2090                ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2091                                             node->num_bytes, parent,
2092                                             ref_root, ref->objectid,
2093                                             ref->offset, node->ref_mod,
2094                                             node->no_quota, extent_op);
2095        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2096                ret = __btrfs_free_extent(trans, root, node->bytenr,
2097                                          node->num_bytes, parent,
2098                                          ref_root, ref->objectid,
2099                                          ref->offset, node->ref_mod,
2100                                          extent_op, node->no_quota);
2101        } else {
2102                BUG();
2103        }
2104        return ret;
2105}
2106
2107static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2108                                    struct extent_buffer *leaf,
2109                                    struct btrfs_extent_item *ei)
2110{
2111        u64 flags = btrfs_extent_flags(leaf, ei);
2112        if (extent_op->update_flags) {
2113                flags |= extent_op->flags_to_set;
2114                btrfs_set_extent_flags(leaf, ei, flags);
2115        }
2116
2117        if (extent_op->update_key) {
2118                struct btrfs_tree_block_info *bi;
2119                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2120                bi = (struct btrfs_tree_block_info *)(ei + 1);
2121                btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2122        }
2123}
2124
2125static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2126                                 struct btrfs_root *root,
2127                                 struct btrfs_delayed_ref_node *node,
2128                                 struct btrfs_delayed_extent_op *extent_op)
2129{
2130        struct btrfs_key key;
2131        struct btrfs_path *path;
2132        struct btrfs_extent_item *ei;
2133        struct extent_buffer *leaf;
2134        u32 item_size;
2135        int ret;
2136        int err = 0;
2137        int metadata = !extent_op->is_data;
2138
2139        if (trans->aborted)
2140                return 0;
2141
2142        if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2143                metadata = 0;
2144
2145        path = btrfs_alloc_path();
2146        if (!path)
2147                return -ENOMEM;
2148
2149        key.objectid = node->bytenr;
2150
2151        if (metadata) {
2152                key.type = BTRFS_METADATA_ITEM_KEY;
2153                key.offset = extent_op->level;
2154        } else {
2155                key.type = BTRFS_EXTENT_ITEM_KEY;
2156                key.offset = node->num_bytes;
2157        }
2158
2159again:
2160        path->reada = 1;
2161        path->leave_spinning = 1;
2162        ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2163                                path, 0, 1);
2164        if (ret < 0) {
2165                err = ret;
2166                goto out;
2167        }
2168        if (ret > 0) {
2169                if (metadata) {
2170                        if (path->slots[0] > 0) {
2171                                path->slots[0]--;
2172                                btrfs_item_key_to_cpu(path->nodes[0], &key,
2173                                                      path->slots[0]);
2174                                if (key.objectid == node->bytenr &&
2175                                    key.type == BTRFS_EXTENT_ITEM_KEY &&
2176                                    key.offset == node->num_bytes)
2177                                        ret = 0;
2178                        }
2179                        if (ret > 0) {
2180                                btrfs_release_path(path);
2181                                metadata = 0;
2182
2183                                key.objectid = node->bytenr;
2184                                key.offset = node->num_bytes;
2185                                key.type = BTRFS_EXTENT_ITEM_KEY;
2186                                goto again;
2187                        }
2188                } else {
2189                        err = -EIO;
2190                        goto out;
2191                }
2192        }
2193
2194        leaf = path->nodes[0];
2195        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2196#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2197        if (item_size < sizeof(*ei)) {
2198                ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2199                                             path, (u64)-1, 0);
2200                if (ret < 0) {
2201                        err = ret;
2202                        goto out;
2203                }
2204                leaf = path->nodes[0];
2205                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2206        }
2207#endif
2208        BUG_ON(item_size < sizeof(*ei));
2209        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2210        __run_delayed_extent_op(extent_op, leaf, ei);
2211
2212        btrfs_mark_buffer_dirty(leaf);
2213out:
2214        btrfs_free_path(path);
2215        return err;
2216}
2217
2218static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2219                                struct btrfs_root *root,
2220                                struct btrfs_delayed_ref_node *node,
2221                                struct btrfs_delayed_extent_op *extent_op,
2222                                int insert_reserved)
2223{
2224        int ret = 0;
2225        struct btrfs_delayed_tree_ref *ref;
2226        struct btrfs_key ins;
2227        u64 parent = 0;
2228        u64 ref_root = 0;
2229        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
2230                                                 SKINNY_METADATA);
2231
2232        ref = btrfs_delayed_node_to_tree_ref(node);
2233        trace_run_delayed_tree_ref(node, ref, node->action);
2234
2235        if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2236                parent = ref->parent;
2237        ref_root = ref->root;
2238
2239        ins.objectid = node->bytenr;
2240        if (skinny_metadata) {
2241                ins.offset = ref->level;
2242                ins.type = BTRFS_METADATA_ITEM_KEY;
2243        } else {
2244                ins.offset = node->num_bytes;
2245                ins.type = BTRFS_EXTENT_ITEM_KEY;
2246        }
2247
2248        BUG_ON(node->ref_mod != 1);
2249        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2250                BUG_ON(!extent_op || !extent_op->update_flags);
2251                ret = alloc_reserved_tree_block(trans, root,
2252                                                parent, ref_root,
2253                                                extent_op->flags_to_set,
2254                                                &extent_op->key,
2255                                                ref->level, &ins,
2256                                                node->no_quota);
2257        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2258                ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2259                                             node->num_bytes, parent, ref_root,
2260                                             ref->level, 0, 1, node->no_quota,
2261                                             extent_op);
2262        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2263                ret = __btrfs_free_extent(trans, root, node->bytenr,
2264                                          node->num_bytes, parent, ref_root,
2265                                          ref->level, 0, 1, extent_op,
2266                                          node->no_quota);
2267        } else {
2268                BUG();
2269        }
2270        return ret;
2271}
2272
2273/* helper function to actually process a single delayed ref entry */
2274static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2275                               struct btrfs_root *root,
2276                               struct btrfs_delayed_ref_node *node,
2277                               struct btrfs_delayed_extent_op *extent_op,
2278                               int insert_reserved)
2279{
2280        int ret = 0;
2281
2282        if (trans->aborted) {
2283                if (insert_reserved)
2284                        btrfs_pin_extent(root, node->bytenr,
2285                                         node->num_bytes, 1);
2286                return 0;
2287        }
2288
2289        if (btrfs_delayed_ref_is_head(node)) {
2290                struct btrfs_delayed_ref_head *head;
2291                /*
2292                 * we've hit the end of the chain and we were supposed
2293                 * to insert this extent into the tree.  But it got
2294                 * deleted before we ever needed to insert it, so all
2295                 * we have to do is clean up the accounting
2296                 */
2297                BUG_ON(extent_op);
2298                head = btrfs_delayed_node_to_head(node);
2299                trace_run_delayed_ref_head(node, head, node->action);
2300
2301                if (insert_reserved) {
2302                        btrfs_pin_extent(root, node->bytenr,
2303                                         node->num_bytes, 1);
2304                        if (head->is_data) {
2305                                ret = btrfs_del_csums(trans, root,
2306                                                      node->bytenr,
2307                                                      node->num_bytes);
2308                        }
2309                }
2310                return ret;
2311        }
2312
2313        if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2314            node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2315                ret = run_delayed_tree_ref(trans, root, node, extent_op,
2316                                           insert_reserved);
2317        else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2318                 node->type == BTRFS_SHARED_DATA_REF_KEY)
2319                ret = run_delayed_data_ref(trans, root, node, extent_op,
2320                                           insert_reserved);
2321        else
2322                BUG();
2323        return ret;
2324}
2325
2326static noinline struct btrfs_delayed_ref_node *
2327select_delayed_ref(struct btrfs_delayed_ref_head *head)
2328{
2329        struct rb_node *node;
2330        struct btrfs_delayed_ref_node *ref, *last = NULL;
2331
2332        /*
2333         * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
2334         * this prevents ref count from going down to zero when
2335         * there still are pending delayed ref.
2336         */
2337        node = rb_first(&head->ref_root);
2338        while (node) {
2339                ref = rb_entry(node, struct btrfs_delayed_ref_node,
2340                                rb_node);
2341                if (ref->action == BTRFS_ADD_DELAYED_REF)
2342                        return ref;
2343                else if (last == NULL)
2344                        last = ref;
2345                node = rb_next(node);
2346        }
2347        return last;
2348}
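/*
 * Concretely: if a head carries both a pending drop and a pending add
 * for the same extent, applying the drop first could push the ref
 * count to zero and free an extent that is still in use; taking the
 * add first keeps the count positive until all refs are applied.
 */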
2349
2350/*
2351 * Returns 0 on success or if called with an already aborted transaction.
2352 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2353 */
2354static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2355                                             struct btrfs_root *root,
2356                                             unsigned long nr)
2357{
2358        struct btrfs_delayed_ref_root *delayed_refs;
2359        struct btrfs_delayed_ref_node *ref;
2360        struct btrfs_delayed_ref_head *locked_ref = NULL;
2361        struct btrfs_delayed_extent_op *extent_op;
2362        struct btrfs_fs_info *fs_info = root->fs_info;
2363        ktime_t start = ktime_get();
2364        int ret;
2365        unsigned long count = 0;
2366        unsigned long actual_count = 0;
2367        int must_insert_reserved = 0;
2368
2369        delayed_refs = &trans->transaction->delayed_refs;
2370        while (1) {
2371                if (!locked_ref) {
2372                        if (count >= nr)
2373                                break;
2374
2375                        spin_lock(&delayed_refs->lock);
2376                        locked_ref = btrfs_select_ref_head(trans);
2377                        if (!locked_ref) {
2378                                spin_unlock(&delayed_refs->lock);
2379                                break;
2380                        }
2381
2382                        /* grab the lock that says we are going to process
2383                         * all the refs for this head */
2384                        ret = btrfs_delayed_ref_lock(trans, locked_ref);
2385                        spin_unlock(&delayed_refs->lock);
2386                        /*
2387                         * we may have dropped the spin lock to get the head
2388                         * mutex lock, and that might have given someone else
2389                         * time to free the head.  If that's true, it has been
2390                         * removed from our list and we can move on.
2391                         */
2392                        if (ret == -EAGAIN) {
2393                                locked_ref = NULL;
2394                                count++;
2395                                continue;
2396                        }
2397                }
2398
2399                /*
2400                 * We need to try and merge add/drops of the same ref since we
2401                 * can run into issues with relocate dropping the implicit ref
2402                 * and then it being added back again before the drop can
2403                 * finish.  If we merged anything we need to re-loop so we can
2404                 * get a good ref.
2405                 */
2406                spin_lock(&locked_ref->lock);
2407                btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2408                                         locked_ref);
2409
2410                /*
2411                 * locked_ref is the head node, so we have to go one
2412                 * node back for any delayed ref updates
2413                 */
2414                ref = select_delayed_ref(locked_ref);
2415
2416                if (ref && ref->seq &&
2417                    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2418                        spin_unlock(&locked_ref->lock);
2419                        btrfs_delayed_ref_unlock(locked_ref);
2420                        spin_lock(&delayed_refs->lock);
2421                        locked_ref->processing = 0;
2422                        delayed_refs->num_heads_ready++;
2423                        spin_unlock(&delayed_refs->lock);
2424                        locked_ref = NULL;
2425                        cond_resched();
2426                        count++;
2427                        continue;
2428                }
2429
2430                /*
2431                 * record the must insert reserved flag before we
2432                 * drop the spin lock.
2433                 */
2434                must_insert_reserved = locked_ref->must_insert_reserved;
2435                locked_ref->must_insert_reserved = 0;
2436
2437                extent_op = locked_ref->extent_op;
2438                locked_ref->extent_op = NULL;
2439
2440                if (!ref) {
2441
2442                        /*
2443                         * All delayed refs have been processed; go ahead
2444                         * and send the head node to run_one_delayed_ref,
2445                         * so that any accounting fixes can happen.
2446                         */
2447                        ref = &locked_ref->node;
2448
2449                        if (extent_op && must_insert_reserved) {
2450                                btrfs_free_delayed_extent_op(extent_op);
2451                                extent_op = NULL;
2452                        }
2453
2454                        if (extent_op) {
2455                                spin_unlock(&locked_ref->lock);
2456                                ret = run_delayed_extent_op(trans, root,
2457                                                            ref, extent_op);
2458                                btrfs_free_delayed_extent_op(extent_op);
2459
2460                                if (ret) {
2461                                        /*
2462                                         * Need to reset must_insert_reserved if
2463                                         * there was an error so the abort stuff
2464                                         * can cleanup the reserved space
2465                                         * properly.
2466                                         */
2467                                        if (must_insert_reserved)
2468                                                locked_ref->must_insert_reserved = 1;
2469                                        locked_ref->processing = 0;
2470                                        btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2471                                        btrfs_delayed_ref_unlock(locked_ref);
2472                                        return ret;
2473                                }
2474                                continue;
2475                        }
2476
2477                        /*
2478                         * Need to drop our head ref lock and re-acquire the
2479                         * delayed ref lock and then re-check to make sure
2480                         * nobody got added.
2481                         */
2482                        spin_unlock(&locked_ref->lock);
2483                        spin_lock(&delayed_refs->lock);
2484                        spin_lock(&locked_ref->lock);
2485                        if (rb_first(&locked_ref->ref_root) ||
2486                            locked_ref->extent_op) {
2487                                spin_unlock(&locked_ref->lock);
2488                                spin_unlock(&delayed_refs->lock);
2489                                continue;
2490                        }
2491                        ref->in_tree = 0;
2492                        delayed_refs->num_heads--;
2493                        rb_erase(&locked_ref->href_node,
2494                                 &delayed_refs->href_root);
2495                        spin_unlock(&delayed_refs->lock);
2496                } else {
2497                        actual_count++;
2498                        ref->in_tree = 0;
2499                        rb_erase(&ref->rb_node, &locked_ref->ref_root);
2500                }
2501                atomic_dec(&delayed_refs->num_entries);
2502
2503                if (!btrfs_delayed_ref_is_head(ref)) {
2504                        /*
2505                         * when we play the delayed ref, also correct the
2506                         * ref_mod on head
2507                         */
2508                        switch (ref->action) {
2509                        case BTRFS_ADD_DELAYED_REF:
2510                        case BTRFS_ADD_DELAYED_EXTENT:
2511                                locked_ref->node.ref_mod -= ref->ref_mod;
2512                                break;
2513                        case BTRFS_DROP_DELAYED_REF:
2514                                locked_ref->node.ref_mod += ref->ref_mod;
2515                                break;
2516                        default:
2517                                WARN_ON(1);
2518                        }
2519                }
2520                spin_unlock(&locked_ref->lock);
2521
2522                ret = run_one_delayed_ref(trans, root, ref, extent_op,
2523                                          must_insert_reserved);
2524
2525                btrfs_free_delayed_extent_op(extent_op);
2526                if (ret) {
2527                        locked_ref->processing = 0;
2528                        btrfs_delayed_ref_unlock(locked_ref);
2529                        btrfs_put_delayed_ref(ref);
2530                        btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
2531                        return ret;
2532                }
2533
2534                /*
2535                 * If this node is a head, that means all the refs in this head
2536                 * have been dealt with, and we will pick the next head to deal
2537                 * with, so we must unlock the head and drop it from the cluster
2538                 * list before we release it.
2539                 */
2540                if (btrfs_delayed_ref_is_head(ref)) {
2541                        btrfs_delayed_ref_unlock(locked_ref);
2542                        locked_ref = NULL;
2543                }
2544                btrfs_put_delayed_ref(ref);
2545                count++;
2546                cond_resched();
2547        }
2548
2549        /*
2550         * We don't want to include ref heads since we can have empty ref heads
2551                 * and those would drastically skew our runtime down, as we just do
2552         * accounting, no actual extent tree updates.
2553         */
2554        if (actual_count > 0) {
2555                u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2556                u64 avg;
2557
2558                /*
2559                 * We weigh the existing average 3x higher than this run's runtime
2560                 * to avoid large swings in the average.
2561                 */
2562                spin_lock(&delayed_refs->lock);
2563                avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2564                avg = div64_u64(avg, 4);
2565                fs_info->avg_delayed_ref_runtime = avg;
2566                spin_unlock(&delayed_refs->lock);
2567        }
2568        return 0;
2569}
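/*
 * The averaging above is an exponential moving average with a 3/4
 * weight on history: new_avg = (3 * old_avg + runtime) / 4.
 */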
2570
2571#ifdef SCRAMBLE_DELAYED_REFS
2572/*
2573 * Normally delayed refs get processed in ascending bytenr order. This
2574 * correlates in most cases to the order added. To expose dependencies on this
2575 * order, we start to process the tree in the middle instead of the beginning
2576 */
2577static u64 find_middle(struct rb_root *root)
2578{
2579        struct rb_node *n = root->rb_node;
2580        struct btrfs_delayed_ref_node *entry;
2581        int alt = 1;
2582        u64 middle;
2583        u64 first = 0, last = 0;
2584
2585        n = rb_first(root);
2586        if (n) {
2587                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2588                first = entry->bytenr;
2589        }
2590        n = rb_last(root);
2591        if (n) {
2592                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2593                last = entry->bytenr;
2594        }
2595        n = root->rb_node;
2596
2597        while (n) {
2598                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2599                WARN_ON(!entry->in_tree);
2600
2601                middle = entry->bytenr;
2602
2603                if (alt)
2604                        n = n->rb_left;
2605                else
2606                        n = n->rb_right;
2607
2608                alt = 1 - alt;
2609        }
2610        return middle;
2611}
2612#endif
2613
2614static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2615{
2616        u64 num_bytes;
2617
2618        num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2619                             sizeof(struct btrfs_extent_inline_ref));
2620        if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2621                num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2622
2623        /*
2624         * We don't ever fill up leaves all the way, so the caller doubles this
2625         * estimate to be closer to what we're really going to use.
2626         */
2627        return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2628}
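/*
 * Rough numbers: with skinny metadata a head costs about 33 bytes
 * (a 24-byte extent item plus a 9-byte inline ref), so a 16k leaf
 * (~16k of usable data area) holds on the order of 490 heads before
 * the caller's 2x fudge factor.
 */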
2629
2630int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2631                                       struct btrfs_root *root)
2632{
2633        struct btrfs_block_rsv *global_rsv;
2634        u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2635        u64 num_bytes;
2636        int ret = 0;
2637
2638        num_bytes = btrfs_calc_trans_metadata_size(root, 1);
2639        num_heads = heads_to_leaves(root, num_heads);
2640        if (num_heads > 1)
2641                num_bytes += (num_heads - 1) * root->nodesize;
2642        num_bytes <<= 1;
2643        global_rsv = &root->fs_info->global_block_rsv;
2644
2645        /*
2646         * If we can't allocate any more chunks, let's make sure we have _lots_ of
2647         * wiggle room since running delayed refs can create more delayed refs.
2648         */
2649        if (global_rsv->space_info->full)
2650                num_bytes <<= 1;
2651
2652        spin_lock(&global_rsv->lock);
2653        if (global_rsv->reserved <= num_bytes)
2654                ret = 1;
2655        spin_unlock(&global_rsv->lock);
2656        return ret;
2657}
2658
2659int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2660                                       struct btrfs_root *root)
2661{
2662        struct btrfs_fs_info *fs_info = root->fs_info;
2663        u64 num_entries =
2664                atomic_read(&trans->transaction->delayed_refs.num_entries);
2665        u64 avg_runtime;
2666        u64 val;
2667
2668        smp_mb();
2669        avg_runtime = fs_info->avg_delayed_ref_runtime;
2670        val = num_entries * avg_runtime;
2671        if (val >= NSEC_PER_SEC)
2672                return 1;
2673        if (val >= NSEC_PER_SEC / 2)
2674                return 2;
2675
2676        return btrfs_check_space_for_delayed_refs(trans, root);
2677}
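
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * a hypothetical caller acting on the return values above -- 1 means the
 * backlog represents roughly a second or more of estimated work, 2 means
 * half a second or more, and a nonzero result from the space check also
 * asks for throttling.  The helper name, the synchronous batch, and the use
 * of trans->delayed_ref_updates as the count are assumptions for
 * illustration only.
 */
static inline void example_throttle_refs(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root)
{
	if (btrfs_should_throttle_delayed_refs(trans, root))
		btrfs_run_delayed_refs(trans, root, trans->delayed_ref_updates);
}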
2678
2679struct async_delayed_refs {
2680        struct btrfs_root *root;
2681        int count;
2682        int error;
2683        int sync;
2684        struct completion wait;
2685        struct btrfs_work work;
2686};
2687
2688static void delayed_ref_async_start(struct btrfs_work *work)
2689{
2690        struct async_delayed_refs *async;
2691        struct btrfs_trans_handle *trans;
2692        int ret;
2693
2694        async = container_of(work, struct async_delayed_refs, work);
2695
2696        trans = btrfs_join_transaction(async->root);
2697        if (IS_ERR(trans)) {
2698                async->error = PTR_ERR(trans);
2699                goto done;
2700        }
2701
2702        /*
2703         * trans->sync means that when we call end_transaction, we won't
2704         * wait on delayed refs
2705         */
2706        trans->sync = true;
2707        ret = btrfs_run_delayed_refs(trans, async->root, async->count);
2708        if (ret)
2709                async->error = ret;
2710
2711        ret = btrfs_end_transaction(trans, async->root);
2712        if (ret && !async->error)
2713                async->error = ret;
2714done:
2715        if (async->sync)
2716                complete(&async->wait);
2717        else
2718                kfree(async);
2719}
2720
2721int btrfs_async_run_delayed_refs(struct btrfs_root *root,
2722                                 unsigned long count, int wait)
2723{
2724        struct async_delayed_refs *async;
2725        int ret;
2726
2727        async = kmalloc(sizeof(*async), GFP_NOFS);
2728        if (!async)
2729                return -ENOMEM;
2730
2731        async->root = root->fs_info->tree_root;
2732        async->count = count;
2733        async->error = 0;
2734        if (wait)
2735                async->sync = 1;
2736        else
2737                async->sync = 0;
2738        init_completion(&async->wait);
2739
2740        btrfs_init_work(&async->work, btrfs_extent_refs_helper,
2741                        delayed_ref_async_start, NULL, NULL);
2742
2743        btrfs_queue_work(root->fs_info->extent_workers, &async->work);
2744
2745        if (wait) {
2746                wait_for_completion(&async->wait);
2747                ret = async->error;
2748                kfree(async);
2749                return ret;
2750        }
2751        return 0;
2752}
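
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the two ways to drive btrfs_async_run_delayed_refs().  With wait == 0 the
 * worker frees the async struct itself and its error is dropped; with
 * wait == 1 the caller blocks on the completion and collects the error.
 * The helper name and the count of 64 are placeholders.
 */
static inline int example_async_refs(struct btrfs_root *root)
{
	int ret;

	ret = btrfs_async_run_delayed_refs(root, 64, 0);	/* fire and forget */
	if (ret)
		return ret;
	return btrfs_async_run_delayed_refs(root, 64, 1);	/* wait for result */
}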
2753
2754/*
2755 * this starts processing the delayed reference count updates and
2756 * extent insertions we have queued up so far.  count can be
2757 * 0, which means to process everything in the tree at the start
2758 * of the run (but not newly added entries), or it can be some target
2759 * number you'd like to process.
2760 *
2761 * Returns 0 on success or if called with an aborted transaction
2762 * Returns <0 on error and aborts the transaction
2763 */
2764int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2765                           struct btrfs_root *root, unsigned long count)
2766{
2767        struct rb_node *node;
2768        struct btrfs_delayed_ref_root *delayed_refs;
2769        struct btrfs_delayed_ref_head *head;
2770        int ret;
2771        int run_all = count == (unsigned long)-1;
2772
2773        /* We'll clean this up in btrfs_cleanup_transaction */
2774        if (trans->aborted)
2775                return 0;
2776
2777        if (root == root->fs_info->extent_root)
2778                root = root->fs_info->tree_root;
2779
2780        delayed_refs = &trans->transaction->delayed_refs;
2781        if (count == 0)
2782                count = atomic_read(&delayed_refs->num_entries) * 2;
2783
2784again:
2785#ifdef SCRAMBLE_DELAYED_REFS
2786        delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2787#endif
2788        ret = __btrfs_run_delayed_refs(trans, root, count);
2789        if (ret < 0) {
2790                btrfs_abort_transaction(trans, root, ret);
2791                return ret;
2792        }
2793
2794        if (run_all) {
2795                if (!list_empty(&trans->new_bgs))
2796                        btrfs_create_pending_block_groups(trans, root);
2797
2798                spin_lock(&delayed_refs->lock);
2799                node = rb_first(&delayed_refs->href_root);
2800                if (!node) {
2801                        spin_unlock(&delayed_refs->lock);
2802                        goto out;
2803                }
2804                count = (unsigned long)-1;
2805
2806                while (node) {
2807                        head = rb_entry(node, struct btrfs_delayed_ref_head,
2808                                        href_node);
2809                        if (btrfs_delayed_ref_is_head(&head->node)) {
2810                                struct btrfs_delayed_ref_node *ref;
2811
2812                                ref = &head->node;
2813                                atomic_inc(&ref->refs);
2814
2815                                spin_unlock(&delayed_refs->lock);
2816                                /*
2817                                 * Mutex was contended, block until it's
2818                                 * released and try again
2819                                 */
2820                                mutex_lock(&head->mutex);
2821                                mutex_unlock(&head->mutex);
2822
2823                                btrfs_put_delayed_ref(ref);
2824                                cond_resched();
2825                                goto again;
2826                        } else {
2827                                WARN_ON(1);
2828                        }
2829                        node = rb_next(node);
2830                }
2831                spin_unlock(&delayed_refs->lock);
2832                cond_resched();
2833                goto again;
2834        }
2835out:
2836        ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info);
2837        if (ret)
2838                return ret;
2839        assert_qgroups_uptodate(trans);
2840        return 0;
2841}
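
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the count conventions for btrfs_run_delayed_refs() described above.  A
 * bounded count processes roughly that many entries; (unsigned long)-1
 * keeps looping until the ref tree is drained.  The helper name and the
 * batch size are placeholders.
 */
static inline int example_run_refs(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	int ret;

	ret = btrfs_run_delayed_refs(trans, root, 128);	/* bounded batch */
	if (ret)
		return ret;
	return btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
}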
2842
2843int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2844                                struct btrfs_root *root,
2845                                u64 bytenr, u64 num_bytes, u64 flags,
2846                                int level, int is_data)
2847{
2848        struct btrfs_delayed_extent_op *extent_op;
2849        int ret;
2850
2851        extent_op = btrfs_alloc_delayed_extent_op();
2852        if (!extent_op)
2853                return -ENOMEM;
2854
2855        extent_op->flags_to_set = flags;
2856        extent_op->update_flags = 1;
2857        extent_op->update_key = 0;
2858        extent_op->is_data = is_data ? 1 : 0;
2859        extent_op->level = level;
2860
2861        ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2862                                          num_bytes, extent_op);
2863        if (ret)
2864                btrfs_free_delayed_extent_op(extent_op);
2865        return ret;
2866}
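
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * queueing a flag update on a tree block through the delayed ref machinery,
 * e.g. marking it FULL_BACKREF.  The helper name is a placeholder and the
 * buffer is assumed to be a live tree block owned by this root.
 */
static inline int example_set_full_backref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct extent_buffer *buf)
{
	return btrfs_set_disk_extent_flags(trans, root, buf->start, buf->len,
					   BTRFS_BLOCK_FLAG_FULL_BACKREF,
					   btrfs_header_level(buf), 0);
}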
2867
2868static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2869                                      struct btrfs_root *root,
2870                                      struct btrfs_path *path,
2871                                      u64 objectid, u64 offset, u64 bytenr)
2872{
2873        struct btrfs_delayed_ref_head *head;
2874        struct btrfs_delayed_ref_node *ref;
2875        struct btrfs_delayed_data_ref *data_ref;
2876        struct btrfs_delayed_ref_root *delayed_refs;
2877        struct rb_node *node;
2878        int ret = 0;
2879
2880        delayed_refs = &trans->transaction->delayed_refs;
2881        spin_lock(&delayed_refs->lock);
2882        head = btrfs_find_delayed_ref_head(trans, bytenr);
2883        if (!head) {
2884                spin_unlock(&delayed_refs->lock);
2885                return 0;
2886        }
2887
2888        if (!mutex_trylock(&head->mutex)) {
2889                atomic_inc(&head->node.refs);
2890                spin_unlock(&delayed_refs->lock);
2891
2892                btrfs_release_path(path);
2893
2894                /*
2895                 * Mutex was contended, block until it's released and let
2896                 * caller try again
2897                 */
2898                mutex_lock(&head->mutex);
2899                mutex_unlock(&head->mutex);
2900                btrfs_put_delayed_ref(&head->node);
2901                return -EAGAIN;
2902        }
2903        spin_unlock(&delayed_refs->lock);
2904
2905        spin_lock(&head->lock);
2906        node = rb_first(&head->ref_root);
2907        while (node) {
2908                ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2909                node = rb_next(node);
2910
2911                /* If it's a shared ref we know a cross reference exists */
2912                if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2913                        ret = 1;
2914                        break;
2915                }
2916
2917                data_ref = btrfs_delayed_node_to_data_ref(ref);
2918
2919                /*
2920                 * If our ref doesn't match the one we're currently looking at
2921                 * then we have a cross reference.
2922                 */
2923                if (data_ref->root != root->root_key.objectid ||
2924                    data_ref->objectid != objectid ||
2925                    data_ref->offset != offset) {
2926                        ret = 1;
2927                        break;
2928                }
2929        }
2930        spin_unlock(&head->lock);
2931        mutex_unlock(&head->mutex);
2932        return ret;
2933}
2934
2935static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2936                                        struct btrfs_root *root,
2937                                        struct btrfs_path *path,
2938                                        u64 objectid, u64 offset, u64 bytenr)
2939{
2940        struct btrfs_root *extent_root = root->fs_info->extent_root;
2941        struct extent_buffer *leaf;
2942        struct btrfs_extent_data_ref *ref;
2943        struct btrfs_extent_inline_ref *iref;
2944        struct btrfs_extent_item *ei;
2945        struct btrfs_key key;
2946        u32 item_size;
2947        int ret;
2948
2949        key.objectid = bytenr;
2950        key.offset = (u64)-1;
2951        key.type = BTRFS_EXTENT_ITEM_KEY;
2952
2953        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2954        if (ret < 0)
2955                goto out;
2956        BUG_ON(ret == 0); /* Corruption */
2957
2958        ret = -ENOENT;
2959        if (path->slots[0] == 0)
2960                goto out;
2961
2962        path->slots[0]--;
2963        leaf = path->nodes[0];
2964        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2965
2966        if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2967                goto out;
2968
2969        ret = 1;
2970        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2971#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2972        if (item_size < sizeof(*ei)) {
2973                WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2974                goto out;
2975        }
2976#endif
2977        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2978
2979        if (item_size != sizeof(*ei) +
2980            btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2981                goto out;
2982
2983        if (btrfs_extent_generation(leaf, ei) <=
2984            btrfs_root_last_snapshot(&root->root_item))
2985                goto out;
2986
2987        iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2988        if (btrfs_extent_inline_ref_type(leaf, iref) !=
2989            BTRFS_EXTENT_DATA_REF_KEY)
2990                goto out;
2991
2992        ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2993        if (btrfs_extent_refs(leaf, ei) !=
2994            btrfs_extent_data_ref_count(leaf, ref) ||
2995            btrfs_extent_data_ref_root(leaf, ref) !=
2996            root->root_key.objectid ||
2997            btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2998            btrfs_extent_data_ref_offset(leaf, ref) != offset)
2999                goto out;
3000
3001        ret = 0;
3002out:
3003        return ret;
3004}
3005
3006int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
3007                          struct btrfs_root *root,
3008                          u64 objectid, u64 offset, u64 bytenr)
3009{
3010        struct btrfs_path *path;
3011        int ret;
3012        int ret2;
3013
3014        path = btrfs_alloc_path();
3015        if (!path)
3016                return -ENOMEM;
3017
3018        do {
3019                ret = check_committed_ref(trans, root, path, objectid,
3020                                          offset, bytenr);
3021                if (ret && ret != -ENOENT)
3022                        goto out;
3023
3024                ret2 = check_delayed_ref(trans, root, path, objectid,
3025                                         offset, bytenr);
3026        } while (ret2 == -EAGAIN);
3027
3028        if (ret2 && ret2 != -ENOENT) {
3029                ret = ret2;
3030                goto out;
3031        }
3032
3033        if (ret != -ENOENT || ret2 != -ENOENT)
3034                ret = 0;
3035out:
3036        btrfs_free_path(path);
3037        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3038                WARN_ON(ret > 0);
3039        return ret;
3040}
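
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the typical nocow-style question this answers -- "may I overwrite this
 * data extent in place?".  Zero means our root/inode/offset holds the only
 * reference; anything else (a cross reference, or an error) means we must
 * COW.  The helper name and arguments are placeholders.
 */
static inline bool example_may_overwrite(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root, u64 ino,
					 u64 file_offset, u64 disk_bytenr)
{
	return btrfs_cross_ref_exist(trans, root, ino, file_offset,
				     disk_bytenr) == 0;
}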
3041
3042static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3043                           struct btrfs_root *root,
3044                           struct extent_buffer *buf,
3045                           int full_backref, int inc)
3046{
3047        u64 bytenr;
3048        u64 num_bytes;
3049        u64 parent;
3050        u64 ref_root;
3051        u32 nritems;
3052        struct btrfs_key key;
3053        struct btrfs_file_extent_item *fi;
3054        int i;
3055        int level;
3056        int ret = 0;
3057        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
3058                            u64, u64, u64, u64, u64, u64, int);
3059
3061        if (btrfs_test_is_dummy_root(root))
3062                return 0;
3063
3064        ref_root = btrfs_header_owner(buf);
3065        nritems = btrfs_header_nritems(buf);
3066        level = btrfs_header_level(buf);
3067
3068        if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3069                return 0;
3070
3071        if (inc)
3072                process_func = btrfs_inc_extent_ref;
3073        else
3074                process_func = btrfs_free_extent;
3075
3076        if (full_backref)
3077                parent = buf->start;
3078        else
3079                parent = 0;
3080
3081        for (i = 0; i < nritems; i++) {
3082                if (level == 0) {
3083                        btrfs_item_key_to_cpu(buf, &key, i);
3084                        if (key.type != BTRFS_EXTENT_DATA_KEY)
3085                                continue;
3086                        fi = btrfs_item_ptr(buf, i,
3087                                            struct btrfs_file_extent_item);
3088                        if (btrfs_file_extent_type(buf, fi) ==
3089                            BTRFS_FILE_EXTENT_INLINE)
3090                                continue;
3091                        bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3092                        if (bytenr == 0)
3093                                continue;
3094
3095                        num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3096                        key.offset -= btrfs_file_extent_offset(buf, fi);
3097                        ret = process_func(trans, root, bytenr, num_bytes,
3098                                           parent, ref_root, key.objectid,
3099                                           key.offset, 1);
3100                        if (ret)
3101                                goto fail;
3102                } else {
3103                        bytenr = btrfs_node_blockptr(buf, i);
3104                        num_bytes = root->nodesize;
3105                        ret = process_func(trans, root, bytenr, num_bytes,
3106                                           parent, ref_root, level - 1, 0,
3107                                           1);
3108                        if (ret)
3109                                goto fail;
3110                }
3111        }
3112        return 0;
3113fail:
3114        return ret;
3115}
3116
3117int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3118                  struct extent_buffer *buf, int full_backref)
3119{
3120        return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
3121}
3122
3123int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3124                  struct extent_buffer *buf, int full_backref)
3125{
3126        return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
3127}
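
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the classic COW pairing these two helpers exist for -- add references for
 * every extent the new copy of a buffer points to, then drop them from the
 * old copy.  Whether full backrefs are used depends on the owner and is
 * simplified to 0 here; the helper name is a placeholder.
 */
static inline int example_cow_refs(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *old_buf,
				   struct extent_buffer *new_buf)
{
	int ret;

	ret = btrfs_inc_ref(trans, root, new_buf, 0);
	if (ret)
		return ret;
	return btrfs_dec_ref(trans, root, old_buf, 0);
}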
3128
3129static int write_one_cache_group(struct btrfs_trans_handle *trans,
3130                                 struct btrfs_root *root,
3131                                 struct btrfs_path *path,
3132                                 struct btrfs_block_group_cache *cache)
3133{
3134        int ret;
3135        struct btrfs_root *extent_root = root->fs_info->extent_root;
3136        unsigned long bi;
3137        struct extent_buffer *leaf;
3138
3139        ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3140        if (ret) {
3141                if (ret > 0)
3142                        ret = -ENOENT;
3143                goto fail;
3144        }
3145
3146        leaf = path->nodes[0];
3147        bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3148        write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3149        btrfs_mark_buffer_dirty(leaf);
3150        btrfs_release_path(path);
3151fail:
3152        if (ret)
3153                btrfs_abort_transaction(trans, root, ret);
3154        return ret;
3155}
3157
3158static struct btrfs_block_group_cache *
3159next_block_group(struct btrfs_root *root,
3160                 struct btrfs_block_group_cache *cache)
3161{
3162        struct rb_node *node;
3163
3164        spin_lock(&root->fs_info->block_group_cache_lock);
3165
3166        /* If our block group was removed, we need a full search. */
3167        if (RB_EMPTY_NODE(&cache->cache_node)) {
3168                const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3169
3170                spin_unlock(&root->fs_info->block_group_cache_lock);
3171                btrfs_put_block_group(cache);
3172                cache = btrfs_lookup_first_block_group(root->fs_info,
3173                                                       next_bytenr);
3174                return cache;
3175        }
3176        node = rb_next(&cache->cache_node);
3177        btrfs_put_block_group(cache);
3178        if (node) {
3179                cache = rb_entry(node, struct btrfs_block_group_cache,
3180                                 cache_node);
3181                btrfs_get_block_group(cache);
3182        } else
3183                cache = NULL;
3184        spin_unlock(&root->fs_info->block_group_cache_lock);
3185        return cache;
3186}
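
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * walking every block group with next_block_group().  The walk starts from
 * bytenr 0 and each step drops the previous group's reference, so no manual
 * btrfs_put_block_group() is needed inside the loop.  The helper name is a
 * placeholder.
 */
static inline void example_walk_block_groups(struct btrfs_root *root)
{
	struct btrfs_block_group_cache *cache;

	for (cache = btrfs_lookup_first_block_group(root->fs_info, 0);
	     cache; cache = next_block_group(root, cache)) {
		/* inspect cache->key.objectid / cache->key.offset here */
	}
}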
3187
3188static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3189                            struct btrfs_trans_handle *trans,
3190                            struct btrfs_path *path)
3191{
3192        struct btrfs_root *root = block_group->fs_info->tree_root;
3193        struct inode *inode = NULL;
3194        u64 alloc_hint = 0;
3195        int dcs = BTRFS_DC_ERROR;
3196        int num_pages = 0;
3197        int retries = 0;
3198        int ret = 0;
3199
3200        /*
3201         * If this block group is smaller than 100 megs don't bother caching the
3202         * block group.
3203         */
3204        if (block_group->key.offset < (100 * 1024 * 1024)) {
3205                spin_lock(&block_group->lock);
3206                block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3207                spin_unlock(&block_group->lock);
3208                return 0;
3209        }
3210
3211        if (trans->aborted)
3212                return 0;
3213again:
3214        inode = lookup_free_space_inode(root, block_group, path);
3215        if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3216                ret = PTR_ERR(inode);
3217                btrfs_release_path(path);
3218                goto out;
3219        }
3220
3221        if (IS_ERR(inode)) {
3222                BUG_ON(retries);
3223                retries++;
3224
3225                if (block_group->ro)
3226                        goto out_free;
3227
3228                ret = create_free_space_inode(root, trans, block_group, path);
3229                if (ret)
3230                        goto out_free;
3231                goto again;
3232        }
3233
3234        /* We've already set up this transaction, go ahead and exit */
3235        if (block_group->cache_generation == trans->transid &&
3236            i_size_read(inode)) {
3237                dcs = BTRFS_DC_SETUP;
3238                goto out_put;
3239        }
3240
3241        /*
3242         * We want to set the generation to 0 so that if anything goes wrong
3243         * from here on out we know not to trust this cache when we load up next
3244         * time.
3245         */
3246        BTRFS_I(inode)->generation = 0;
3247        ret = btrfs_update_inode(trans, root, inode);
3248        if (ret) {
3249                /*
3250                 * So theoretically we could recover from this, simply set the
3251                 * super cache generation to 0 so we know to invalidate the
3252                 * cache, but then we'd have to keep track of the block groups
3253                 * that fail this way so we know we _have_ to reset this cache
3254                 * before the next commit or risk reading stale cache.  So to
3255                 * limit our exposure to horrible edge cases lets just abort the
3256                 * transaction, this only happens in really bad situations
3257                 * anyway.
3258                 */
3259                btrfs_abort_transaction(trans, root, ret);
3260                goto out_put;
3261        }
3262        WARN_ON(ret);
3263
3264        if (i_size_read(inode) > 0) {
3265                ret = btrfs_check_trunc_cache_free_space(root,
3266                                        &root->fs_info->global_block_rsv);
3267                if (ret)
3268                        goto out_put;
3269
3270                ret = btrfs_truncate_free_space_cache(root, trans, inode);
3271                if (ret)
3272                        goto out_put;
3273        }
3274
3275        spin_lock(&block_group->lock);
3276        if (block_group->cached != BTRFS_CACHE_FINISHED ||
3277            !btrfs_test_opt(root, SPACE_CACHE) ||
3278            block_group->delalloc_bytes) {
3279                /*
3280                 * don't bother trying to write stuff out _if_
3281                 * a) we're not cached, b) we're mounted with the
3282                 * nospace_cache option, or c) delalloc is still pending.
3283                 */
3284                dcs = BTRFS_DC_WRITTEN;
3285                spin_unlock(&block_group->lock);
3286                goto out_put;
3287        }
3288        spin_unlock(&block_group->lock);
3289
3290        /*
3291         * Try to preallocate enough space based on how big the block group is.
3292         * Keep in mind this has to include any pinned space, which could end up
3293         * taking up quite a bit since it's not folded into the other space
3294         * cache.
3295         */
3296        num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
3297        if (!num_pages)
3298                num_pages = 1;
3299
3300        num_pages *= 16;
3301        num_pages *= PAGE_CACHE_SIZE;
3302
3303        ret = btrfs_check_data_free_space(inode, num_pages);
3304        if (ret)
3305                goto out_put;
3306
3307        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3308                                              num_pages, num_pages,
3309                                              &alloc_hint);
3310        if (!ret)
3311                dcs = BTRFS_DC_SETUP;
3312        btrfs_free_reserved_data_space(inode, num_pages);
3313
3314out_put:
3315        iput(inode);
3316out_free:
3317        btrfs_release_path(path);
3318out:
3319        spin_lock(&block_group->lock);
3320        if (!ret && dcs == BTRFS_DC_SETUP)
3321                block_group->cache_generation = trans->transid;
3322        block_group->disk_cache_state = dcs;
3323        spin_unlock(&block_group->lock);
3324
3325        return ret;
3326}
3327
3328int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3329                            struct btrfs_root *root)
3330{
3331        struct btrfs_block_group_cache *cache, *tmp;
3332        struct btrfs_transaction *cur_trans = trans->transaction;
3333        struct btrfs_path *path;
3334
3335        if (list_empty(&cur_trans->dirty_bgs) ||
3336            !btrfs_test_opt(root, SPACE_CACHE))
3337                return 0;
3338
3339        path = btrfs_alloc_path();
3340        if (!path)
3341                return -ENOMEM;
3342
3343        /* Could add new block groups, use _safe just in case */
3344        list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3345                                 dirty_list) {
3346                if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3347                        cache_save_setup(cache, trans, path);
3348        }
3349
3350        btrfs_free_path(path);
3351        return 0;
3352}
3353
3354int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3355                                   struct btrfs_root *root)
3356{
3357        struct btrfs_block_group_cache *cache;
3358        struct btrfs_transaction *cur_trans = trans->transaction;
3359        int ret = 0;
3360        struct btrfs_path *path;
3361
3362        if (list_empty(&cur_trans->dirty_bgs))
3363                return 0;
3364
3365        path = btrfs_alloc_path();
3366        if (!path)
3367                return -ENOMEM;
3368
3369        /*
3370         * We don't need the lock here since we are protected by the transaction
3371         * commit.  We want to do the cache_save_setup first and then run the
3372         * delayed refs to make sure we have the best chance at doing this all
3373         * in one shot.
3374         */
3375        while (!list_empty(&cur_trans->dirty_bgs)) {
3376                cache = list_first_entry(&cur_trans->dirty_bgs,
3377                                         struct btrfs_block_group_cache,
3378                                         dirty_list);
3379                list_del_init(&cache->dirty_list);
3380                if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3381                        cache_save_setup(cache, trans, path);
3382                if (!ret)
3383                        ret = btrfs_run_delayed_refs(trans, root,
3384                                                     (unsigned long) -1);
3385                if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
3386                        btrfs_write_out_cache(root, trans, cache, path);
3387                if (!ret)
3388                        ret = write_one_cache_group(trans, root, path, cache);
3389                btrfs_put_block_group(cache);
3390        }
3391
3392        btrfs_free_path(path);
3393        return ret;
3394}
3395
3396int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3397{
3398        struct btrfs_block_group_cache *block_group;
3399        int readonly = 0;
3400
3401        block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3402        if (!block_group || block_group->ro)
3403                readonly = 1;
3404        if (block_group)
3405                btrfs_put_block_group(block_group);
3406        return readonly;
3407}
3408
3409static const char *alloc_name(u64 flags)
3410{
3411        switch (flags) {
3412        case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3413                return "mixed";
3414        case BTRFS_BLOCK_GROUP_METADATA:
3415                return "metadata";
3416        case BTRFS_BLOCK_GROUP_DATA:
3417                return "data";
3418        case BTRFS_BLOCK_GROUP_SYSTEM:
3419                return "system";
3420        default:
3421                WARN_ON(1);
3422                return "invalid-combination";
3423        }
3424}
3425
3426static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3427                             u64 total_bytes, u64 bytes_used,
3428                             struct btrfs_space_info **space_info)
3429{
3430        struct btrfs_space_info *found;
3431        int i;
3432        int factor;
3433        int ret;
3434
3435        if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3436                     BTRFS_BLOCK_GROUP_RAID10))
3437                factor = 2;
3438        else
3439                factor = 1;
3440
3441        found = __find_space_info(info, flags);
3442        if (found) {
3443                spin_lock(&found->lock);
3444                found->total_bytes += total_bytes;
3445                found->disk_total += total_bytes * factor;
3446                found->bytes_used += bytes_used;
3447                found->disk_used += bytes_used * factor;
3448                found->full = 0;
3449                spin_unlock(&found->lock);
3450                *space_info = found;
3451                return 0;
3452        }
3453        found = kzalloc(sizeof(*found), GFP_NOFS);
3454        if (!found)
3455                return -ENOMEM;
3456
3457        ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
3458        if (ret) {
3459                kfree(found);
3460                return ret;
3461        }
3462
3463        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3464                INIT_LIST_HEAD(&found->block_groups[i]);
3465        init_rwsem(&found->groups_sem);
3466        spin_lock_init(&found->lock);
3467        found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3468        found->total_bytes = total_bytes;
3469        found->disk_total = total_bytes * factor;
3470        found->bytes_used = bytes_used;
3471        found->disk_used = bytes_used * factor;
3472        found->bytes_pinned = 0;
3473        found->bytes_reserved = 0;
3474        found->bytes_readonly = 0;
3475        found->bytes_may_use = 0;
3476        found->full = 0;
3477        found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3478        found->chunk_alloc = 0;
3479        found->flush = 0;
3480        init_waitqueue_head(&found->wait);
3481        INIT_LIST_HEAD(&found->ro_bgs);
3482
3483        ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
3484                                    info->space_info_kobj, "%s",
3485                                    alloc_name(found->flags));
3486        if (ret) {
3487                kfree(found);
3488                return ret;
3489        }
3490
3491        *space_info = found;
3492        list_add_rcu(&found->list, &info->space_info);
3493        if (flags & BTRFS_BLOCK_GROUP_DATA)
3494                info->data_sinfo = found;
3495
3496        return ret;
3497}
3498
3499static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3500{
3501        u64 extra_flags = chunk_to_extended(flags) &
3502                                BTRFS_EXTENDED_PROFILE_MASK;
3503
3504        write_seqlock(&fs_info->profiles_lock);
3505        if (flags & BTRFS_BLOCK_GROUP_DATA)
3506                fs_info->avail_data_alloc_bits |= extra_flags;
3507        if (flags & BTRFS_BLOCK_GROUP_METADATA)
3508                fs_info->avail_metadata_alloc_bits |= extra_flags;
3509        if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3510                fs_info->avail_system_alloc_bits |= extra_flags;
3511        write_sequnlock(&fs_info->profiles_lock);
3512}
3513
3514/*
3515 * returns target flags in extended format or 0 if restripe for this
3516 * chunk_type is not in progress
3517 *
3518 * should be called with either volume_mutex or balance_lock held
3519 */
3520static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3521{
3522        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3523        u64 target = 0;
3524
3525        if (!bctl)
3526                return 0;
3527
3528        if (flags & BTRFS_BLOCK_GROUP_DATA &&
3529            bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3530                target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3531        } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3532                   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3533                target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3534        } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3535                   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3536                target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3537        }
3538
3539        return target;
3540}
3541
3542/*
3543 * @flags: available profiles in extended format (see ctree.h)
3544 *
3545 * Returns reduced profile in chunk format.  If profile changing is in
3546 * progress (either running or paused) picks the target profile (if it's
3547 * already available), otherwise falls back to plain reducing.
3548 */
3549static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3550{
3551        u64 num_devices = root->fs_info->fs_devices->rw_devices;
3552        u64 target;
3553        u64 tmp;
3554
3555        /*
3556         * see if restripe for this chunk_type is in progress, if so
3557         * try to reduce to the target profile
3558         */
3559        spin_lock(&root->fs_info->balance_lock);
3560        target = get_restripe_target(root->fs_info, flags);
3561        if (target) {
3562                /* pick target profile only if it's already available */
3563                if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3564                        spin_unlock(&root->fs_info->balance_lock);
3565                        return extended_to_chunk(target);
3566                }
3567        }
3568        spin_unlock(&root->fs_info->balance_lock);
3569
3570        /* First, mask out the RAID levels which aren't possible */
3571        if (num_devices == 1)
3572                flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
3573                           BTRFS_BLOCK_GROUP_RAID5);
3574        if (num_devices < 3)
3575                flags &= ~BTRFS_BLOCK_GROUP_RAID6;
3576        if (num_devices < 4)
3577                flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3578
3579        tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3580                       BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
3581                       BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
3582        flags &= ~tmp;
3583
3584        if (tmp & BTRFS_BLOCK_GROUP_RAID6)
3585                tmp = BTRFS_BLOCK_GROUP_RAID6;
3586        else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
3587                tmp = BTRFS_BLOCK_GROUP_RAID5;
3588        else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
3589                tmp = BTRFS_BLOCK_GROUP_RAID10;
3590        else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
3591                tmp = BTRFS_BLOCK_GROUP_RAID1;
3592        else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
3593                tmp = BTRFS_BLOCK_GROUP_RAID0;
3594
3595        return extended_to_chunk(flags | tmp);
3596}
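
/*
 * Worked example (editor's addition, not part of the original file): on a
 * two-device filesystem, reducing DATA | RAID10 | RAID1 | RAID0 first masks
 * out RAID10 (it needs four devices), then the if/else ladder above keeps
 * only the strongest remaining profile, so the function returns
 * DATA | RAID1 in chunk format.
 */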
3597
3598static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
3599{
3600        unsigned seq;
3601        u64 flags;
3602
3603        do {
3604                flags = orig_flags;
3605                seq = read_seqbegin(&root->fs_info->profiles_lock);
3606
3607                if (flags & BTRFS_BLOCK_GROUP_DATA)
3608                        flags |= root->fs_info->avail_data_alloc_bits;
3609                else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3610                        flags |= root->fs_info->avail_system_alloc_bits;
3611                else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3612                        flags |= root->fs_info->avail_metadata_alloc_bits;
3613        } while (read_seqretry(&root->fs_info->profiles_lock, seq));
3614
3615        return btrfs_reduce_alloc_profile(root, flags);
3616}
3617
3618u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3619{
3620        u64 flags;
3621        u64 ret;
3622
3623        if (data)
3624                flags = BTRFS_BLOCK_GROUP_DATA;
3625        else if (root == root->fs_info->chunk_root)
3626                flags = BTRFS_BLOCK_GROUP_SYSTEM;
3627        else
3628                flags = BTRFS_BLOCK_GROUP_METADATA;
3629
3630        ret = get_alloc_profile(root, flags);
3631        return ret;
3632}
3633
3634/*
3635 * This will check the space that the inode allocates from to make sure we have
3636 * enough space for bytes.
3637 */
3638int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3639{
3640        struct btrfs_space_info *data_sinfo;
3641        struct btrfs_root *root = BTRFS_I(inode)->root;
3642        struct btrfs_fs_info *fs_info = root->fs_info;
3643        u64 used;
3644        int ret = 0, committed = 0, alloc_chunk = 1;
3645
3646        /* make sure bytes are sectorsize aligned */
3647        bytes = ALIGN(bytes, root->sectorsize);
3648
3649        if (btrfs_is_free_space_inode(inode)) {
3650                committed = 1;
3651                ASSERT(current->journal_info);
3652        }
3653
3654        data_sinfo = fs_info->data_sinfo;
3655        if (!data_sinfo)
3656                goto alloc;
3657
3658again:
3659        /* make sure we have enough space to handle the data first */
3660        spin_lock(&data_sinfo->lock);
3661        used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3662                data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3663                data_sinfo->bytes_may_use;
3664
3665        if (used + bytes > data_sinfo->total_bytes) {
3666                struct btrfs_trans_handle *trans;
3667
3668                /*
3669                 * if we don't have enough free bytes in this space then we need
3670                 * to alloc a new chunk.
3671                 */
3672                if (!data_sinfo->full && alloc_chunk) {
3673                        u64 alloc_target;
3674
3675                        data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3676                        spin_unlock(&data_sinfo->lock);
3677alloc:
3678                        alloc_target = btrfs_get_alloc_profile(root, 1);
3679                        /*
3680                         * It is ugly that we don't call a nolock join
3681                         * transaction for the free space inode case here.
3682                         * But it is safe because we only do the data space
3683                         * reservation for the free space cache in the
3684                         * transaction context; the common join transaction
3685                         * just increases the counter of the current transaction
3686                         * handle and doesn't try to acquire the trans_lock of
3687                         * the fs.
3688                         */
3689                        trans = btrfs_join_transaction(root);
3690                        if (IS_ERR(trans))
3691                                return PTR_ERR(trans);
3692
3693                        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3694                                             alloc_target,
3695                                             CHUNK_ALLOC_NO_FORCE);
3696                        btrfs_end_transaction(trans, root);
3697                        if (ret < 0) {
3698                                if (ret != -ENOSPC)
3699                                        return ret;
3700                                else
3701                                        goto commit_trans;
3702                        }
3703
3704                        if (!data_sinfo)
3705                                data_sinfo = fs_info->data_sinfo;
3706
3707                        goto again;
3708                }
3709
3710                /*
3711                 * If we don't have enough pinned space to deal with this
3712                 * allocation don't bother committing the transaction.
3713                 */
3714                if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
3715                                           bytes) < 0)
3716                        committed = 1;
3717                spin_unlock(&data_sinfo->lock);
3718
3719                /* commit the current transaction and try again */
3720commit_trans:
3721                if (!committed &&
3722                    !atomic_read(&root->fs_info->open_ioctl_trans)) {
3723                        committed = 1;
3724
3725                        trans = btrfs_join_transaction(root);
3726                        if (IS_ERR(trans))
3727                                return PTR_ERR(trans);
3728                        ret = btrfs_commit_transaction(trans, root);
3729                        if (ret)
3730                                return ret;
3731                        goto again;
3732                }
3733
3734                trace_btrfs_space_reservation(root->fs_info,
3735                                              "space_info:enospc",
3736                                              data_sinfo->flags, bytes, 1);
3737                return -ENOSPC;
3738        }
3739        data_sinfo->bytes_may_use += bytes;
3740        trace_btrfs_space_reservation(root->fs_info, "space_info",
3741                                      data_sinfo->flags, bytes, 1);
3742        spin_unlock(&data_sinfo->lock);
3743
3744        return 0;
3745}
3746
3747/*
3748 * Called if we need to clear a data reservation for this inode.
3749 */
3750void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3751{
3752        struct btrfs_root *root = BTRFS_I(inode)->root;
3753        struct btrfs_space_info *data_sinfo;
3754
3755        /* make sure bytes are sectorsize aligned */
3756        bytes = ALIGN(bytes, root->sectorsize);
3757
3758        data_sinfo = root->fs_info->data_sinfo;
3759        spin_lock(&data_sinfo->lock);
3760        WARN_ON(data_sinfo->bytes_may_use < bytes);
3761        data_sinfo->bytes_may_use -= bytes;
3762        trace_btrfs_space_reservation(root->fs_info, "space_info",
3763                                      data_sinfo->flags, bytes, 0);
3764        spin_unlock(&data_sinfo->lock);
3765}
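
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the reserve/release pairing for data space, mirroring how
 * cache_save_setup() above brackets its preallocation.  The helper name and
 * byte count are placeholders and the work in between is elided.
 */
static inline int example_data_reservation(struct inode *inode, u64 bytes)
{
	int ret;

	ret = btrfs_check_data_free_space(inode, bytes);
	if (ret)
		return ret;	/* usually -ENOSPC */

	/* ... dirty pages or preallocate a range here ... */

	btrfs_free_reserved_data_space(inode, bytes);
	return 0;
}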
3766
3767static void force_metadata_allocation(struct btrfs_fs_info *info)
3768{
3769        struct list_head *head = &info->space_info;
3770        struct btrfs_space_info *found;
3771
3772        rcu_read_lock();
3773        list_for_each_entry_rcu(found, head, list) {
3774                if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3775                        found->force_alloc = CHUNK_ALLOC_FORCE;
3776        }
3777        rcu_read_unlock();
3778}
3779
3780static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
3781{
3782        return (global->size << 1);
3783}
3784
3785static int should_alloc_chunk(struct btrfs_root *root,
3786                              struct btrfs_space_info *sinfo, int force)
3787{
3788        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3789        u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3790        u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3791        u64 thresh;
3792
3793        if (force == CHUNK_ALLOC_FORCE)
3794                return 1;
3795
3796        /*
3797         * We need to take into account the global rsv because for all intents
3798         * and purposes it's used space.  Don't worry about locking the
3799         * global_rsv, it doesn't change except when the transaction commits.
3800         */
3801        if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
3802                num_allocated += calc_global_rsv_need_space(global_rsv);
3803
3804        /*
3805         * in limited mode, we want to have some free space up to
3806         * about 1% of the FS size.
3807         */
3808        if (force == CHUNK_ALLOC_LIMITED) {
3809                thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3810                thresh = max_t(u64, 64 * 1024 * 1024,
3811                               div_factor_fine(thresh, 1));
3812
3813                if (num_bytes - num_allocated < thresh)
3814                        return 1;
3815        }
3816
3817        if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
3818                return 0;
3819        return 1;
3820}
3821
3822static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3823{
3824        u64 num_dev;
3825
3826        if (type & (BTRFS_BLOCK_GROUP_RAID10 |
3827                    BTRFS_BLOCK_GROUP_RAID0 |
3828                    BTRFS_BLOCK_GROUP_RAID5 |
3829                    BTRFS_BLOCK_GROUP_RAID6))
3830                num_dev = root->fs_info->fs_devices->rw_devices;
3831        else if (type & BTRFS_BLOCK_GROUP_RAID1)
3832                num_dev = 2;
3833        else
3834                num_dev = 1;    /* DUP or single */
3835
3836        /* metadata for updating devices and the chunk tree */
3837        return btrfs_calc_trans_metadata_size(root, num_dev + 1);
3838}
3839
3840static void check_system_chunk(struct btrfs_trans_handle *trans,
3841                               struct btrfs_root *root, u64 type)
3842{
3843        struct btrfs_space_info *info;
3844        u64 left;
3845        u64 thresh;
3846
3847        info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3848        spin_lock(&info->lock);
3849        left = info->total_bytes - info->bytes_used - info->bytes_pinned -
3850                info->bytes_reserved - info->bytes_readonly;
3851        spin_unlock(&info->lock);
3852
3853        thresh = get_system_chunk_thresh(root, type);
3854        if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
3855                btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
3856                        left, thresh, type);
3857                dump_space_info(info, 0, 0);
3858        }
3859
3860        if (left < thresh) {
3861                u64 flags;
3862
3863                flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
3864                btrfs_alloc_chunk(trans, root, flags);
3865        }
3866}
3867
3868static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3869                          struct btrfs_root *extent_root, u64 flags, int force)
3870{
3871        struct btrfs_space_info *space_info;
3872        struct btrfs_fs_info *fs_info = extent_root->fs_info;
3873        int wait_for_alloc = 0;
3874        int ret = 0;
3875
3876        /* Don't re-enter if we're already allocating a chunk */
3877        if (trans->allocating_chunk)
3878                return -ENOSPC;
3879
3880        space_info = __find_space_info(extent_root->fs_info, flags);
3881        if (!space_info) {
3882                ret = update_space_info(extent_root->fs_info, flags,
3883                                        0, 0, &space_info);
3884                BUG_ON(ret); /* -ENOMEM */
3885        }
3886        BUG_ON(!space_info); /* Logic error */
3887
3888again:
3889        spin_lock(&space_info->lock);
3890        if (force < space_info->force_alloc)
3891                force = space_info->force_alloc;
3892        if (space_info->full) {
3893                if (should_alloc_chunk(extent_root, space_info, force))
3894                        ret = -ENOSPC;
3895                else
3896                        ret = 0;
3897                spin_unlock(&space_info->lock);
3898                return ret;
3899        }
3900
3901        if (!should_alloc_chunk(extent_root, space_info, force)) {
3902                spin_unlock(&space_info->lock);
3903                return 0;
3904        } else if (space_info->chunk_alloc) {
3905                wait_for_alloc = 1;
3906        } else {
3907                space_info->chunk_alloc = 1;
3908        }
3909
3910        spin_unlock(&space_info->lock);
3911
3912        mutex_lock(&fs_info->chunk_mutex);
3913
3914        /*
3915         * The chunk_mutex is held throughout the entirety of a chunk
3916         * allocation, so once we've acquired the chunk_mutex we know that the
3917         * other guy is done and we need to recheck and see if we should
3918         * allocate.
3919         */
3920        if (wait_for_alloc) {
3921                mutex_unlock(&fs_info->chunk_mutex);
3922                wait_for_alloc = 0;
3923                goto again;
3924        }
3925
3926        trans->allocating_chunk = true;
3927
3928        /*
3929         * If we have mixed data/metadata chunks we want to make sure we keep
3930         * allocating mixed chunks instead of individual chunks.
3931         */
3932        if (btrfs_mixed_space_info(space_info))
3933                flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3934
3935        /*
3936         * if we're doing a data chunk, go ahead and make sure that
3937         * we keep a reasonable number of metadata chunks allocated in the
3938         * FS as well.
3939         */
3940        if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3941                fs_info->data_chunk_allocations++;
3942                if (!(fs_info->data_chunk_allocations %
3943                      fs_info->metadata_ratio))
3944                        force_metadata_allocation(fs_info);
3945        }
3946
3947        /*
3948         * Check if we have enough space in SYSTEM chunk because we may need
3949         * to update devices.
3950         */
3951        check_system_chunk(trans, extent_root, flags);
3952
3953        ret = btrfs_alloc_chunk(trans, extent_root, flags);
3954        trans->allocating_chunk = false;
3955
3956        spin_lock(&space_info->lock);
3957        if (ret < 0 && ret != -ENOSPC)
3958                goto out;
3959        if (ret)
3960                space_info->full = 1;
3961        else
3962                ret = 1;
3963
3964        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3965out:
3966        space_info->chunk_alloc = 0;
3967        spin_unlock(&space_info->lock);
3968        mutex_unlock(&fs_info->chunk_mutex);
3969        return ret;
3970}
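
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * forcing a metadata chunk allocation.  Per the code above, the return
 * value is 1 if a chunk was allocated, 0 if none was needed, and negative
 * on error, so callers usually treat ret >= 0 as success.  The helper name
 * is a placeholder.
 */
static inline int example_force_metadata_chunk(struct btrfs_trans_handle *trans,
					       struct btrfs_root *root)
{
	u64 flags = btrfs_get_alloc_profile(root, 0);	/* metadata profile */

	return do_chunk_alloc(trans, root->fs_info->extent_root, flags,
			      CHUNK_ALLOC_FORCE);
}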
3971
3972static int can_overcommit(struct btrfs_root *root,
3973                          struct btrfs_space_info *space_info, u64 bytes,
3974                          enum btrfs_reserve_flush_enum flush)
3975{
3976        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3977        u64 profile = btrfs_get_alloc_profile(root, 0);
3978        u64 space_size;
3979        u64 avail;
3980        u64 used;
3981
3982        used = space_info->bytes_used + space_info->bytes_reserved +
3983                space_info->bytes_pinned + space_info->bytes_readonly;
3984
3985        /*
3986         * We only want to allow over committing if we have lots of actual space
3987         * free, but if we don't have enough space to handle the global reserve
3988         * space then we could end up having a real enospc problem when trying
3989         * to allocate a chunk or some other such important allocation.
3990         */
3991        spin_lock(&global_rsv->lock);
3992        space_size = calc_global_rsv_need_space(global_rsv);
3993        spin_unlock(&global_rsv->lock);
3994        if (used + space_size >= space_info->total_bytes)
3995                return 0;
3996
3997        used += space_info->bytes_may_use;
3998
3999        spin_lock(&root->fs_info->free_chunk_lock);
4000        avail = root->fs_info->free_chunk_space;
4001        spin_unlock(&root->fs_info->free_chunk_lock);
4002
4003        /*
4004         * If we have dup, raid1 or raid10 then only half of the free
4005         * space is actually usable.  For raid56, the space info used
4006         * doesn't include the parity drive, so we don't have to
4007         * change the math
4008         */
4009        if (profile & (BTRFS_BLOCK_GROUP_DUP |
4010                       BTRFS_BLOCK_GROUP_RAID1 |
4011                       BTRFS_BLOCK_GROUP_RAID10))
4012                avail >>= 1;
4013
4014        /*
4015         * If we aren't flushing all things, let us overcommit up to
4016         * half of the space.  If we can flush, don't let us overcommit
4017         * too much; let it overcommit up to 1/8 of the space.
4018         */
4019        if (flush == BTRFS_RESERVE_FLUSH_ALL)
4020                avail >>= 3;
4021        else
4022                avail >>= 1;
4023
4024        if (used + bytes < space_info->total_bytes + avail)
4025                return 1;
4026        return 0;
4027}
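
/*
 * Worked example (editor's addition, not part of the original file),
 * ignoring the global-reserve short circuit: total_bytes = 10GiB, used
 * (including bytes_may_use) = 9GiB, free_chunk_space = 8GiB on RAID1.
 * The usable unallocated space halves to 4GiB; with FLUSH_ALL only 1/8 of
 * that (512MiB) may be overcommitted, so a 2GiB reservation is refused
 * while a 256MiB one succeeds.
 */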
4028
4029static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
4030                                         unsigned long nr_pages, int nr_items)
4031{
4032        struct super_block *sb = root->fs_info->sb;
4033
4034        if (down_read_trylock(&sb->s_umount)) {
4035                writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4036                up_read(&sb->s_umount);
4037        } else {
4038                /*
4039                 * We needn't worry about the filesystem going from r/w to
4040                 * r/o even though we don't acquire the ->s_umount mutex,
4041                 * because the filesystem should guarantee the delalloc inode
4042                 * list is empty after the filesystem becomes read-only (all
4043                 * dirty pages have been written to disk).
4044                 */
4045                btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
4046                if (!current->journal_info)
4047                        btrfs_wait_ordered_roots(root->fs_info, nr_items);
4048        }
4049}
4050
4051static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
4052{
4053        u64 bytes;
4054        int nr;
4055
4056        bytes = btrfs_calc_trans_metadata_size(root, 1);
4057        nr = (int)div64_u64(to_reclaim, bytes);
4058        if (!nr)
4059                nr = 1;
4060        return nr;
4061}
4062
4063#define EXTENT_SIZE_PER_ITEM    (256 * 1024)
4064
4065/*
4066 * shrink metadata reservation for delalloc
4067 */
4068static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
4069                            bool wait_ordered)
4070{
4071        struct btrfs_block_rsv *block_rsv;
4072        struct btrfs_space_info *space_info;
4073        struct btrfs_trans_handle *trans;
4074        u64 delalloc_bytes;
4075        u64 max_reclaim;
4076        long time_left;
4077        unsigned long nr_pages;
4078        int loops;
4079        int items;
4080        enum btrfs_reserve_flush_enum flush;
4081
4082        /* Calc the number of items we need to flush for this space reservation */
4083        items = calc_reclaim_items_nr(root, to_reclaim);
4084        to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4085
4086        trans = (struct btrfs_trans_handle *)current->journal_info;
4087        block_rsv = &root->fs_info->delalloc_block_rsv;
4088        space_info = block_rsv->space_info;
4089
4090        delalloc_bytes = percpu_counter_sum_positive(
4091                                                &root->fs_info->delalloc_bytes);
4092        if (delalloc_bytes == 0) {
4093                if (trans)
4094                        return;
4095                if (wait_ordered)
4096                        btrfs_wait_ordered_roots(root->fs_info, items);
4097                return;
4098        }
4099
4100        loops = 0;
4101        while (delalloc_bytes && loops < 3) {
4102                max_reclaim = min(delalloc_bytes, to_reclaim);
4103                nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
4104                btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
4105                /*
4106                 * We need to wait for the async pages to actually start before
4107                 * we do anything.
4108                 */
4109                max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
4110                if (!max_reclaim)
4111                        goto skip_async;
4112
4113                if (max_reclaim <= nr_pages)
4114                        max_reclaim = 0;
4115                else
4116                        max_reclaim -= nr_pages;
4117
4118                wait_event(root->fs_info->async_submit_wait,
4119                           atomic_read(&root->fs_info->async_delalloc_pages) <=
4120                           (int)max_reclaim);
4121skip_async:
4122                if (!trans)
4123                        flush = BTRFS_RESERVE_FLUSH_ALL;
4124                else
4125                        flush = BTRFS_RESERVE_NO_FLUSH;
4126                spin_lock(&space_info->lock);
4127                if (can_overcommit(root, space_info, orig, flush)) {
4128                        spin_unlock(&space_info->lock);
4129                        break;
4130                }
4131                spin_unlock(&space_info->lock);
4132
4133                loops++;
4134                if (wait_ordered && !trans) {
4135                        btrfs_wait_ordered_roots(root->fs_info, items);
4136                } else {
4137                        time_left = schedule_timeout_killable(1);
4138                        if (time_left)
4139                                break;
4140                }
4141                delalloc_bytes = percpu_counter_sum_positive(
4142                                                &root->fs_info->delalloc_bytes);
4143        }
4144}
4145
4146/**
4147 * may_commit_transaction - possibly commit the transaction if it's OK to
4148 * @root - the root we're allocating for
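 * @space_info - the space_info we're allocating for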
4149 * @bytes - the number of bytes we want to reserve
4150 * @force - force the commit
4151 *
4152 * This will check to make sure that committing the transaction will actually
4153 * get us somewhere and then commit the transaction if it does.  Otherwise it
4154 * will return -ENOSPC.
4155 */
4156static int may_commit_transaction(struct btrfs_root *root,
4157                                  struct btrfs_space_info *space_info,
4158                                  u64 bytes, int force)
4159{
4160        struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
4161        struct btrfs_trans_handle *trans;
4162
4163        trans = (struct btrfs_trans_handle *)current->journal_info;
4164        if (trans)
4165                return -EAGAIN;
4166
4167        if (force)
4168                goto commit;
4169
4170        /* See if there is enough pinned space to make this reservation */
4171        if (percpu_counter_compare(&space_info->total_bytes_pinned,
4172                                   bytes) >= 0)
4173                goto commit;
4174
4175        /*
4176         * See if there is some space in the delayed insertion reservation for
4177         * this reservation, since a commit also releases the delayed items.
4178         */
4179        if (space_info != delayed_rsv->space_info)
4180                return -ENOSPC;
4181
4182        spin_lock(&delayed_rsv->lock);
4183        if (percpu_counter_compare(&space_info->total_bytes_pinned,
4184                                   bytes - delayed_rsv->size) < 0) {
4185                spin_unlock(&delayed_rsv->lock);
4186                return -ENOSPC;
4187        }
4188        spin_unlock(&delayed_rsv->lock);
4189
4190commit:
4191        trans = btrfs_join_transaction(root);
4192        if (IS_ERR(trans))
4193                return -ENOSPC;
4194
4195        return btrfs_commit_transaction(trans, root);
4196}
4197
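/*
 * Reclaim states, in the order reserve_metadata_bytes() escalates through
 * them.  Each step is progressively more expensive, ending with a full
 * transaction commit.
 */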
4198enum flush_state {
4199        FLUSH_DELAYED_ITEMS_NR  =       1,
4200        FLUSH_DELAYED_ITEMS     =       2,
4201        FLUSH_DELALLOC          =       3,
4202        FLUSH_DELALLOC_WAIT     =       4,
4203        ALLOC_CHUNK             =       5,
4204        COMMIT_TRANS            =       6,
4205};
4206
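/*
 * Run a single reclaim step @state against @space_info, trying to make
 * @num_bytes (from an original request of @orig_bytes) reclaimable.
 * Returns 0 on success and a negative errno (typically -ENOSPC) if the
 * step could not be run.
 */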
4207static int flush_space(struct btrfs_root *root,
4208                       struct btrfs_space_info *space_info, u64 num_bytes,
4209                       u64 orig_bytes, int state)
4210{
4211        struct btrfs_trans_handle *trans;
4212        int nr;
4213        int ret = 0;
4214
4215        switch (state) {
4216        case FLUSH_DELAYED_ITEMS_NR:
4217        case FLUSH_DELAYED_ITEMS:
4218                if (state == FLUSH_DELAYED_ITEMS_NR)
4219                        nr = calc_reclaim_items_nr(root, num_bytes) * 2;
4220                else
4221                        nr = -1;
4222
4223                trans = btrfs_join_transaction(root);
4224                if (IS_ERR(trans)) {
4225                        ret = PTR_ERR(trans);
4226                        break;
4227                }
4228                ret = btrfs_run_delayed_items_nr(trans, root, nr);
4229                btrfs_end_transaction(trans, root);
4230                break;
4231        case FLUSH_DELALLOC:
4232        case FLUSH_DELALLOC_WAIT:
4233                shrink_delalloc(root, num_bytes * 2, orig_bytes,
4234                                state == FLUSH_DELALLOC_WAIT);
4235                break;
4236        case ALLOC_CHUNK:
4237                trans = btrfs_join_transaction(root);
4238                if (IS_ERR(trans)) {
4239                        ret = PTR_ERR(trans);
4240                        break;
4241                }
4242                ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4243                                     btrfs_get_alloc_profile(root, 0),
4244                                     CHUNK_ALLOC_NO_FORCE);
4245                btrfs_end_transaction(trans, root);
4246                if (ret == -ENOSPC)
4247                        ret = 0;
4248                break;
4249        case COMMIT_TRANS:
4250                ret = may_commit_transaction(root, space_info, orig_bytes, 0);
4251                break;
4252        default:
4253                ret = -ENOSPC;
4254                break;
4255        }
4256
4257        return ret;
4258}
4259
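/*
 * Work out how many bytes the async reclaim worker should try to free for
 * @space_info.  The target starts at 1M per online CPU (capped at 16M);
 * once overcommit is no longer possible it grows to however far usage sits
 * above ~90-95% of the total, clamped to what is actually reclaimable
 * (bytes_may_use + bytes_reserved).
 */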
4260static inline u64
4261btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
4262                                 struct btrfs_space_info *space_info)
4263{
4264        u64 used;
4265        u64 expected;
4266        u64 to_reclaim;
4267
4268        to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
4269                                16 * 1024 * 1024);
4270        spin_lock(&space_info->lock);
4271        if (can_overcommit(root, space_info, to_reclaim,
4272                           BTRFS_RESERVE_FLUSH_ALL)) {
4273                to_reclaim = 0;
4274                goto out;
4275        }
4276
4277        used = space_info->bytes_used + space_info->bytes_reserved +
4278               space_info->bytes_pinned + space_info->bytes_readonly +
4279               space_info->bytes_may_use;
4280        if (can_overcommit(root, space_info, 1024 * 1024,
4281                           BTRFS_RESERVE_FLUSH_ALL))
4282                expected = div_factor_fine(space_info->total_bytes, 95);
4283        else
4284                expected = div_factor_fine(space_info->total_bytes, 90);
4285
4286        if (used > expected)
4287                to_reclaim = used - expected;
4288        else
4289                to_reclaim = 0;
4290        to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4291                                     space_info->bytes_reserved);
4292out:
4293        spin_unlock(&space_info->lock);
4294
4295        return to_reclaim;
4296}
4297
4298static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
4299                                        struct btrfs_fs_info *fs_info, u64 used)
4300{
4301        return (used >= div_factor_fine(space_info->total_bytes, 98) &&
4302                !btrfs_fs_closing(fs_info) &&
4303                !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
4304}
4305
4306static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
4307                                       struct btrfs_fs_info *fs_info,
4308                                       int flush_state)
4309{
4310        u64 used;
4311
4312        spin_lock(&space_info->lock);
4313        /*
4314         * We've run out of space and flush_space has not recovered any of
4315         * it, so don't bother doing async reclaim.
4316         */
4317        if (flush_state > COMMIT_TRANS && space_info->full) {
4318                spin_unlock(&space_info->lock);
4319                return 0;
4320        }
4321
4322        used = space_info->bytes_used + space_info->bytes_reserved +
4323               space_info->bytes_pinned + space_info->bytes_readonly +
4324               space_info->bytes_may_use;
4325        if (need_do_async_reclaim(space_info, fs_info, used)) {
4326                spin_unlock(&space_info->lock);
4327                return 1;
4328        }
4329        spin_unlock(&space_info->lock);
4330
4331        return 0;
4332}
4333
4334static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4335{
4336        struct btrfs_fs_info *fs_info;
4337        struct btrfs_space_info *space_info;
4338        u64 to_reclaim;
4339        int flush_state;
4340
4341        fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
4342        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4343
4344        to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
4345                                                      space_info);
4346        if (!to_reclaim)
4347                return;
4348
4349        flush_state = FLUSH_DELAYED_ITEMS_NR;
4350        do {
4351                flush_space(fs_info->fs_root, space_info, to_reclaim,
4352                            to_reclaim, flush_state);
4353                flush_state++;
4354                if (!btrfs_need_do_async_reclaim(space_info, fs_info,
4355                                                 flush_state))
4356                        return;
4357        } while (flush_state <= COMMIT_TRANS);
4358
4359        if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state))
4360                queue_work(system_unbound_wq, work);
4361}
4362
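/*
 * Illustrative setup (the call site lives outside this file; during mount
 * one would do something like):
 *
 *	btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
 */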
4363void btrfs_init_async_reclaim_work(struct work_struct *work)
4364{
4365        INIT_WORK(work, btrfs_async_reclaim_metadata_space);
4366}
4367
4368/**
4369 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
4370 * @root - the root we're allocating for
4371 * @block_rsv - the block_rsv we're allocating for
4372 * @orig_bytes - the number of bytes we want
4373 * @flush - whether or not we can flush to make our reservation
4374 *
4375 * This will reserve orig_bytes number of bytes from the space info associated
4376 * with the block_rsv.  If there is not enough space it will make an attempt to
4377 * flush out space to make room.  It will do this by flushing delalloc if
4378 * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
4379 * then no attempts to regain reservations will be made and this will fail if
4380 * there is not enough space already.
4381 */
4382static int reserve_metadata_bytes(struct btrfs_root *root,
4383                                  struct btrfs_block_rsv *block_rsv,
4384                                  u64 orig_bytes,
4385                                  enum btrfs_reserve_flush_enum flush)
4386{
4387        struct btrfs_space_info *space_info = block_rsv->space_info;
4388        u64 used;
4389        u64 num_bytes = orig_bytes;
4390        int flush_state = FLUSH_DELAYED_ITEMS_NR;
4391        int ret = 0;
4392        bool flushing = false;
4393
4394again:
4395        ret = 0;
4396        spin_lock(&space_info->lock);
4397        /*
4398         * We only want to wait if somebody other than us is flushing and we
4399         * are actually allowed to flush all things.
4400         */
4401        while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
4402               space_info->flush) {
4403                spin_unlock(&space_info->lock);
4404                /*
4405                 * If we have a trans handle we can't wait because the flusher
4406                 * may have to commit the transaction, which would mean we would
4407                 * deadlock since we are waiting for the flusher to finish, but
4408                 * hold the current transaction open.
4409                 */
4410                if (current->journal_info)
4411                        return -EAGAIN;
4412                ret = wait_event_killable(space_info->wait, !space_info->flush);
4413                /* Must have been killed, return */
4414                if (ret)
4415                        return -EINTR;
4416
4417                spin_lock(&space_info->lock);
4418        }
4419
4420        ret = -ENOSPC;
4421        used = space_info->bytes_used + space_info->bytes_reserved +
4422                space_info->bytes_pinned + space_info->bytes_readonly +
4423                space_info->bytes_may_use;
4424
4425        /*
4426         * The idea here is that if we haven't already over-reserved the
4427         * block group then we can go ahead and save our reservation first
4428         * and then start flushing if we need to.  Otherwise, if we've
4429         * already overcommitted, let's start flushing stuff first and then
4430         * come back and try to make our reservation.
4431         */
4432        if (used <= space_info->total_bytes) {
4433                if (used + orig_bytes <= space_info->total_bytes) {
4434                        space_info->bytes_may_use += orig_bytes;
4435                        trace_btrfs_space_reservation(root->fs_info,
4436                                "space_info", space_info->flags, orig_bytes, 1);
4437                        ret = 0;
4438                } else {
4439                        /*
4440                         * Ok, set num_bytes to orig_bytes since we aren't
4441                         * overcommitted; this way we only try to reclaim what
4442                         * we need.
4443                         */
4444                        num_bytes = orig_bytes;
4445                }
4446        } else {
4447                /*
4448                 * Ok, we're overcommitted; set num_bytes to the overcommitted
4449                 * amount plus the amount of bytes that we need for this
4450                 * reservation.
4451                 */
4452                num_bytes = used - space_info->total_bytes +
4453                        (orig_bytes * 2);
4454        }
4455
4456        if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
4457                space_info->bytes_may_use += orig_bytes;
4458                trace_btrfs_space_reservation(root->fs_info, "space_info",
4459                                              space_info->flags, orig_bytes,
4460                                              1);
4461                ret = 0;
4462        }
4463
4464        /*
4465         * Couldn't make our reservation, save our place so while we're trying
4466         * to reclaim space we can actually use it instead of somebody else
4467         * stealing it from us.
4468         *
4469         * We make the other tasks wait for the flush only when we can flush
4470         * all things.
4471         */
4472        if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
4473                flushing = true;
4474                space_info->flush = 1;
4475        } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
4476                used += orig_bytes;
4477                /*
4478                 * We will do the space reservation dance during log replay,
4479                 * which means we won't have fs_info->fs_root set, so don't do
4480                 * the async reclaim as we will panic.
4481                 */
4482                if (!root->fs_info->log_root_recovering &&
4483                    need_do_async_reclaim(space_info, root->fs_info, used) &&
4484                    !work_busy(&root->fs_info->async_reclaim_work))
4485                        queue_work(system_unbound_wq,
4486                                   &root->fs_info->async_reclaim_work);
4487        }
4488        spin_unlock(&space_info->lock);
4489
4490        if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
4491                goto out;
4492
4493        ret = flush_space(root, space_info, num_bytes, orig_bytes,
4494                          flush_state);
4495        flush_state++;
4496
4497        /*
4498         * If we are FLUSH_LIMIT, we cannot flush delalloc, or a deadlock
4499         * could happen.  So skip the delalloc flush states.
4500         */
4501        if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4502            (flush_state == FLUSH_DELALLOC ||
4503             flush_state == FLUSH_DELALLOC_WAIT))
4504                flush_state = ALLOC_CHUNK;
4505
4506        if (!ret)
4507                goto again;
4508        else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4509                 flush_state < COMMIT_TRANS)
4510                goto again;
4511        else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
4512                 flush_state <= COMMIT_TRANS)
4513                goto again;
4514
4515out:
4516        if (ret == -ENOSPC &&
4517            unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
4518                struct btrfs_block_rsv *global_rsv =
4519                        &root->fs_info->global_block_rsv;
4520
4521                if (block_rsv != global_rsv &&
4522                    !block_rsv_use_bytes(global_rsv, orig_bytes))
4523                        ret = 0;
4524        }
4525        if (ret == -ENOSPC)
4526                trace_btrfs_space_reservation(root->fs_info,
4527                                              "space_info:enospc",
4528                                              space_info->flags, orig_bytes, 1);
4529        if (flushing) {
4530                spin_lock(&space_info->lock);
4531                space_info->flush = 0;
4532                wake_up_all(&space_info->wait);
4533                spin_unlock(&space_info->lock);
4534        }
4535        return ret;
4536}
4537
4538static struct btrfs_block_rsv *get_block_rsv(
4539                                        const struct btrfs_trans_handle *trans,
4540                                        const struct btrfs_root *root)
4541{
4542        struct btrfs_block_rsv *block_rsv = NULL;
4543
4544        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4545                block_rsv = trans->block_rsv;
4546
4547        if (root == root->fs_info->csum_root && trans->adding_csums)
4548                block_rsv = trans->block_rsv;
4549
4550        if (root == root->fs_info->uuid_root)
4551                block_rsv = trans->block_rsv;
4552
4553        if (!block_rsv)
4554                block_rsv = root->block_rsv;
4555
4556        if (!block_rsv)
4557                block_rsv = &root->fs_info->empty_block_rsv;
4558
4559        return block_rsv;
4560}
4561
4562static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
4563                               u64 num_bytes)
4564{
4565        int ret = -ENOSPC;
4566        spin_lock(&block_rsv->lock);
4567        if (block_rsv->reserved >= num_bytes) {
4568                block_rsv->reserved -= num_bytes;
4569                if (block_rsv->reserved < block_rsv->size)
4570                        block_rsv->full = 0;
4571                ret = 0;
4572        }
4573        spin_unlock(&block_rsv->lock);
4574        return ret;
4575}
4576
4577static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4578                                u64 num_bytes, int update_size)
4579{
4580        spin_lock(&block_rsv->lock);
4581        block_rsv->reserved += num_bytes;
4582        if (update_size)
4583                block_rsv->size += num_bytes;
4584        else if (block_rsv->reserved >= block_rsv->size)
4585                block_rsv->full = 1;
4586        spin_unlock(&block_rsv->lock);
4587}
4588
4589int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
4590                             struct btrfs_block_rsv *dest, u64 num_bytes,
4591                             int min_factor)
4592{
4593        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4594        u64 min_bytes;
4595
4596        if (global_rsv->space_info != dest->space_info)
4597                return -ENOSPC;
4598
4599        spin_lock(&global_rsv->lock);
4600        min_bytes = div_factor(global_rsv->size, min_factor);
4601        if (global_rsv->reserved < min_bytes + num_bytes) {
4602                spin_unlock(&global_rsv->lock);
4603                return -ENOSPC;
4604        }
4605        global_rsv->reserved -= num_bytes;
4606        if (global_rsv->reserved < global_rsv->size)
4607                global_rsv->full = 0;
4608        spin_unlock(&global_rsv->lock);
4609
4610        block_rsv_add_bytes(dest, num_bytes, 1);
4611        return 0;
4612}
4613
4614static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4615                                    struct btrfs_block_rsv *block_rsv,
4616                                    struct btrfs_block_rsv *dest, u64 num_bytes)
4617{
4618        struct btrfs_space_info *space_info = block_rsv->space_info;
4619
4620        spin_lock(&block_rsv->lock);
4621        if (num_bytes == (u64)-1)
4622                num_bytes = block_rsv->size;
4623        block_rsv->size -= num_bytes;
4624        if (block_rsv->reserved >= block_rsv->size) {
4625                num_bytes = block_rsv->reserved - block_rsv->size;
4626                block_rsv->reserved = block_rsv->size;
4627                block_rsv->full = 1;
4628        } else {
4629                num_bytes = 0;
4630        }
4631        spin_unlock(&block_rsv->lock);
4632
4633        if (num_bytes > 0) {
4634                if (dest) {
4635                        spin_lock(&dest->lock);
4636                        if (!dest->full) {
4637                                u64 bytes_to_add;
4638
4639                                bytes_to_add = dest->size - dest->reserved;
4640                                bytes_to_add = min(num_bytes, bytes_to_add);
4641                                dest->reserved += bytes_to_add;
4642                                if (dest->reserved >= dest->size)
4643                                        dest->full = 1;
4644                                num_bytes -= bytes_to_add;
4645                        }
4646                        spin_unlock(&dest->lock);
4647                }
4648                if (num_bytes) {
4649                        spin_lock(&space_info->lock);
4650                        space_info->bytes_may_use -= num_bytes;
4651                        trace_btrfs_space_reservation(fs_info, "space_info",
4652                                        space_info->flags, num_bytes, 0);
4653                        spin_unlock(&space_info->lock);
4654                }
4655        }
4656}
4657
4658static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4659                                   struct btrfs_block_rsv *dst, u64 num_bytes)
4660{
4661        int ret;
4662
4663        ret = block_rsv_use_bytes(src, num_bytes);
4664        if (ret)
4665                return ret;
4666
4667        block_rsv_add_bytes(dst, num_bytes, 1);
4668        return 0;
4669}
4670
4671void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
4672{
4673        memset(rsv, 0, sizeof(*rsv));
4674        spin_lock_init(&rsv->lock);
4675        rsv->type = type;
4676}
4677
4678struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
4679                                              unsigned short type)
4680{
4681        struct btrfs_block_rsv *block_rsv;
4682        struct btrfs_fs_info *fs_info = root->fs_info;
4683
4684        block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
4685        if (!block_rsv)
4686                return NULL;
4687
4688        btrfs_init_block_rsv(block_rsv, type);
4689        block_rsv->space_info = __find_space_info(fs_info,
4690                                                  BTRFS_BLOCK_GROUP_METADATA);
4691        return block_rsv;
4692}
4693
4694void btrfs_free_block_rsv(struct btrfs_root *root,
4695                          struct btrfs_block_rsv *rsv)
4696{
4697        if (!rsv)
4698                return;
4699        btrfs_block_rsv_release(root, rsv, (u64)-1);
4700        kfree(rsv);
4701}
4702
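/*
 * Illustrative lifecycle of a temporary reservation (a sketch, not a
 * caller copied from this file):
 *
 *	struct btrfs_block_rsv *rsv;
 *	int ret;
 *
 *	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 *	if (!rsv)
 *		return -ENOMEM;
 *	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
 *				  BTRFS_RESERVE_FLUSH_ALL);
 *	if (!ret) {
 *		... consume the reserved space ...
 *	}
 *	btrfs_free_block_rsv(root, rsv);  (releases any remainder)
 */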
4703int btrfs_block_rsv_add(struct btrfs_root *root,
4704                        struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4705                        enum btrfs_reserve_flush_enum flush)
4706{
4707        int ret;
4708
4709        if (num_bytes == 0)
4710                return 0;
4711
4712        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4713        if (!ret) {
4714                block_rsv_add_bytes(block_rsv, num_bytes, 1);
4715                return 0;
4716        }
4717
4718        return ret;
4719}
4720
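/*
 * Check whether @block_rsv is at least @min_factor tenths full.  Since
 * div_factor() scales by factor/10, e.g. min_factor = 5 asks whether at
 * least half of the rsv's size is currently reserved; returns 0 if so,
 * -ENOSPC otherwise.
 */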
4721int btrfs_block_rsv_check(struct btrfs_root *root,
4722                          struct btrfs_block_rsv *block_rsv, int min_factor)
4723{
4724        u64 num_bytes = 0;
4725        int ret = -ENOSPC;
4726
4727        if (!block_rsv)
4728                return 0;
4729
4730        spin_lock(&block_rsv->lock);
4731        num_bytes = div_factor(block_rsv->size, min_factor);
4732        if (block_rsv->reserved >= num_bytes)
4733                ret = 0;
4734        spin_unlock(&block_rsv->lock);
4735
4736        return ret;
4737}
4738
4739int btrfs_block_rsv_refill(struct btrfs_root *root,
4740                           struct btrfs_block_rsv *block_rsv, u64 min_reserved,
4741                           enum btrfs_reserve_flush_enum flush)
4742{
4743        u64 num_bytes = 0;
4744        int ret = -ENOSPC;
4745
4746        if (!block_rsv)
4747                return 0;
4748
4749        spin_lock(&block_rsv->lock);
4750        num_bytes = min_reserved;
4751        if (block_rsv->reserved >= num_bytes)
4752                ret = 0;
4753        else
4754                num_bytes -= block_rsv->reserved;
4755        spin_unlock(&block_rsv->lock);
4756
4757        if (!ret)
4758                return 0;
4759
4760        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4761        if (!ret) {
4762                block_rsv_add_bytes(block_rsv, num_bytes, 0);
4763                return 0;
4764        }
4765
4766        return ret;
4767}
4768
4769int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4770                            struct btrfs_block_rsv *dst_rsv,
4771                            u64 num_bytes)
4772{
4773        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4774}
4775
4776void btrfs_block_rsv_release(struct btrfs_root *root,
4777                             struct btrfs_block_rsv *block_rsv,
4778                             u64 num_bytes)
4779{
4780        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4781        if (global_rsv == block_rsv ||
4782            block_rsv->space_info != global_rsv->space_info)
4783                global_rsv = NULL;
4784        block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4785                                num_bytes);
4786}
4787
4788/*
4789 * helper to calculate size of global block reservation.
4790 * the desired value is sum of space used by extent tree,
4791 * checksum tree and root tree
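 *
 * Rough worked example (assuming 4K blocks and 4-byte crc32c checksums):
 * 100G of data contributes about 200M for two copies of its csum items,
 * plus 2% of all used (data + metadata) space; the result is then capped
 * at a third of the metadata bytes in use and aligned up to nodesize << 10.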
4792 */
4793static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
4794{
4795        struct btrfs_space_info *sinfo;
4796        u64 num_bytes;
4797        u64 meta_used;
4798        u64 data_used;
4799        int csum_size = btrfs_super_csum_size(fs_info->super_copy);
4800
4801        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
4802        spin_lock(&sinfo->lock);
4803        data_used = sinfo->bytes_used;
4804        spin_unlock(&sinfo->lock);
4805
4806        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4807        spin_lock(&sinfo->lock);
4808        if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
4809                data_used = 0;
4810        meta_used = sinfo->bytes_used;
4811        spin_unlock(&sinfo->lock);
4812
4813        num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
4814                    csum_size * 2;
4815        num_bytes += div64_u64(data_used + meta_used, 50);
4816
4817        if (num_bytes * 3 > meta_used)
4818                num_bytes = div64_u64(meta_used, 3);
4819
4820        return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
4821}
4822
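/*
 * Resize the global block reservation to the calculated target (capped at
 * 512M) and top up ->reserved from whatever space in the metadata
 * space_info is not otherwise spoken for; any excess above the target is
 * handed back to bytes_may_use.
 */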
4823static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4824{
4825        struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
4826        struct btrfs_space_info *sinfo = block_rsv->space_info;
4827        u64 num_bytes;
4828
4829        num_bytes = calc_global_metadata_size(fs_info);
4830
4831        spin_lock(&sinfo->lock);
4832        spin_lock(&block_rsv->lock);
4833
4834        block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
4835
4836        num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
4837                    sinfo->bytes_reserved + sinfo->bytes_readonly +
4838                    sinfo->bytes_may_use;
4839
4840        if (sinfo->total_bytes > num_bytes) {
4841                num_bytes = sinfo->total_bytes - num_bytes;
4842                block_rsv->reserved += num_bytes;
4843                sinfo->bytes_may_use += num_bytes;
4844                trace_btrfs_space_reservation(fs_info, "space_info",
4845                                      sinfo->flags, num_bytes, 1);
4846        }
4847
4848        if (block_rsv->reserved >= block_rsv->size) {
4849                num_bytes = block_rsv->reserved - block_rsv->size;
4850                sinfo->bytes_may_use -= num_bytes;
4851                trace_btrfs_space_reservation(fs_info, "space_info",
4852                                      sinfo->flags, num_bytes, 0);
4853                block_rsv->reserved = block_rsv->size;
4854                block_rsv->full = 1;
4855        }
4856
4857        spin_unlock(&block_rsv->lock);
4858        spin_unlock(&sinfo->lock);
4859}
4860
4861static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4862{
4863        struct btrfs_space_info *space_info;
4864
4865        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4866        fs_info->chunk_block_rsv.space_info = space_info;
4867
4868        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4869        fs_info->global_block_rsv.space_info = space_info;
4870        fs_info->delalloc_block_rsv.space_info = space_info;
4871        fs_info->trans_block_rsv.space_info = space_info;
4872        fs_info->empty_block_rsv.space_info = space_info;
4873        fs_info->delayed_block_rsv.space_info = space_info;
4874
4875        fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
4876        fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
4877        fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
4878        fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
4879        if (fs_info->quota_root)
4880                fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
4881        fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
4882
4883        update_global_block_rsv(fs_info);
4884}
4885
4886static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4887{
4888        block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4889                                (u64)-1);
4890        WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4891        WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4892        WARN_ON(fs_info->trans_block_rsv.size > 0);
4893        WARN_ON(fs_info->trans_block_rsv.reserved > 0);
4894        WARN_ON(fs_info->chunk_block_rsv.size > 0);
4895        WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
4896        WARN_ON(fs_info->delayed_block_rsv.size > 0);
4897        WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
4898}
4899
4900void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4901                                  struct btrfs_root *root)
4902{
4903        if (!trans->block_rsv)
4904                return;
4905
4906        if (!trans->bytes_reserved)
4907                return;
4908
4909        trace_btrfs_space_reservation(root->fs_info, "transaction",
4910                                      trans->transid, trans->bytes_reserved, 0);
4911        btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4912        trans->bytes_reserved = 0;
4913}
4914
4915/* Can only return 0 or -ENOSPC */
4916int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4917                                  struct inode *inode)
4918{
4919        struct btrfs_root *root = BTRFS_I(inode)->root;
4920        struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4921        struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
4922
4923        /*
4924         * We need to hold space in order to delete our orphan item once we've
4925         * added it, so this takes the reservation so we can release it later
4926         * when we are truly done with the orphan item.
4927         */
4928        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4929        trace_btrfs_space_reservation(root->fs_info, "orphan",
4930                                      btrfs_ino(inode), num_bytes, 1);
4931        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4932}
4933
4934void btrfs_orphan_release_metadata(struct inode *inode)
4935{
4936        struct btrfs_root *root = BTRFS_I(inode)->root;
4937        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4938        trace_btrfs_space_reservation(root->fs_info, "orphan",
4939                                      btrfs_ino(inode), num_bytes, 0);
4940        btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4941}
4942
4943/*
4944 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
4945 * root: the root of the parent directory
4946 * rsv: block reservation
4947 * items: the number of items that we need to do the reservation for
4948 * qgroup_reserved: used to return the reserved size in qgroup
4949 *
4950 * This function is used to reserve the space for snapshot/subvolume
4951 * creation and deletion. Those operations differ from the common
4952 * file/directory operations: they change two fs/file trees and the
4953 * root tree, and the number of items that the qgroup reserves differs
4954 * from the free space reservation. So we can not use the space
4955 * reservation mechanism in start_transaction().
4956 */
4957int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
4958                                     struct btrfs_block_rsv *rsv,
4959                                     int items,
4960                                     u64 *qgroup_reserved,
4961                                     bool use_global_rsv)
4962{
4963        u64 num_bytes;
4964        int ret;
4965        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4966
4967        if (root->fs_info->quota_enabled) {
4968                /* One for parent inode, two for dir entries */
4969                num_bytes = 3 * root->nodesize;
4970                ret = btrfs_qgroup_reserve(root, num_bytes);
4971                if (ret)
4972                        return ret;
4973        } else {
4974                num_bytes = 0;
4975        }
4976
4977        *qgroup_reserved = num_bytes;
4978
4979        num_bytes = btrfs_calc_trans_metadata_size(root, items);
4980        rsv->space_info = __find_space_info(root->fs_info,
4981                                            BTRFS_BLOCK_GROUP_METADATA);
4982        ret = btrfs_block_rsv_add(root, rsv, num_bytes,
4983                                  BTRFS_RESERVE_FLUSH_ALL);
4984
4985        if (ret == -ENOSPC && use_global_rsv)
4986                ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
4987
4988        if (ret) {
4989                if (*qgroup_reserved)
4990                        btrfs_qgroup_free(root, *qgroup_reserved);
4991        }
4992
4993        return ret;
4994}
4995
4996void btrfs_subvolume_release_metadata(struct btrfs_root *root,
4997                                      struct btrfs_block_rsv *rsv,
4998                                      u64 qgroup_reserved)
4999{
5000        btrfs_block_rsv_release(root, rsv, (u64)-1);
5001        if (qgroup_reserved)
5002                btrfs_qgroup_free(root, qgroup_reserved);
5003}
5004
5005/**
5006 * drop_outstanding_extent - drop an outstanding extent
5007 * @inode: the inode we're dropping the extent for
5008 * @num_bytes: the number of bytes we're releasing.
5009 *
5010 * This is called when we are freeing up an outstanding extent, either called
5011 * after an error or after an extent is written.  This will return the number of
5012 * reserved extents that need to be freed.  This must be called with
5013 * BTRFS_I(inode)->lock held.
5014 */
5015static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
5016{
5017        unsigned drop_inode_space = 0;
5018        unsigned dropped_extents = 0;
5019        unsigned num_extents = 0;
5020
5021        num_extents = (unsigned)div64_u64(num_bytes +
5022                                          BTRFS_MAX_EXTENT_SIZE - 1,
5023                                          BTRFS_MAX_EXTENT_SIZE);
5024        ASSERT(num_extents);
5025        ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
5026        BTRFS_I(inode)->outstanding_extents -= num_extents;
5027
5028        if (BTRFS_I(inode)->outstanding_extents == 0 &&
5029            test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5030                               &BTRFS_I(inode)->runtime_flags))
5031                drop_inode_space = 1;
5032
5033        /*
5034         * If we have at least as many outstanding extents as we have
5035         * reserved then we need to leave the reserved extents count alone.
5036         */
5037        if (BTRFS_I(inode)->outstanding_extents >=
5038            BTRFS_I(inode)->reserved_extents)
5039                return drop_inode_space;
5040
5041        dropped_extents = BTRFS_I(inode)->reserved_extents -
5042                BTRFS_I(inode)->outstanding_extents;
5043        BTRFS_I(inode)->reserved_extents -= dropped_extents;
5044        return dropped_extents + drop_inode_space;
5045}
5046
5047/**
5048 * calc_csum_metadata_size - return the amount of metadata space that must be
5049 *      reserved/freed for the given bytes.
5050 * @inode: the inode we're manipulating
5051 * @num_bytes: the number of bytes in question
5052 * @reserve: 1 if we are reserving space, 0 if we are freeing space
5053 *
5054 * This adjusts the number of csum_bytes in the inode and then returns the
5055 * correct amount of metadata that must either be reserved or freed.  We
5056 * calculate how many checksums we can fit into one leaf and then divide the
5057 * number of bytes that will need to be checksummed by this value to figure out
5058 * how many checksums will be required.  If we are adding bytes then the number
5059 * may go up and we will return the number of additional bytes that must be
5060 * reserved.  If it is going down we will return the number of bytes that must
5061 * be freed.
5062 *
5063 * This must be called with BTRFS_I(inode)->lock held.
5064 */
5065static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
5066                                   int reserve)
5067{
5068        struct btrfs_root *root = BTRFS_I(inode)->root;
5069        u64 csum_size;
5070        int num_csums_per_leaf;
5071        int num_csums;
5072        int old_csums;
5073
5074        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
5075            BTRFS_I(inode)->csum_bytes == 0)
5076                return 0;
5077
5078        old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
5079        if (reserve)
5080                BTRFS_I(inode)->csum_bytes += num_bytes;
5081        else
5082                BTRFS_I(inode)->csum_bytes -= num_bytes;
5083        csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
5084        num_csums_per_leaf = (int)div64_u64(csum_size,
5085                                            sizeof(struct btrfs_csum_item) +
5086                                            sizeof(struct btrfs_disk_key));
5087        num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
5088        num_csums = num_csums + num_csums_per_leaf - 1;
5089        num_csums = num_csums / num_csums_per_leaf;
5090
5091        old_csums = old_csums + num_csums_per_leaf - 1;
5092        old_csums = old_csums / num_csums_per_leaf;
5093
5094        /* No change, no need to reserve more */
5095        if (old_csums == num_csums)
5096                return 0;
5097
5098        if (reserve)
5099                return btrfs_calc_trans_metadata_size(root,
5100                                                      num_csums - old_csums);
5101
5102        return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
5103}
5104
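/*
 * Reserve metadata space for @num_bytes of delalloc on @inode: a worst-case
 * extent item per BTRFS_MAX_EXTENT_SIZE-sized chunk not already covered by
 * reserved_extents, plus csum space and (if not already reserved) one item
 * for the inode update.  E.g. a 4M buffered write on an inode with no
 * outstanding extents reserves for a single extent item.
 */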
5105int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5106{
5107        struct btrfs_root *root = BTRFS_I(inode)->root;
5108        struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
5109        u64 to_reserve = 0;
5110        u64 csum_bytes;
5111        unsigned nr_extents = 0;
5112        int extra_reserve = 0;
5113        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
5114        int ret = 0;
5115        bool delalloc_lock = true;
5116        u64 to_free = 0;
5117        unsigned dropped;
5118
5119        /* If we are a free space inode we need to not flush since we will be in
5120         * the middle of a transaction commit.  We also don't need the delalloc
5121         * mutex since we won't race with anybody.  We need this mostly to make
5122         * lockdep shut its filthy mouth.
5123         */
5124        if (btrfs_is_free_space_inode(inode)) {
5125                flush = BTRFS_RESERVE_NO_FLUSH;
5126                delalloc_lock = false;
5127        }
5128
5129        if (flush != BTRFS_RESERVE_NO_FLUSH &&
5130            btrfs_transaction_in_commit(root->fs_info))
5131                schedule_timeout(1);
5132
5133        if (delalloc_lock)
5134                mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
5135
5136        num_bytes = ALIGN(num_bytes, root->sectorsize);
5137
5138        spin_lock(&BTRFS_I(inode)->lock);
5139        nr_extents = (unsigned)div64_u64(num_bytes +
5140                                         BTRFS_MAX_EXTENT_SIZE - 1,
5141                                         BTRFS_MAX_EXTENT_SIZE);
5142        BTRFS_I(inode)->outstanding_extents += nr_extents;
5143        nr_extents = 0;
5144
5145        if (BTRFS_I(inode)->outstanding_extents >
5146            BTRFS_I(inode)->reserved_extents)
5147                nr_extents = BTRFS_I(inode)->outstanding_extents -
5148                        BTRFS_I(inode)->reserved_extents;
5149
5150        /*
5151         * Add an item to reserve for updating the inode when we complete the
5152         * delalloc io.
5153         */
5154        if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5155                      &BTRFS_I(inode)->runtime_flags)) {
5156                nr_extents++;
5157                extra_reserve = 1;
5158        }
5159
5160        to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
5161        to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
5162        csum_bytes = BTRFS_I(inode)->csum_bytes;
5163        spin_unlock(&BTRFS_I(inode)->lock);
5164
5165        if (root->fs_info->quota_enabled) {
5166                ret = btrfs_qgroup_reserve(root, num_bytes +
5167                                           nr_extents * root->nodesize);
5168                if (ret)
5169                        goto out_fail;
5170        }
5171
5172        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
5173        if (unlikely(ret)) {
5174                if (root->fs_info->quota_enabled)
5175                        btrfs_qgroup_free(root, num_bytes +
5176                                                nr_extents * root->nodesize);
5177                goto out_fail;
5178        }
5179
5180        spin_lock(&BTRFS_I(inode)->lock);
5181        if (extra_reserve) {
5182                set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5183                        &BTRFS_I(inode)->runtime_flags);
5184                nr_extents--;
5185        }
5186        BTRFS_I(inode)->reserved_extents += nr_extents;
5187        spin_unlock(&BTRFS_I(inode)->lock);
5188
5189        if (delalloc_lock)
5190                mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
5191
5192        if (to_reserve)
5193                trace_btrfs_space_reservation(root->fs_info, "delalloc",
5194                                              btrfs_ino(inode), to_reserve, 1);
5195        block_rsv_add_bytes(block_rsv, to_reserve, 1);
5196
5197        return 0;
5198
5199out_fail:
5200        spin_lock(&BTRFS_I(inode)->lock);
5201        dropped = drop_outstanding_extent(inode, num_bytes);
5202        /*
5203         * If the inode's csum_bytes is the same as the original
5204         * csum_bytes then we know we haven't raced with any free()ers
5205         * so we can just reduce our inode's csum bytes and carry on.
5206         */
5207        if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
5208                calc_csum_metadata_size(inode, num_bytes, 0);
5209        } else {
5210                u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
5211                u64 bytes;
5212
5213                /*
5214                 * This is tricky, but first we need to figure out how much we
5215                 * free'd from any free-ers that occurred during this
5216                 * reservation, so we reset ->csum_bytes to the csum_bytes
5217                 * before we dropped our lock, and then call the free for the
5218                 * number of bytes that were freed while we were trying our
5219                 * reservation.
5220                 */
5221                bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
5222                BTRFS_I(inode)->csum_bytes = csum_bytes;
5223                to_free = calc_csum_metadata_size(inode, bytes, 0);
5224
5226                /*
5227                 * Now we need to see how much we would have freed had we not
5228                 * been making this reservation and our ->csum_bytes were not
5229                 * artificially inflated.
5230                 */
5231                BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
5232                bytes = csum_bytes - orig_csum_bytes;
5233                bytes = calc_csum_metadata_size(inode, bytes, 0);
5234
5235                /*
5236                 * Now reset ->csum_bytes to what it should be.  If bytes is
5237                 * more than to_free then we would have free'd more space had we
5238                 * not had an artificially high ->csum_bytes, so we need to free
5239                 * the remainder.  If bytes is the same or less then we don't
5240                 * need to do anything, the other free-ers did the correct
5241                 * thing.
5242                 */
5243                BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
5244                if (bytes > to_free)
5245                        to_free = bytes - to_free;
5246                else
5247                        to_free = 0;
5248        }
5249        spin_unlock(&BTRFS_I(inode)->lock);
5250        if (dropped)
5251                to_free += btrfs_calc_trans_metadata_size(root, dropped);
5252
5253        if (to_free) {
5254                btrfs_block_rsv_release(root, block_rsv, to_free);
5255                trace_btrfs_space_reservation(root->fs_info, "delalloc",
5256                                              btrfs_ino(inode), to_free, 0);
5257        }
5258        if (delalloc_lock)
5259                mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
5260        return ret;
5261}
5262
5263/**
5264 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
5265 * @inode: the inode to release the reservation for
5266 * @num_bytes: the number of bytes we're releasing
5267 *
5268 * This will release the metadata reservation for an inode.  This can be called
5269 * once we complete IO for a given set of bytes to release their metadata
5270 * reservations.
5271 */
5272void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
5273{
5274        struct btrfs_root *root = BTRFS_I(inode)->root;
5275        u64 to_free = 0;
5276        unsigned dropped;
5277
5278        num_bytes = ALIGN(num_bytes, root->sectorsize);
5279        spin_lock(&BTRFS_I(inode)->lock);
5280        dropped = drop_outstanding_extent(inode, num_bytes);
5281
5282        if (num_bytes)
5283                to_free = calc_csum_metadata_size(inode, num_bytes, 0);
5284        spin_unlock(&BTRFS_I(inode)->lock);
5285        if (dropped > 0)
5286                to_free += btrfs_calc_trans_metadata_size(root, dropped);
5287
5288        if (btrfs_test_is_dummy_root(root))
5289                return;
5290
5291        trace_btrfs_space_reservation(root->fs_info, "delalloc",
5292                                      btrfs_ino(inode), to_free, 0);
5293        if (root->fs_info->quota_enabled) {
5294                btrfs_qgroup_free(root, num_bytes +
5295                                        dropped * root->nodesize);
5296        }
5297
5298        btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
5299                                to_free);
5300}
5301
5302/**
5303 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
5304 * @inode: inode we're writing to
5305 * @num_bytes: the number of bytes we want to allocate
5306 *
5307 * This will do the following things
5308 *
5309 * o reserve space in the data space info for num_bytes
5310 * o reserve space in the metadata space info based on number of outstanding
5311 *   extents and how much csums will be needed
5312 * o add to the inodes ->delalloc_bytes
5313 * o add it to the fs_info's delalloc inodes list.
5314 *
5315 * This will return 0 for success and -ENOSPC if there is no space left.
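 *
 * Illustrative caller pattern (a sketch, not copied from a real call site):
 *
 *	ret = btrfs_delalloc_reserve_space(inode, write_bytes);
 *	if (ret)
 *		return ret;
 *	... dirty the pages ...
 *	(on a later failure, undo with
 *	 btrfs_delalloc_release_space(inode, write_bytes))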
5316 */
5317int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
5318{
5319        int ret;
5320
5321        ret = btrfs_check_data_free_space(inode, num_bytes);
5322        if (ret)
5323                return ret;
5324
5325        ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
5326        if (ret) {
5327                btrfs_free_reserved_data_space(inode, num_bytes);
5328                return ret;
5329        }
5330
5331        return 0;
5332}
5333
5334/**
5335 * btrfs_delalloc_release_space - release data and metadata space for delalloc
5336 * @inode: inode we're releasing space for
5337 * @num_bytes: the number of bytes we want to free up
5338 *
5339 * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
5340 * called in the case that we don't need the metadata AND data reservations
5341 * anymore, e.g. if there is an error or we insert an inline extent.
5342 *
5343 * This function will release the metadata space that was not used and will
5344 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
5345 * list if there are no delalloc bytes left.
5346 */
5347void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
5348{
5349        btrfs_delalloc_release_metadata(inode, num_bytes);
5350        btrfs_free_reserved_data_space(inode, num_bytes);
5351}
5352
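/*
 * Adjust the used-bytes accounting for every block group backing the range
 * [@bytenr, @bytenr + @num_bytes): on allocation, move bytes from reserved
 * to used; on free, move them to pinned so the unpin stage can give them
 * back.  disk_used is scaled by a factor of two for DUP/RAID1/RAID10 since
 * those profiles keep two copies.
 */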
5353static int update_block_group(struct btrfs_trans_handle *trans,
5354                              struct btrfs_root *root, u64 bytenr,
5355                              u64 num_bytes, int alloc)
5356{
5357        struct btrfs_block_group_cache *cache = NULL;
5358        struct btrfs_fs_info *info = root->fs_info;
5359        u64 total = num_bytes;
5360        u64 old_val;
5361        u64 byte_in_group;
5362        int factor;
5363
5364        /* block accounting for super block */
5365        spin_lock(&info->delalloc_root_lock);
5366        old_val = btrfs_super_bytes_used(info->super_copy);
5367        if (alloc)
5368                old_val += num_bytes;
5369        else
5370                old_val -= num_bytes;
5371        btrfs_set_super_bytes_used(info->super_copy, old_val);
5372        spin_unlock(&info->delalloc_root_lock);
5373
5374        while (total) {
5375                cache = btrfs_lookup_block_group(info, bytenr);
5376                if (!cache)
5377                        return -ENOENT;
5378                if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
5379                                    BTRFS_BLOCK_GROUP_RAID1 |
5380                                    BTRFS_BLOCK_GROUP_RAID10))
5381                        factor = 2;
5382                else
5383                        factor = 1;
5384                /*
5385                 * If this block group has free space cache written out, we
5386                 * need to make sure to load it if we are removing space.  This
5387                 * is because we need the unpinning stage to actually add the
5388                 * space back to the block group, otherwise we will leak space.
5389                 */
5390                if (!alloc && cache->cached == BTRFS_CACHE_NO)
5391                        cache_block_group(cache, 1);
5392
5393                spin_lock(&trans->transaction->dirty_bgs_lock);
5394                if (list_empty(&cache->dirty_list)) {
5395                        list_add_tail(&cache->dirty_list,
5396                                      &trans->transaction->dirty_bgs);
5397                        btrfs_get_block_group(cache);
5398                }
5399                spin_unlock(&trans->transaction->dirty_bgs_lock);
5400
5401                byte_in_group = bytenr - cache->key.objectid;
5402                WARN_ON(byte_in_group > cache->key.offset);
5403
5404                spin_lock(&cache->space_info->lock);
5405                spin_lock(&cache->lock);
5406
5407                if (btrfs_test_opt(root, SPACE_CACHE) &&
5408                    cache->disk_cache_state < BTRFS_DC_CLEAR)
5409                        cache->disk_cache_state = BTRFS_DC_CLEAR;
5410
5411                old_val = btrfs_block_group_used(&cache->item);
5412                num_bytes = min(total, cache->key.offset - byte_in_group);
5413                if (alloc) {
5414                        old_val += num_bytes;
5415                        btrfs_set_block_group_used(&cache->item, old_val);
5416                        cache->reserved -= num_bytes;
5417                        cache->space_info->bytes_reserved -= num_bytes;
5418                        cache->space_info->bytes_used += num_bytes;
5419                        cache->space_info->disk_used += num_bytes * factor;
5420                        spin_unlock(&cache->lock);
5421                        spin_unlock(&cache->space_info->lock);
5422                } else {
5423                        old_val -= num_bytes;
5424                        btrfs_set_block_group_used(&cache->item, old_val);
5425                        cache->pinned += num_bytes;
5426                        cache->space_info->bytes_pinned += num_bytes;
5427                        cache->space_info->bytes_used -= num_bytes;
5428                        cache->space_info->disk_used -= num_bytes * factor;
5429                        spin_unlock(&cache->lock);
5430                        spin_unlock(&cache->space_info->lock);
5431
5432                        set_extent_dirty(info->pinned_extents,
5433                                         bytenr, bytenr + num_bytes - 1,
5434                                         GFP_NOFS | __GFP_NOFAIL);
5435                        /*
5436                         * No longer have used bytes in this block group, queue
5437                         * it for deletion.
5438                         */
5439                        if (old_val == 0) {
5440                                spin_lock(&info->unused_bgs_lock);
5441                                if (list_empty(&cache->bg_list)) {
5442                                        btrfs_get_block_group(cache);
5443                                        list_add_tail(&cache->bg_list,
5444                                                      &info->unused_bgs);
5445                                }
5446                                spin_unlock(&info->unused_bgs_lock);
5447                        }
5448                }
5449                btrfs_put_block_group(cache);
5450                total -= num_bytes;
5451                bytenr += num_bytes;
5452        }
5453        return 0;
5454}
5455
5456static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
5457{
5458        struct btrfs_block_group_cache *cache;
5459        u64 bytenr;
5460
5461        spin_lock(&root->fs_info->block_group_cache_lock);
5462        bytenr = root->fs_info->first_logical_byte;
5463        spin_unlock(&root->fs_info->block_group_cache_lock);
5464
5465        if (bytenr < (u64)-1)
5466                return bytenr;
5467
5468        cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
5469        if (!cache)
5470                return 0;
5471
5472        bytenr = cache->key.objectid;
5473        btrfs_put_block_group(cache);
5474
5475        return bytenr;
5476}
5477
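/*
 * Account @num_bytes of @cache as pinned and mark the range dirty in
 * fs_info->pinned_extents so it gets unpinned when the transaction
 * commits.  If @reserved is set the bytes were previously reserved and
 * that reservation is released here.  btrfs_pin_extent() below is the
 * usual wrapper; log replay uses btrfs_pin_extent_for_log_replay() so
 * the pinned range is also removed from the free space cache.
 */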
5478static int pin_down_extent(struct btrfs_root *root,
5479                           struct btrfs_block_group_cache *cache,
5480                           u64 bytenr, u64 num_bytes, int reserved)
5481{
5482        spin_lock(&cache->space_info->lock);
5483        spin_lock(&cache->lock);
5484        cache->pinned += num_bytes;
5485        cache->space_info->bytes_pinned += num_bytes;
5486        if (reserved) {
5487                cache->reserved -= num_bytes;
5488                cache->space_info->bytes_reserved -= num_bytes;
5489        }
5490        spin_unlock(&cache->lock);
5491        spin_unlock(&cache->space_info->lock);
5492
5493        set_extent_dirty(root->fs_info->pinned_extents, bytenr,
5494                         bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
5495        if (reserved)
5496                trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);
5497        return 0;
5498}
5499
5500/*
5501 * this function must be called within a transaction
5502 */
5503int btrfs_pin_extent(struct btrfs_root *root,
5504                     u64 bytenr, u64 num_bytes, int reserved)
5505{
5506        struct btrfs_block_group_cache *cache;
5507
5508        cache = btrfs_lookup_block_group(root->fs_info, bytenr);
5509        BUG_ON(!cache); /* Logic error */
5510
5511        pin_down_extent(root, cache, bytenr, num_bytes, reserved);
5512
5513        btrfs_put_block_group(cache);
5514        return 0;
5515}
5516
5517/*
5518 * this function must be called within a transaction
5519 */
5520int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
5521                                    u64 bytenr, u64 num_bytes)
5522{
5523        struct btrfs_block_group_cache *cache;
5524        int ret;
5525
5526        cache = btrfs_lookup_block_group(root->fs_info, bytenr);
5527        if (!cache)
5528                return -EINVAL;
5529
5530        /*
5531         * pull in the free space cache (if any) so that our pin
5532         * removes the free space from the cache.  We pass load_only == 1
5533         * because the slow path that reads in the free extents already
5534         * checks the pinned extents.
5535         */
5536        cache_block_group(cache, 1);
5537
5538        pin_down_extent(root, cache, bytenr, num_bytes, 0);
5539
5540        /* remove us from the free space cache (if we're there at all) */
5541        ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
5542        btrfs_put_block_group(cache);
5543        return ret;
5544}
5545
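/*
 * Keep a logged extent away from the allocator while the log is replayed.
 * What we do depends on how far the caching kthread has gotten: ranges it
 * has already scanned are deleted from the free space cache, ranges it has
 * not reached yet are marked excluded, and a range straddling ->progress
 * is split between the two.
 */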
5546static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
5547{
5548        int ret;
5549        struct btrfs_block_group_cache *block_group;
5550        struct btrfs_caching_control *caching_ctl;
5551
5552        block_group = btrfs_lookup_block_group(root->fs_info, start);
5553        if (!block_group)
5554                return -EINVAL;
5555
5556        cache_block_group(block_group, 0);
5557        caching_ctl = get_caching_control(block_group);
5558
5559        if (!caching_ctl) {
5560                /* Logic error */
5561                BUG_ON(!block_group_cache_done(block_group));
5562                ret = btrfs_remove_free_space(block_group, start, num_bytes);
5563        } else {
5564                mutex_lock(&caching_ctl->mutex);
5565
5566                if (start >= caching_ctl->progress) {
5567                        ret = add_excluded_extent(root, start, num_bytes);
5568                } else if (start + num_bytes <= caching_ctl->progress) {
5569                        ret = btrfs_remove_free_space(block_group,
5570                                                      start, num_bytes);
5571                } else {
5572                        ret = btrfs_remove_free_space(block_group, start,
5573                                                      caching_ctl->progress - start);
5574                        if (ret)
5575                                goto out_lock;
5576
5577                        /* exclude whatever of the range is left past ->progress */
5578                        num_bytes = (start + num_bytes) -
5579                                caching_ctl->progress;
5580                        start = caching_ctl->progress;
5581                        ret = add_excluded_extent(root, start, num_bytes);
5582                }
5583out_lock:
5584                mutex_unlock(&caching_ctl->mutex);
5585                put_caching_control(caching_ctl);
5586        }
5587        btrfs_put_block_group(block_group);
5588        return ret;
5589}
5590
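/*
 * Walk one leaf of a tree log and exclude every data extent it references.
 * This is only needed with mixed block groups, where data and metadata
 * share block groups and replay could otherwise hand a logged data extent
 * back out as a new tree block before the log is fully replayed.
 */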
5591int btrfs_exclude_logged_extents(struct btrfs_root *log,
5592                                 struct extent_buffer *eb)
5593{
5594        struct btrfs_file_extent_item *item;
5595        struct btrfs_key key;
5596        int found_type;
5597        int i;
5598
5599        if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
5600                return 0;
5601
5602        for (i = 0; i < btrfs_header_nritems(eb); i++) {
5603                btrfs_item_key_to_cpu(eb, &key, i);
5604                if (key.type != BTRFS_EXTENT_DATA_KEY)
5605                        continue;
5606                item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
5607                found_type = btrfs_file_extent_type(eb, item);
5608                if (found_type == BTRFS_FILE_EXTENT_INLINE)
5609                        continue;
5610                if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
5611                        continue;
5612                key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
5613                key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
5614                __exclude_logged_extent(log, key.objectid, key.offset);
5615        }
5616
5617        return 0;
5618}
5619
5620/**
5621 * btrfs_update_reserved_bytes - update the block_group and space info counters
5622 * @cache:      The cache we are manipulating
5623 * @num_bytes:  The number of bytes in question
5624 * @reserve:    One of the reservation enums
5625 * @delalloc:   The blocks are allocated for the delalloc write
5626 *
5627 * This is called by the allocator when it reserves space, or by somebody who is
5628 * freeing space that was never actually used on disk.  For example if you
5629 * reserve some space for a new leaf in transaction A and before transaction A
5630 * commits you free that leaf, you call this with reserve set to RESERVE_FREE
5631 * in order to clear the reservation.
5632 *
5633 * Metadata reservations should be made with RESERVE_ALLOC so we do the proper
5634 * ENOSPC accounting.  For data we handle the reservation through clearing the
5635 * delalloc bits in the io_tree.  We have to do this since we could end up
5636 * allocating less disk space for the amount of data we have reserved in the
5637 * case of compression.
5638 *
5639 * If this is a reservation and the block group has become read only we cannot
5640 * make the reservation and return -EAGAIN, otherwise this function always
5641 * succeeds.
5642 */
5643static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
5644                                       u64 num_bytes, int reserve, int delalloc)
5645{
5646        struct btrfs_space_info *space_info = cache->space_info;
5647        int ret = 0;
5648
5649        spin_lock(&space_info->lock);
5650        spin_lock(&cache->lock);
5651        if (reserve != RESERVE_FREE) {
5652                if (cache->ro) {
5653                        ret = -EAGAIN;
5654                } else {
5655                        cache->reserved += num_bytes;
5656                        space_info->bytes_reserved += num_bytes;
5657                        if (reserve == RESERVE_ALLOC) {
5658                                trace_btrfs_space_reservation(cache->fs_info,
5659                                                "space_info", space_info->flags,
5660                                                num_bytes, 0);
5661                                space_info->bytes_may_use -= num_bytes;
5662                        }
5663
5664                        if (delalloc)
5665                                cache->delalloc_bytes += num_bytes;
5666                }
5667        } else {
5668                if (cache->ro)
5669                        space_info->bytes_readonly += num_bytes;
5670                cache->reserved -= num_bytes;
5671                space_info->bytes_reserved -= num_bytes;
5672
5673                if (delalloc)
5674                        cache->delalloc_bytes -= num_bytes;
5675        }
5676        spin_unlock(&cache->lock);
5677        spin_unlock(&space_info->lock);
5678        return ret;
5679}
5680
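/*
 * Called during the transaction commit.  For every block group that is
 * still being cached, remember how far the caching kthread has gotten
 * (last_byte_to_unpin) so unpinning only returns space the kthread has
 * already scanned, then flip pinned_extents to the other freed_extents
 * tree so pins from the next transaction land in their own tree.
 */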
5681void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5682                                struct btrfs_root *root)
5683{
5684        struct btrfs_fs_info *fs_info = root->fs_info;
5685        struct btrfs_caching_control *next;
5686        struct btrfs_caching_control *caching_ctl;
5687        struct btrfs_block_group_cache *cache;
5688
5689        down_write(&fs_info->commit_root_sem);
5690
5691        list_for_each_entry_safe(caching_ctl, next,
5692                                 &fs_info->caching_block_groups, list) {
5693                cache = caching_ctl->block_group;
5694                if (block_group_cache_done(cache)) {
5695                        cache->last_byte_to_unpin = (u64)-1;
5696                        list_del_init(&caching_ctl->list);
5697                        put_caching_control(caching_ctl);
5698                } else {
5699                        cache->last_byte_to_unpin = caching_ctl->progress;
5700                }
5701        }
5702
5703        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5704                fs_info->pinned_extents = &fs_info->freed_extents[1];
5705        else
5706                fs_info->pinned_extents = &fs_info->freed_extents[0];
5707
5708        up_write(&fs_info->commit_root_sem);
5709
5710        update_global_block_rsv(fs_info);
5711}
5712
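/*
 * Walk [start, end] one block group at a time, undoing the pinned
 * accounting and, if @return_free_space is set, handing the space back to
 * the free space cache.  Unpinned metadata space is used to top up the
 * global block reserve first if it isn't full.
 */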
5713static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
5714                              const bool return_free_space)
5715{
5716        struct btrfs_fs_info *fs_info = root->fs_info;
5717        struct btrfs_block_group_cache *cache = NULL;
5718        struct btrfs_space_info *space_info;
5719        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5720        u64 len;
5721        bool readonly;
5722
5723        while (start <= end) {
5724                readonly = false;
5725                if (!cache ||
5726                    start >= cache->key.objectid + cache->key.offset) {
5727                        if (cache)
5728                                btrfs_put_block_group(cache);
5729                        cache = btrfs_lookup_block_group(fs_info, start);
5730                        BUG_ON(!cache); /* Logic error */
5731                }
5732
5733                len = cache->key.objectid + cache->key.offset - start;
5734                len = min(len, end + 1 - start);
5735
5736                if (start < cache->last_byte_to_unpin) {
5737                        len = min(len, cache->last_byte_to_unpin - start);
5738                        if (return_free_space)
5739                                btrfs_add_free_space(cache, start, len);
5740                }
5741
5742                start += len;
5743                space_info = cache->space_info;
5744
5745                spin_lock(&space_info->lock);
5746                spin_lock(&cache->lock);
5747                cache->pinned -= len;
5748                space_info->bytes_pinned -= len;
5749                percpu_counter_add(&space_info->total_bytes_pinned, -len);
5750                if (cache->ro) {
5751                        space_info->bytes_readonly += len;
5752                        readonly = true;
5753                }
5754                spin_unlock(&cache->lock);
5755                if (!readonly && global_rsv->space_info == space_info) {
5756                        spin_lock(&global_rsv->lock);
5757                        if (!global_rsv->full) {
5758                                len = min(len, global_rsv->size -
5759                                          global_rsv->reserved);
5760                                global_rsv->reserved += len;
5761                                space_info->bytes_may_use += len;
5762                                if (global_rsv->reserved >= global_rsv->size)
5763                                        global_rsv->full = 1;
5764                        }
5765                        spin_unlock(&global_rsv->lock);
5766                }
5767                spin_unlock(&space_info->lock);
5768        }
5769
5770        if (cache)
5771                btrfs_put_block_group(cache);
5772        return 0;
5773}
5774
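/*
 * Called once the transaction commit is on disk.  Everything pinned during
 * that transaction sits in whichever freed_extents tree is *not* the
 * current pinned_extents (btrfs_prepare_extent_commit flipped them), so
 * walk that tree, discard the ranges if the discard mount option is set,
 * and unpin them.
 */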
5775int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5776                               struct btrfs_root *root)
5777{
5778        struct btrfs_fs_info *fs_info = root->fs_info;
5779        struct extent_io_tree *unpin;
5780        u64 start;
5781        u64 end;
5782        int ret;
5783
5784        if (trans->aborted)
5785                return 0;
5786
5787        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5788                unpin = &fs_info->freed_extents[1];
5789        else
5790                unpin = &fs_info->freed_extents[0];
5791
5792        while (1) {
5793                mutex_lock(&fs_info->unused_bg_unpin_mutex);
5794                ret = find_first_extent_bit(unpin, 0, &start, &end,
5795                                            EXTENT_DIRTY, NULL);
5796                if (ret) {
5797                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
5798                        break;
5799                }
5800
5801                if (btrfs_test_opt(root, DISCARD))
5802                        ret = btrfs_discard_extent(root, start,
5803                                                   end + 1 - start, NULL);
5804
5805                clear_extent_dirty(unpin, start, end, GFP_NOFS);
5806                unpin_extent_range(root, start, end, true);
5807                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
5808                cond_resched();
5809        }
5810
5811        return 0;
5812}
5813
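/*
 * Add @num_bytes to the total_bytes_pinned percpu counter of the
 * space_info the extent belongs to.  Owners below BTRFS_FIRST_FREE_OBJECTID
 * are tree blocks (metadata, or system for the chunk tree), everything
 * else is file data.
 */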
5814static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
5815                             u64 owner, u64 root_objectid)
5816{
5817        struct btrfs_space_info *space_info;
5818        u64 flags;
5819
5820        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5821                if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
5822                        flags = BTRFS_BLOCK_GROUP_SYSTEM;
5823                else
5824                        flags = BTRFS_BLOCK_GROUP_METADATA;
5825        } else {
5826                flags = BTRFS_BLOCK_GROUP_DATA;
5827        }
5828
5829        space_info = __find_space_info(fs_info, flags);
5830        BUG_ON(!space_info); /* Logic bug */
5831        percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
5832}
5833
5834
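/*
 * Drop @refs_to_drop references to the extent at @bytenr.  This is what
 * running a BTRFS_DROP_DELAYED_REF delayed ref ends up calling; callers
 * like btrfs_free_extent() below only queue the delayed ref.  The matching
 * backref is removed here, and once the reference count hits zero the
 * extent item itself is deleted, csums are dropped for data extents and
 * the space is returned to the block group.
 */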
5835static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5836                                struct btrfs_root *root,
5837                                u64 bytenr, u64 num_bytes, u64 parent,
5838                                u64 root_objectid, u64 owner_objectid,
5839                                u64 owner_offset, int refs_to_drop,
5840                                struct btrfs_delayed_extent_op *extent_op,
5841                                int no_quota)
5842{
5843        struct btrfs_key key;
5844        struct btrfs_path *path;
5845        struct btrfs_fs_info *info = root->fs_info;
5846        struct btrfs_root *extent_root = info->extent_root;
5847        struct extent_buffer *leaf;
5848        struct btrfs_extent_item *ei;
5849        struct btrfs_extent_inline_ref *iref;
5850        int ret;
5851        int is_data;
5852        int extent_slot = 0;
5853        int found_extent = 0;
5854        int num_to_del = 1;
5855        u32 item_size;
5856        u64 refs;
5857        int last_ref = 0;
5858        enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;
5859        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
5860                                                 SKINNY_METADATA);
5861
5862        if (!info->quota_enabled || !is_fstree(root_objectid))
5863                no_quota = 1;
5864
5865        path = btrfs_alloc_path();
5866        if (!path)
5867                return -ENOMEM;
5868
5869        path->reada = 1;
5870        path->leave_spinning = 1;
5871
5872        is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
5873        BUG_ON(!is_data && refs_to_drop != 1);
5874
5875        if (is_data)
5876                skinny_metadata = 0;
5877
5878        ret = lookup_extent_backref(trans, extent_root, path, &iref,
5879                                    bytenr, num_bytes, parent,
5880                                    root_objectid, owner_objectid,
5881                                    owner_offset);
5882        if (ret == 0) {
5883                extent_slot = path->slots[0];
5884                while (extent_slot >= 0) {
5885                        btrfs_item_key_to_cpu(path->nodes[0], &key,
5886                                              extent_slot);
5887                        if (key.objectid != bytenr)
5888                                break;
5889                        if (key.type == BTRFS_EXTENT_ITEM_KEY &&
5890                            key.offset == num_bytes) {
5891                                found_extent = 1;
5892                                break;
5893                        }
5894                        if (key.type == BTRFS_METADATA_ITEM_KEY &&
5895                            key.offset == owner_objectid) {
5896                                found_extent = 1;
5897                                break;
5898                        }
5899                        if (path->slots[0] - extent_slot > 5)
5900                                break;
5901                        extent_slot--;
5902                }
5903#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5904                item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
5905                if (found_extent && item_size < sizeof(*ei))
5906                        found_extent = 0;
5907#endif
5908                if (!found_extent) {
5909                        BUG_ON(iref);
5910                        ret = remove_extent_backref(trans, extent_root, path,
5911                                                    NULL, refs_to_drop,
5912                                                    is_data, &last_ref);
5913                        if (ret) {
5914                                btrfs_abort_transaction(trans, extent_root, ret);
5915                                goto out;
5916                        }
5917                        btrfs_release_path(path);
5918                        path->leave_spinning = 1;
5919
5920                        key.objectid = bytenr;
5921                        key.type = BTRFS_EXTENT_ITEM_KEY;
5922                        key.offset = num_bytes;
5923
5924                        if (!is_data && skinny_metadata) {
5925                                key.type = BTRFS_METADATA_ITEM_KEY;
5926                                key.offset = owner_objectid;
5927                        }
5928
5929                        ret = btrfs_search_slot(trans, extent_root,
5930                                                &key, path, -1, 1);
5931                        if (ret > 0 && skinny_metadata && path->slots[0]) {
5932                                /*
5933                                 * Couldn't find our skinny metadata item,
5934                                 * see if we have ye olde extent item.
5935                                 */
5936                                path->slots[0]--;
5937                                btrfs_item_key_to_cpu(path->nodes[0], &key,
5938                                                      path->slots[0]);
5939                                if (key.objectid == bytenr &&
5940                                    key.type == BTRFS_EXTENT_ITEM_KEY &&
5941                                    key.offset == num_bytes)
5942                                        ret = 0;
5943                        }
5944
5945                        if (ret > 0 && skinny_metadata) {
5946                                skinny_metadata = false;
5947                                key.objectid = bytenr;
5948                                key.type = BTRFS_EXTENT_ITEM_KEY;
5949                                key.offset = num_bytes;
5950                                btrfs_release_path(path);
5951                                ret = btrfs_search_slot(trans, extent_root,
5952                                                        &key, path, -1, 1);
5953                        }
5954
5955                        if (ret) {
5956                                btrfs_err(info, "umm, got %d back from search, was looking for %llu",
5957                                        ret, bytenr);
5958                                if (ret > 0)
5959                                        btrfs_print_leaf(extent_root,
5960                                                         path->nodes[0]);
5961                        }
5962                        if (ret < 0) {
5963                                btrfs_abort_transaction(trans, extent_root, ret);
5964                                goto out;
5965                        }
5966                        extent_slot = path->slots[0];
5967                }
5968        } else if (WARN_ON(ret == -ENOENT)) {
5969                btrfs_print_leaf(extent_root, path->nodes[0]);
5970                btrfs_err(info,
5971                        "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
5972                        bytenr, parent, root_objectid, owner_objectid,
5973                        owner_offset);
5974                btrfs_abort_transaction(trans, extent_root, ret);
5975                goto out;
5976        } else {
5977                btrfs_abort_transaction(trans, extent_root, ret);
5978                goto out;
5979        }
5980
5981        leaf = path->nodes[0];
5982        item_size = btrfs_item_size_nr(leaf, extent_slot);
5983#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5984        if (item_size < sizeof(*ei)) {
5985                BUG_ON(found_extent || extent_slot != path->slots[0]);
5986                ret = convert_extent_item_v0(trans, extent_root, path,
5987                                             owner_objectid, 0);
5988                if (ret < 0) {
5989                        btrfs_abort_transaction(trans, extent_root, ret);
5990                        goto out;
5991                }
5992
5993                btrfs_release_path(path);
5994                path->leave_spinning = 1;
5995
5996                key.objectid = bytenr;
5997                key.type = BTRFS_EXTENT_ITEM_KEY;
5998                key.offset = num_bytes;
5999
6000                ret = btrfs_search_slot(trans, extent_root, &key, path,
6001                                        -1, 1);
6002                if (ret) {
6003                        btrfs_err(info, "umm, got %d back from search, was looking for %llu",
6004                                ret, bytenr);
6005                        btrfs_print_leaf(extent_root, path->nodes[0]);
6006                }
6007                if (ret < 0) {
6008                        btrfs_abort_transaction(trans, extent_root, ret);
6009                        goto out;
6010                }
6011
6012                extent_slot = path->slots[0];
6013                leaf = path->nodes[0];
6014                item_size = btrfs_item_size_nr(leaf, extent_slot);
6015        }
6016#endif
6017        BUG_ON(item_size < sizeof(*ei));
6018        ei = btrfs_item_ptr(leaf, extent_slot,
6019                            struct btrfs_extent_item);
6020        if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
6021            key.type == BTRFS_EXTENT_ITEM_KEY) {
6022                struct btrfs_tree_block_info *bi;
6023                BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
6024                bi = (struct btrfs_tree_block_info *)(ei + 1);
6025                WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
6026        }
6027
6028        refs = btrfs_extent_refs(leaf, ei);
6029        if (refs < refs_to_drop) {
6030        btrfs_err(info, "trying to drop %d refs but we only have %Lu for bytenr %Lu",
6031                  refs_to_drop, refs, bytenr);
6032                ret = -EINVAL;
6033                btrfs_abort_transaction(trans, extent_root, ret);
6034                goto out;
6035        }
6036        refs -= refs_to_drop;
6037
6038        if (refs > 0) {
6039                type = BTRFS_QGROUP_OPER_SUB_SHARED;
6040                if (extent_op)
6041                        __run_delayed_extent_op(extent_op, leaf, ei);
6042                /*
6043                 * In the case of inline back ref, reference count will
6044                 * be updated by remove_extent_backref
6045                 */
6046                if (iref) {
6047                        BUG_ON(!found_extent);
6048                } else {
6049                        btrfs_set_extent_refs(leaf, ei, refs);
6050                        btrfs_mark_buffer_dirty(leaf);
6051                }
6052                if (found_extent) {
6053                        ret = remove_extent_backref(trans, extent_root, path,
6054                                                    iref, refs_to_drop,
6055                                                    is_data, &last_ref);
6056                        if (ret) {
6057                                btrfs_abort_transaction(trans, extent_root, ret);
6058                                goto out;
6059                        }
6060                }
6061                add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
6062                                 root_objectid);
6063        } else {
6064                if (found_extent) {
6065                        BUG_ON(is_data && refs_to_drop !=
6066                               extent_data_ref_count(root, path, iref));
6067                        if (iref) {
6068                                BUG_ON(path->slots[0] != extent_slot);
6069                        } else {
6070                                BUG_ON(path->slots[0] != extent_slot + 1);
6071                                path->slots[0] = extent_slot;
6072                                num_to_del = 2;
6073                        }
6074                }
6075
6076                last_ref = 1;
6077                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
6078                                      num_to_del);
6079                if (ret) {
6080                        btrfs_abort_transaction(trans, extent_root, ret);
6081                        goto out;
6082                }
6083                btrfs_release_path(path);
6084
6085                if (is_data) {
6086                        ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
6087                        if (ret) {
6088                                btrfs_abort_transaction(trans, extent_root, ret);
6089                                goto out;
6090                        }
6091                }
6092
6093                ret = update_block_group(trans, root, bytenr, num_bytes, 0);
6094                if (ret) {
6095                        btrfs_abort_transaction(trans, extent_root, ret);
6096                        goto out;
6097                }
6098        }
6099        btrfs_release_path(path);
6100
6101        /* Deal with the quota accounting */
6102        if (!ret && last_ref && !no_quota) {
6103                int mod_seq = 0;
6104
6105                if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
6106                    type == BTRFS_QGROUP_OPER_SUB_SHARED)
6107                        mod_seq = 1;
6108
6109                ret = btrfs_qgroup_record_ref(trans, info, root_objectid,
6110                                              bytenr, num_bytes, type,
6111                                              mod_seq);
6112        }
6113out:
6114        btrfs_free_path(path);
6115        return ret;
6116}
6117
6118/*
6119 * when we free a block, it is possible (and likely) that we free the last
6120 * delayed ref for that extent as well.  This searches the delayed ref tree for
6121 * a given extent, and if there are no other delayed refs to be processed, it
6122 * removes the ref head from the tree.
6123 */
6124static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
6125                                      struct btrfs_root *root, u64 bytenr)
6126{
6127        struct btrfs_delayed_ref_head *head;
6128        struct btrfs_delayed_ref_root *delayed_refs;
6129        int ret = 0;
6130
6131        delayed_refs = &trans->transaction->delayed_refs;
6132        spin_lock(&delayed_refs->lock);
6133        head = btrfs_find_delayed_ref_head(trans, bytenr);
6134        if (!head)
6135                goto out_delayed_unlock;
6136
6137        spin_lock(&head->lock);
6138        if (rb_first(&head->ref_root))
6139                goto out;
6140
6141        if (head->extent_op) {
6142                if (!head->must_insert_reserved)
6143                        goto out;
6144                btrfs_free_delayed_extent_op(head->extent_op);
6145                head->extent_op = NULL;
6146        }
6147
6148        /*
6149         * waiting for the lock here would deadlock.  If someone else has it
6150         * locked they are already in the process of dropping it anyway
6151         */
6152        if (!mutex_trylock(&head->mutex))
6153                goto out;
6154
6155        /*
6156         * at this point we have a head with no other entries.  Go
6157         * ahead and process it.
6158         */
6159        head->node.in_tree = 0;
6160        rb_erase(&head->href_node, &delayed_refs->href_root);
6161
6162        atomic_dec(&delayed_refs->num_entries);
6163
6164        /*
6165         * we don't take a ref on the node because we're removing it from the
6166         * tree, so we just steal the ref the tree was holding.
6167         */
6168        delayed_refs->num_heads--;
6169        if (head->processing == 0)
6170                delayed_refs->num_heads_ready--;
6171        head->processing = 0;
6172        spin_unlock(&head->lock);
6173        spin_unlock(&delayed_refs->lock);
6174
6175        BUG_ON(head->extent_op);
6176        if (head->must_insert_reserved)
6177                ret = 1;
6178
6179        mutex_unlock(&head->mutex);
6180        btrfs_put_delayed_ref(&head->node);
6181        return ret;
6182out:
6183        spin_unlock(&head->lock);
6184
6185out_delayed_unlock:
6186        spin_unlock(&delayed_refs->lock);
6187        return 0;
6188}
6189
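/*
 * Free a tree block.  For non-log trees the drop is queued as a delayed
 * ref.  If this was the last reference and the block was allocated in the
 * current transaction we can often short-circuit the delayed ref: a block
 * that was never written just goes straight back to the free space cache,
 * while a written one must stay pinned until the transaction commits.
 */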
6190void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
6191                           struct btrfs_root *root,
6192                           struct extent_buffer *buf,
6193                           u64 parent, int last_ref)
6194{
6195        int pin = 1;
6196        int ret;
6197
6198        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6199                ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6200                                        buf->start, buf->len,
6201                                        parent, root->root_key.objectid,
6202                                        btrfs_header_level(buf),
6203                                        BTRFS_DROP_DELAYED_REF, NULL, 0);
6204                BUG_ON(ret); /* -ENOMEM */
6205        }
6206
6207        if (!last_ref)
6208                return;
6209
6210        if (btrfs_header_generation(buf) == trans->transid) {
6211                struct btrfs_block_group_cache *cache;
6212
6213                if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6214                        ret = check_ref_cleanup(trans, root, buf->start);
6215                        if (!ret)
6216                                goto out;
6217                }
6218
6219                cache = btrfs_lookup_block_group(root->fs_info, buf->start);
6220
6221                if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
6222                        pin_down_extent(root, cache, buf->start, buf->len, 1);
6223                        btrfs_put_block_group(cache);
6224                        goto out;
6225                }
6226
6227                WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
6228
6229                btrfs_add_free_space(cache, buf->start, buf->len);
6230                btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
6231                btrfs_put_block_group(cache);
6232                trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
6233                pin = 0;
6234        }
6235out:
6236        if (pin)
6237                add_pinned_bytes(root->fs_info, buf->len,
6238                                 btrfs_header_level(buf),
6239                                 root->root_key.objectid);
6240
6241        /*
6242         * Deleting the buffer, clear the corrupt flag since it doesn't matter
6243         * anymore.
6244         */
6245        clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
6246}
6247
6248/* Can return -ENOMEM */
6249int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6250                      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
6251                      u64 owner, u64 offset, int no_quota)
6252{
6253        int ret;
6254        struct btrfs_fs_info *fs_info = root->fs_info;
6255
6256        if (btrfs_test_is_dummy_root(root))
6257                return 0;
6258
6259        add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
6260
6261        /*
6262         * tree log blocks never actually go into the extent allocation
6263         * tree, just update pinning info and exit early.
6264         */
6265        if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
6266                WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
6267                /* unlocks the pinned mutex */
6268                btrfs_pin_extent(root, bytenr, num_bytes, 1);
6269                ret = 0;
6270        } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
6271                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
6272                                        num_bytes,
6273                                        parent, root_objectid, (int)owner,
6274                                        BTRFS_DROP_DELAYED_REF, NULL, no_quota);
6275        } else {
6276                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
6277                                                num_bytes,
6278                                                parent, root_objectid, owner,
6279                                                offset, BTRFS_DROP_DELAYED_REF,
6280                                                NULL, no_quota);
6281        }
6282        return ret;
6283}
6284
6285/*
6286 * when we wait for progress in the block group caching, it's because
6287 * our allocation attempt failed at least once.  So, we must sleep
6288 * and let some progress happen before we try again.
6289 *
6290 * This function will sleep at least once waiting for new free space to
6291 * show up, and then it will check the block group free space numbers
6292 * for our min num_bytes.  Another option is to have it go ahead
6293 * and look in the rbtree for a free extent of a given size, but this
6294 * is a good start.
6295 *
6296 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
6297 * any of the information in this block group.
6298 */
6299static noinline void
6300wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
6301                                u64 num_bytes)
6302{
6303        struct btrfs_caching_control *caching_ctl;
6304
6305        caching_ctl = get_caching_control(cache);
6306        if (!caching_ctl)
6307                return;
6308
6309        wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
6310                   (cache->free_space_ctl->free_space >= num_bytes));
6311
6312        put_caching_control(caching_ctl);
6313}
6314
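/*
 * Wait until the block group is fully cached.  Returns -EIO if the caching
 * kthread hit an error, 0 otherwise.
 */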
6315static noinline int
6316wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
6317{
6318        struct btrfs_caching_control *caching_ctl;
6319        int ret = 0;
6320
6321        caching_ctl = get_caching_control(cache);
6322        if (!caching_ctl)
6323                return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
6324
6325        wait_event(caching_ctl->wait, block_group_cache_done(cache));
6326        if (cache->cached == BTRFS_CACHE_ERROR)
6327                ret = -EIO;
6328        put_caching_control(caching_ctl);
6329        return ret;
6330}
6331
6332int __get_raid_index(u64 flags)
6333{
6334        if (flags & BTRFS_BLOCK_GROUP_RAID10)
6335                return BTRFS_RAID_RAID10;
6336        else if (flags & BTRFS_BLOCK_GROUP_RAID1)
6337                return BTRFS_RAID_RAID1;
6338        else if (flags & BTRFS_BLOCK_GROUP_DUP)
6339                return BTRFS_RAID_DUP;
6340        else if (flags & BTRFS_BLOCK_GROUP_RAID0)
6341                return BTRFS_RAID_RAID0;
6342        else if (flags & BTRFS_BLOCK_GROUP_RAID5)
6343                return BTRFS_RAID_RAID5;
6344        else if (flags & BTRFS_BLOCK_GROUP_RAID6)
6345                return BTRFS_RAID_RAID6;
6346
6347        return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
6348}
6349
6350int get_block_group_index(struct btrfs_block_group_cache *cache)
6351{
6352        return __get_raid_index(cache->flags);
6353}
6354
6355static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
6356        [BTRFS_RAID_RAID10]     = "raid10",
6357        [BTRFS_RAID_RAID1]      = "raid1",
6358        [BTRFS_RAID_DUP]        = "dup",
6359        [BTRFS_RAID_RAID0]      = "raid0",
6360        [BTRFS_RAID_SINGLE]     = "single",
6361        [BTRFS_RAID_RAID5]      = "raid5",
6362        [BTRFS_RAID_RAID6]      = "raid6",
6363};
6364
6365static const char *get_raid_name(enum btrfs_raid_types type)
6366{
6367        if (type >= BTRFS_NR_RAID_TYPES)
6368                return NULL;
6369
6370        return btrfs_raid_type_names[type];
6371}
6372
6373enum btrfs_loop_type {
6374        LOOP_CACHING_NOWAIT = 0,
6375        LOOP_CACHING_WAIT = 1,
6376        LOOP_ALLOC_CHUNK = 2,
6377        LOOP_NO_EMPTY_SIZE = 3,
6378};
6379
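/*
 * Delalloc allocations hold the block group's data_rwsem shared from here
 * until btrfs_release_block_group(), which keeps the group stable against
 * anything that needs to take data_rwsem exclusively in the meantime.
 */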
6380static inline void
6381btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
6382                       int delalloc)
6383{
6384        if (delalloc)
6385                down_read(&cache->data_rwsem);
6386}
6387
6388static inline void
6389btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
6390                       int delalloc)
6391{
6392        btrfs_get_block_group(cache);
6393        if (delalloc)
6394                down_read(&cache->data_rwsem);
6395}
6396
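/*
 * Return the block group backing @cluster with the refill_lock held and,
 * for delalloc, its data_rwsem read-locked.  data_rwsem can sleep, so it
 * cannot be taken under the refill_lock spinlock: we trylock first and,
 * on contention, drop refill_lock, block on data_rwsem, then retry and
 * re-check that the cluster still points at the same block group.
 */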
6397static struct btrfs_block_group_cache *
6398btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
6399                   struct btrfs_free_cluster *cluster,
6400                   int delalloc)
6401{
6402        struct btrfs_block_group_cache *used_bg;
6403        bool locked = false;
6404again:
6405        spin_lock(&cluster->refill_lock);
6406        if (locked) {
6407                if (used_bg == cluster->block_group)
6408                        return used_bg;
6409
6410                up_read(&used_bg->data_rwsem);
6411                btrfs_put_block_group(used_bg);
6412        }
6413
6414        used_bg = cluster->block_group;
6415        if (!used_bg)
6416                return NULL;
6417
6418        if (used_bg == block_group)
6419                return used_bg;
6420
6421        btrfs_get_block_group(used_bg);
6422
6423        if (!delalloc)
6424                return used_bg;
6425
6426        if (down_read_trylock(&used_bg->data_rwsem))
6427                return used_bg;
6428
6429        spin_unlock(&cluster->refill_lock);
6430        down_read(&used_bg->data_rwsem);
6431        locked = true;
6432        goto again;
6433}
6434
6435static inline void
6436btrfs_release_block_group(struct btrfs_block_group_cache *cache,
6437                         int delalloc)
6438{
6439        if (delalloc)
6440                up_read(&cache->data_rwsem);
6441        btrfs_put_block_group(cache);
6442}
6443
6444/*
6445 * walks the btree of allocated extents and finds a hole of a given size.
6446 * The key ins is changed to record the hole:
6447 * ins->objectid == start position
6448 * ins->flags = BTRFS_EXTENT_ITEM_KEY
6449 * ins->offset == the size of the hole.
6450 * Any available blocks before search_start are skipped.
6451 *
6452 * If there is no suitable free space, we record the max size of the
6453 * largest free space extent we saw, so the caller can retry with less.
6454 */
6455static noinline int find_free_extent(struct btrfs_root *orig_root,
6456                                     u64 num_bytes, u64 empty_size,
6457                                     u64 hint_byte, struct btrfs_key *ins,
6458                                     u64 flags, int delalloc)
6459{
6460        int ret = 0;
6461        struct btrfs_root *root = orig_root->fs_info->extent_root;
6462        struct btrfs_free_cluster *last_ptr = NULL;
6463        struct btrfs_block_group_cache *block_group = NULL;
6464        u64 search_start = 0;
6465        u64 max_extent_size = 0;
6466        int empty_cluster = 2 * 1024 * 1024;
6467        struct btrfs_space_info *space_info;
6468        int loop = 0;
6469        int index = __get_raid_index(flags);
6470        int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
6471                RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
6472        bool failed_cluster_refill = false;
6473        bool failed_alloc = false;
6474        bool use_cluster = true;
6475        bool have_caching_bg = false;
6476
6477        WARN_ON(num_bytes < root->sectorsize);
6478        ins->type = BTRFS_EXTENT_ITEM_KEY;
6479        ins->objectid = 0;
6480        ins->offset = 0;
6481
6482        trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
6483
6484        space_info = __find_space_info(root->fs_info, flags);
6485        if (!space_info) {
6486                btrfs_err(root->fs_info, "No space info for %llu", flags);
6487                return -ENOSPC;
6488        }
6489
6490        /*
6491         * If the space info is for both data and metadata it means we have a
6492         * small filesystem and we can't use the clustering stuff.
6493         */
6494        if (btrfs_mixed_space_info(space_info))
6495                use_cluster = false;
6496
6497        if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
6498                last_ptr = &root->fs_info->meta_alloc_cluster;
6499                if (!btrfs_test_opt(root, SSD))
6500                        empty_cluster = 64 * 1024;
6501        }
6502
6503        if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
6504            btrfs_test_opt(root, SSD)) {
6505                last_ptr = &root->fs_info->data_alloc_cluster;
6506        }
6507
6508        if (last_ptr) {
6509                spin_lock(&last_ptr->lock);
6510                if (last_ptr->block_group)
6511                        hint_byte = last_ptr->window_start;
6512                spin_unlock(&last_ptr->lock);
6513        }
6514
6515        search_start = max(search_start, first_logical_byte(root, 0));
6516        search_start = max(search_start, hint_byte);
6517
6518        if (!last_ptr)
6519                empty_cluster = 0;
6520
6521        if (search_start == hint_byte) {
6522                block_group = btrfs_lookup_block_group(root->fs_info,
6523                                                       search_start);
6524                /*
6525                 * we don't want to use the block group if it doesn't match our
6526                 * allocation bits, or if it's not cached.
6527                 *
6528                 * However if we are re-searching with an ideal block group
6529                 * picked out then we don't care that the block group is cached.
6530                 */
6531                if (block_group && block_group_bits(block_group, flags) &&
6532                    block_group->cached != BTRFS_CACHE_NO) {
6533                        down_read(&space_info->groups_sem);
6534                        if (list_empty(&block_group->list) ||
6535                            block_group->ro) {
6536                                /*
6537                                 * someone is removing this block group,
6538                                 * we can't jump into the have_block_group
6539                                 * target because our list pointers are not
6540                                 * valid
6541                                 */
6542                                btrfs_put_block_group(block_group);
6543                                up_read(&space_info->groups_sem);
6544                        } else {
6545                                index = get_block_group_index(block_group);
6546                                btrfs_lock_block_group(block_group, delalloc);
6547                                goto have_block_group;
6548                        }
6549                } else if (block_group) {
6550                        btrfs_put_block_group(block_group);
6551                }
6552        }
6553search:
6554        have_caching_bg = false;
6555        down_read(&space_info->groups_sem);
6556        list_for_each_entry(block_group, &space_info->block_groups[index],
6557                            list) {
6558                u64 offset;
6559                int cached;
6560
6561                btrfs_grab_block_group(block_group, delalloc);
6562                search_start = block_group->key.objectid;
6563
6564                /*
6565                 * this can happen if we end up cycling through all the
6566                 * raid types, but we want to make sure we only allocate
6567                 * for the proper type.
6568                 */
6569                if (!block_group_bits(block_group, flags)) {
6570                        u64 extra = BTRFS_BLOCK_GROUP_DUP |
6571                                    BTRFS_BLOCK_GROUP_RAID1 |
6572                                    BTRFS_BLOCK_GROUP_RAID5 |
6573                                    BTRFS_BLOCK_GROUP_RAID6 |
6574                                    BTRFS_BLOCK_GROUP_RAID10;
6575
6576                        /*
6577                         * if they asked for extra copies and this block group
6578                         * doesn't provide them, bail.  This does allow us to
6579                         * fill raid0 from raid1.
6580                         */
6581                        if ((flags & extra) && !(block_group->flags & extra))
6582                                goto loop;
6583                }
6584
6585have_block_group:
6586                cached = block_group_cache_done(block_group);
6587                if (unlikely(!cached)) {
6588                        ret = cache_block_group(block_group, 0);
6589                        BUG_ON(ret < 0);
6590                        ret = 0;
6591                }
6592
6593                if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
6594                        goto loop;
6595                if (unlikely(block_group->ro))
6596                        goto loop;
6597
6598                /*
6599                 * Ok we want to try and use the cluster allocator, so
6600                 * let's look there
6601                 */
6602                if (last_ptr) {
6603                        struct btrfs_block_group_cache *used_block_group;
6604                        unsigned long aligned_cluster;
6605                        /*
6606                         * the refill lock keeps out other
6607                         * people trying to start a new cluster
6608                         */
6609                        used_block_group = btrfs_lock_cluster(block_group,
6610                                                              last_ptr,
6611                                                              delalloc);
6612                        if (!used_block_group)
6613                                goto refill_cluster;
6614
6615                        if (used_block_group != block_group &&
6616                            (used_block_group->ro ||
6617                             !block_group_bits(used_block_group, flags)))
6618                                goto release_cluster;
6619
6620                        offset = btrfs_alloc_from_cluster(used_block_group,
6621                                                last_ptr,
6622                                                num_bytes,
6623                                                used_block_group->key.objectid,
6624                                                &max_extent_size);
6625                        if (offset) {
6626                                /* we have a block, we're done */
6627                                spin_unlock(&last_ptr->refill_lock);
6628                                trace_btrfs_reserve_extent_cluster(root,
6629                                                used_block_group,
6630                                                search_start, num_bytes);
6631                                if (used_block_group != block_group) {
6632                                        btrfs_release_block_group(block_group,
6633                                                                  delalloc);
6634                                        block_group = used_block_group;
6635                                }
6636                                goto checks;
6637                        }
6638
6639                        WARN_ON(last_ptr->block_group != used_block_group);
6640release_cluster:
6641                        /* If we are on LOOP_NO_EMPTY_SIZE, we can't
6642                         * set up a new cluster, so let's just skip it
6643                         * and let the allocator find whatever block
6644                         * it can find.  If we reach this point, we
6645                         * will have tried the cluster allocator
6646                         * plenty of times and not have found
6647                         * anything, so we are likely way too
6648                         * fragmented for the clustering stuff to find
6649                         * anything.
6650                         *
6651                         * However, if the cluster is taken from the
6652                         * current block group, release the cluster
6653                         * first, so that we stand a better chance of
6654                         * succeeding in the unclustered
6655                         * allocation.  */
6656                        if (loop >= LOOP_NO_EMPTY_SIZE &&
6657                            used_block_group != block_group) {
6658                                spin_unlock(&last_ptr->refill_lock);
6659                                btrfs_release_block_group(used_block_group,
6660                                                          delalloc);
6661                                goto unclustered_alloc;
6662                        }
6663
6664                        /*
6665                         * this cluster didn't work out, free it and
6666                         * start over
6667                         */
6668                        btrfs_return_cluster_to_free_space(NULL, last_ptr);
6669
6670                        if (used_block_group != block_group)
6671                                btrfs_release_block_group(used_block_group,
6672                                                          delalloc);
6673refill_cluster:
6674                        if (loop >= LOOP_NO_EMPTY_SIZE) {
6675                                spin_unlock(&last_ptr->refill_lock);
6676                                goto unclustered_alloc;
6677                        }
6678
6679                        aligned_cluster = max_t(unsigned long,
6680                                                empty_cluster + empty_size,
6681                                              block_group->full_stripe_len);
6682
6683                        /* allocate a cluster in this block group */
6684                        ret = btrfs_find_space_cluster(root, block_group,
6685                                                       last_ptr, search_start,
6686                                                       num_bytes,
6687                                                       aligned_cluster);
6688                        if (ret == 0) {
6689                                /*
6690                                 * now pull our allocation out of this
6691                                 * cluster
6692                                 */
6693                                offset = btrfs_alloc_from_cluster(block_group,
6694                                                        last_ptr,
6695                                                        num_bytes,
6696                                                        search_start,
6697                                                        &max_extent_size);
6698                                if (offset) {
6699                                        /* we found one, proceed */
6700                                        spin_unlock(&last_ptr->refill_lock);
6701                                        trace_btrfs_reserve_extent_cluster(root,
6702                                                block_group, search_start,
6703                                                num_bytes);
6704                                        goto checks;
6705                                }
6706                        } else if (!cached && loop > LOOP_CACHING_NOWAIT
6707                                   && !failed_cluster_refill) {
6708                                spin_unlock(&last_ptr->refill_lock);
6709
6710                                failed_cluster_refill = true;
6711                                wait_block_group_cache_progress(block_group,
6712                                       num_bytes + empty_cluster + empty_size);
6713                                goto have_block_group;
6714                        }
6715
6716                        /*
6717                         * at this point we either didn't find a cluster
6718                         * or we weren't able to allocate a block from our
6719                         * cluster.  Free the cluster we've been trying
6720                         * to use, and go to the next block group
6721                         */
6722                        btrfs_return_cluster_to_free_space(NULL, last_ptr);
6723                        spin_unlock(&last_ptr->refill_lock);
6724                        goto loop;
6725                }
6726
6727unclustered_alloc:
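                /*
                 * The cluster didn't pan out (or isn't in use).  Check the
                 * cheap free space counter first so we can skip this block
                 * group without a full free space search when it obviously
                 * cannot satisfy the request.
                 */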
6728                spin_lock(&block_group->free_space_ctl->tree_lock);
6729                if (cached &&
6730                    block_group->free_space_ctl->free_space <
6731                    num_bytes + empty_cluster + empty_size) {
6732                        if (block_group->free_space_ctl->free_space >
6733                            max_extent_size)
6734                                max_extent_size =
6735                                        block_group->free_space_ctl->free_space;
6736                        spin_unlock(&block_group->free_space_ctl->tree_lock);
6737                        goto loop;
6738                }
6739                spin_unlock(&block_group->free_space_ctl->tree_lock);
6740
6741                offset = btrfs_find_space_for_alloc(block_group, search_start,
6742                                                    num_bytes, empty_size,
6743                                                    &max_extent_size);
6744                /*
6745                 * If we didn't find a chunk, and we haven't failed on this
6746                 * block group before, and this block group is in the middle of
6747                 * caching and we are ok with waiting, then go ahead and wait
6748                 * for progress to be made, and set failed_alloc to true.
6749                 *
6750                 * If failed_alloc is true then we've already waited on this
6751                 * block group once and should move on to the next block group.
6752                 */
6753                if (!offset && !failed_alloc && !cached &&
6754                    loop > LOOP_CACHING_NOWAIT) {
6755                        wait_block_group_cache_progress(block_group,
6756                                                num_bytes + empty_size);
6757                        failed_alloc = true;
6758                        goto have_block_group;
6759                } else if (!offset) {
6760                        if (!cached)
6761                                have_caching_bg = true;
6762                        goto loop;
6763                }
6764checks:
6765                search_start = ALIGN(offset, root->stripesize);
6766
6767                /* move on to the next group */
6768                if (search_start + num_bytes >
6769                    block_group->key.objectid + block_group->key.offset) {
6770                        btrfs_add_free_space(block_group, offset, num_bytes);
6771                        goto loop;
6772                }
6773
6774                if (offset < search_start)
6775                        btrfs_add_free_space(block_group, offset,
6776                                             search_start - offset);
6777                BUG_ON(offset > search_start);
6778
6779                ret = btrfs_update_reserved_bytes(block_group, num_bytes,
6780                                                  alloc_type, delalloc);
6781                if (ret == -EAGAIN) {
6782                        btrfs_add_free_space(block_group, offset, num_bytes);
6783                        goto loop;
6784                }
6785
6786                /* we are all good, let's return */
6787                ins->objectid = search_start;
6788                ins->offset = num_bytes;
6789
6790                trace_btrfs_reserve_extent(orig_root, block_group,
6791                                           search_start, num_bytes);
6792                btrfs_release_block_group(block_group, delalloc);
6793                break;
6794loop:
6795                failed_cluster_refill = false;
6796                failed_alloc = false;
6797                BUG_ON(index != get_block_group_index(block_group));
6798                btrfs_release_block_group(block_group, delalloc);
6799        }
6800        up_read(&space_info->groups_sem);
6801
6802        if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
6803                goto search;
6804
6805        if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
6806                goto search;
6807
6808        /*
6809         * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
6810         *                      caching kthreads as we move along
6811         * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
6812         * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
6813         * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
6814         *                      again
6815         */
6816        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
6817                index = 0;
6818                loop++;
6819                if (loop == LOOP_ALLOC_CHUNK) {
6820                        struct btrfs_trans_handle *trans;
6821                        int exist = 0;
6822
6823                        trans = current->journal_info;
6824                        if (trans)
6825                                exist = 1;
6826                        else
6827                                trans = btrfs_join_transaction(root);
6828
6829                        if (IS_ERR(trans)) {
6830                                ret = PTR_ERR(trans);
6831                                goto out;
6832                        }
6833
6834                        ret = do_chunk_alloc(trans, root, flags,
6835                                             CHUNK_ALLOC_FORCE);
6836                        /*
6837                         * Do not bail out on ENOSPC since we
6838                         * can do more things.
6839                         */
6840                        if (ret < 0 && ret != -ENOSPC)
6841                                btrfs_abort_transaction(trans,
6842                                                        root, ret);
6843                        else
6844                                ret = 0;
6845                        if (!exist)
6846                                btrfs_end_transaction(trans, root);
6847                        if (ret)
6848                                goto out;
6849                }
6850
6851                if (loop == LOOP_NO_EMPTY_SIZE) {
6852                        empty_size = 0;
6853                        empty_cluster = 0;
6854                }
6855
6856                goto search;
6857        } else if (!ins->objectid) {
6858                ret = -ENOSPC;
6859        } else if (ins->objectid) {
6860                ret = 0;
6861        }
6862out:
6863        if (ret == -ENOSPC)
6864                ins->offset = max_extent_size;
6865        return ret;
6866}
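
/*
 * A minimal sketch of the stage escalation implemented above; the loop
 * index mirrors the LOOP_* values documented in the comment block, and
 * try_alloc() is a hypothetical callback standing in for one full pass
 * over the block groups - it is not a btrfs API.
 */
static int sketch_alloc_with_escalation(bool (*try_alloc)(int stage, void *ctx),
					void *ctx)
{
	int stage;

	/* LOOP_CACHING_NOWAIT .. LOOP_NO_EMPTY_SIZE, one pass per stage */
	for (stage = 0; stage <= 3; stage++)
		if (try_alloc(stage, ctx))
			return 0;
	return -ENOSPC;	/* every stage failed; max_extent_size is reported */
}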
6867
6868static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
6869                            int dump_block_groups)
6870{
6871        struct btrfs_block_group_cache *cache;
6872        int index = 0;
6873
6874        spin_lock(&info->lock);
6875        printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
6876               info->flags,
6877               info->total_bytes - info->bytes_used - info->bytes_pinned -
6878               info->bytes_reserved - info->bytes_readonly,
6879               (info->full) ? "" : "not ");
6880        printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
6881               "reserved=%llu, may_use=%llu, readonly=%llu\n",
6882               info->total_bytes, info->bytes_used, info->bytes_pinned,
6883               info->bytes_reserved, info->bytes_may_use,
6884               info->bytes_readonly);
6885        spin_unlock(&info->lock);
6886
6887        if (!dump_block_groups)
6888                return;
6889
6890        down_read(&info->groups_sem);
6891again:
6892        list_for_each_entry(cache, &info->block_groups[index], list) {
6893                spin_lock(&cache->lock);
6894                printk(KERN_INFO "BTRFS: "
6895                           "block group %llu has %llu bytes, "
6896                           "%llu used %llu pinned %llu reserved %s\n",
6897                       cache->key.objectid, cache->key.offset,
6898                       btrfs_block_group_used(&cache->item), cache->pinned,
6899                       cache->reserved, cache->ro ? "[readonly]" : "");
6900                btrfs_dump_free_space(cache, bytes);
6901                spin_unlock(&cache->lock);
6902        }
6903        if (++index < BTRFS_NR_RAID_TYPES)
6904                goto again;
6905        up_read(&info->groups_sem);
6906}
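
/*
 * The "free" figure printed by dump_space_info() is derived rather
 * than stored.  A self-contained sketch of the same arithmetic, with
 * an illustrative struct that mirrors only the fields used above:
 */
struct sketch_space_counters {
	u64 total_bytes;
	u64 bytes_used;
	u64 bytes_pinned;
	u64 bytes_reserved;
	u64 bytes_readonly;
};

static inline u64 sketch_space_info_free(const struct sketch_space_counters *s)
{
	return s->total_bytes - s->bytes_used - s->bytes_pinned -
	       s->bytes_reserved - s->bytes_readonly;
}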
6907
6908int btrfs_reserve_extent(struct btrfs_root *root,
6909                         u64 num_bytes, u64 min_alloc_size,
6910                         u64 empty_size, u64 hint_byte,
6911                         struct btrfs_key *ins, int is_data, int delalloc)
6912{
6913        bool final_tried = false;
6914        u64 flags;
6915        int ret;
6916
6917        flags = btrfs_get_alloc_profile(root, is_data);
6918again:
6919        WARN_ON(num_bytes < root->sectorsize);
6920        ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
6921                               flags, delalloc);
6922
6923        if (ret == -ENOSPC) {
6924                if (!final_tried && ins->offset) {
6925                        num_bytes = min(num_bytes >> 1, ins->offset);
6926                        num_bytes = round_down(num_bytes, root->sectorsize);
6927                        num_bytes = max(num_bytes, min_alloc_size);
6928                        if (num_bytes == min_alloc_size)
6929                                final_tried = true;
6930                        goto again;
6931                } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
6932                        struct btrfs_space_info *sinfo;
6933
6934                        sinfo = __find_space_info(root->fs_info, flags);
6935                        btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
6936                                flags, num_bytes);
6937                        if (sinfo)
6938                                dump_space_info(sinfo, num_bytes, 1);
6939                }
6940        }
6941
6942        return ret;
6943}
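
/*
 * A sketch of the -ENOSPC retry above: halve the request, cap it at
 * the largest free extent the failed search reported back through
 * ins->offset, keep it sector-aligned, and never go below
 * min_alloc_size.  Names are illustrative; the arithmetic mirrors the
 * "again:" path.
 */
static inline u64 sketch_next_alloc_size(u64 num_bytes, u64 max_extent_size,
					 u64 min_alloc_size, u64 sectorsize)
{
	u64 n = num_bytes >> 1;

	if (n > max_extent_size)	/* min(num_bytes >> 1, ins->offset) */
		n = max_extent_size;
	n &= ~(sectorsize - 1);		/* round_down(): sectorsize is a power of two */
	if (n < min_alloc_size)		/* max(n, min_alloc_size) */
		n = min_alloc_size;
	return n;	/* n == min_alloc_size means this is the final try */
}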
6944
6945static int __btrfs_free_reserved_extent(struct btrfs_root *root,
6946                                        u64 start, u64 len,
6947                                        int pin, int delalloc)
6948{
6949        struct btrfs_block_group_cache *cache;
6950        int ret = 0;
6951
6952        cache = btrfs_lookup_block_group(root->fs_info, start);
6953        if (!cache) {
6954                btrfs_err(root->fs_info, "Unable to find block group for %llu",
6955                        start);
6956                return -ENOSPC;
6957        }
6958
6959        if (btrfs_test_opt(root, DISCARD))
6960                ret = btrfs_discard_extent(root, start, len, NULL);
6961
6962        if (pin)
6963                pin_down_extent(root, cache, start, len, 1);
6964        else {
6965                btrfs_add_free_space(cache, start, len);
6966                btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
6967        }
6968        btrfs_put_block_group(cache);
6969
6970        trace_btrfs_reserved_extent_free(root, start, len);
6971
6972        return ret;
6973}
6974
6975int btrfs_free_reserved_extent(struct btrfs_root *root,
6976                               u64 start, u64 len, int delalloc)
6977{
6978        return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
6979}
6980
6981int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
6982                                       u64 start, u64 len)
6983{
6984        return __btrfs_free_reserved_extent(root, start, len, 1, 0);
6985}
6986
6987static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6988                                      struct btrfs_root *root,
6989                                      u64 parent, u64 root_objectid,
6990                                      u64 flags, u64 owner, u64 offset,
6991                                      struct btrfs_key *ins, int ref_mod)
6992{
6993        int ret;
6994        struct btrfs_fs_info *fs_info = root->fs_info;
6995        struct btrfs_extent_item *extent_item;
6996        struct btrfs_extent_inline_ref *iref;
6997        struct btrfs_path *path;
6998        struct extent_buffer *leaf;
6999        int type;
7000        u32 size;
7001
7002        if (parent > 0)
7003                type = BTRFS_SHARED_DATA_REF_KEY;
7004        else
7005                type = BTRFS_EXTENT_DATA_REF_KEY;
7006
7007        size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
7008
7009        path = btrfs_alloc_path();
7010        if (!path)
7011                return -ENOMEM;
7012
7013        path->leave_spinning = 1;
7014        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7015                                      ins, size);
7016        if (ret) {
7017                btrfs_free_path(path);
7018                return ret;
7019        }
7020
7021        leaf = path->nodes[0];
7022        extent_item = btrfs_item_ptr(leaf, path->slots[0],
7023                                     struct btrfs_extent_item);
7024        btrfs_set_extent_refs(leaf, extent_item, ref_mod);
7025        btrfs_set_extent_generation(leaf, extent_item, trans->transid);
7026        btrfs_set_extent_flags(leaf, extent_item,
7027                               flags | BTRFS_EXTENT_FLAG_DATA);
7028
7029        iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7030        btrfs_set_extent_inline_ref_type(leaf, iref, type);
7031        if (parent > 0) {
7032                struct btrfs_shared_data_ref *ref;
7033                ref = (struct btrfs_shared_data_ref *)(iref + 1);
7034                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
7035                btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
7036        } else {
7037                struct btrfs_extent_data_ref *ref;
7038                ref = (struct btrfs_extent_data_ref *)(&iref->offset);
7039                btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
7040                btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
7041                btrfs_set_extent_data_ref_offset(leaf, ref, offset);
7042                btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
7043        }
7044
7045        btrfs_mark_buffer_dirty(path->nodes[0]);
7046        btrfs_free_path(path);
7047
7048        /* Always set parent to 0 here since it's exclusive anyway. */
7049        ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
7050                                      ins->objectid, ins->offset,
7051                                      BTRFS_QGROUP_OPER_ADD_EXCL, 0);
7052        if (ret)
7053                return ret;
7054
7055        ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
7056        if (ret) { /* -ENOENT, logic error */
7057                btrfs_err(fs_info, "update block group failed for %llu %llu",
7058                        ins->objectid, ins->offset);
7059                BUG();
7060        }
7061        trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
7062        return ret;
7063}
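
/*
 * Sketch of the item sizing used above.  The inserted item is one
 * btrfs_extent_item followed in-line by a single back reference whose
 * size depends on the ref type: a shared data ref follows the whole
 * inline ref header, while an extent data ref overlays the header's
 * 64bit offset field (see the casts above).  The struct mirrors here
 * are illustrative, not the on-disk ABI definitions.
 */
struct sketch_inline_ref { u8 type; __le64 offset; } __attribute__((packed));
struct sketch_extent_data_ref {
	__le64 root;
	__le64 objectid;
	__le64 offset;
	__le32 count;
} __attribute__((packed));
struct sketch_shared_data_ref { __le32 count; } __attribute__((packed));

static inline u32 sketch_data_ref_item_size(u64 parent, u32 extent_item_size)
{
	if (parent > 0)		/* BTRFS_SHARED_DATA_REF_KEY */
		return extent_item_size + sizeof(struct sketch_inline_ref) +
		       sizeof(struct sketch_shared_data_ref);
	/* BTRFS_EXTENT_DATA_REF_KEY: the data ref replaces iref->offset */
	return extent_item_size + sizeof(struct sketch_inline_ref) -
	       sizeof(__le64) + sizeof(struct sketch_extent_data_ref);
}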
7064
7065static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7066                                     struct btrfs_root *root,
7067                                     u64 parent, u64 root_objectid,
7068                                     u64 flags, struct btrfs_disk_key *key,
7069                                     int level, struct btrfs_key *ins,
7070                                     int no_quota)
7071{
7072        int ret;
7073        struct btrfs_fs_info *fs_info = root->fs_info;
7074        struct btrfs_extent_item *extent_item;
7075        struct btrfs_tree_block_info *block_info;
7076        struct btrfs_extent_inline_ref *iref;
7077        struct btrfs_path *path;
7078        struct extent_buffer *leaf;
7079        u32 size = sizeof(*extent_item) + sizeof(*iref);
7080        u64 num_bytes = ins->offset;
7081        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
7082                                                 SKINNY_METADATA);
7083
7084        if (!skinny_metadata)
7085                size += sizeof(*block_info);
7086
7087        path = btrfs_alloc_path();
7088        if (!path) {
7089                btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7090                                                   root->nodesize);
7091                return -ENOMEM;
7092        }
7093
7094        path->leave_spinning = 1;
7095        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7096                                      ins, size);
7097        if (ret) {
7098                btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7099                                                   root->nodesize);
7100                btrfs_free_path(path);
7101                return ret;
7102        }
7103
7104        leaf = path->nodes[0];
7105        extent_item = btrfs_item_ptr(leaf, path->slots[0],
7106                                     struct btrfs_extent_item);
7107        btrfs_set_extent_refs(leaf, extent_item, 1);
7108        btrfs_set_extent_generation(leaf, extent_item, trans->transid);
7109        btrfs_set_extent_flags(leaf, extent_item,
7110                               flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
7111
7112        if (skinny_metadata) {
7113                iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7114                num_bytes = root->nodesize;
7115        } else {
7116                block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
7117                btrfs_set_tree_block_key(leaf, block_info, key);
7118                btrfs_set_tree_block_level(leaf, block_info, level);
7119                iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
7120        }
7121
7122        if (parent > 0) {
7123                BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
7124                btrfs_set_extent_inline_ref_type(leaf, iref,
7125                                                 BTRFS_SHARED_BLOCK_REF_KEY);
7126                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
7127        } else {
7128                btrfs_set_extent_inline_ref_type(leaf, iref,
7129                                                 BTRFS_TREE_BLOCK_REF_KEY);
7130                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
7131        }
7132
7133        btrfs_mark_buffer_dirty(leaf);
7134        btrfs_free_path(path);
7135
7136        if (!no_quota) {
7137                ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
7138                                              ins->objectid, num_bytes,
7139                                              BTRFS_QGROUP_OPER_ADD_EXCL, 0);
7140                if (ret)
7141                        return ret;
7142        }
7143
7144        ret = update_block_group(trans, root, ins->objectid, root->nodesize,
7145                                 1);
7146        if (ret) { /* -ENOENT, logic error */
7147                btrfs_err(fs_info, "update block group failed for %llu %llu",
7148                        ins->objectid, ins->offset);
7149                BUG();
7150        }
7151
7152        trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
7153        return ret;
7154}
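
/*
 * The skinny-metadata branch above omits struct btrfs_tree_block_info
 * entirely: with a skinny key the extent's level lives in the key
 * offset, so only the extent item plus one inline ref hit the disk.
 * A small sketch of the size difference (the 17 + 1 bytes stand for a
 * disk key plus a level byte, for illustration only):
 */
static inline u32 sketch_metadata_item_size(u32 extent_item_size,
					    u32 inline_ref_size, bool skinny)
{
	u32 size = extent_item_size + inline_ref_size;

	if (!skinny)
		size += 17 + 1;	/* btrfs_disk_key + u8 level (tree_block_info) */
	return size;
}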
7155
7156int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7157                                     struct btrfs_root *root,
7158                                     u64 root_objectid, u64 owner,
7159                                     u64 offset, struct btrfs_key *ins)
7160{
7161        int ret;
7162
7163        BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
7164
7165        ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
7166                                         ins->offset, 0,
7167                                         root_objectid, owner, offset,
7168                                         BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
7169        return ret;
7170}
7171
7172/*
7173 * this is used by the tree logging recovery code.  It records that
7174 * an extent has been allocated and makes sure to clear the free
7175 * space cache bits as well
7176 */
7177int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
7178                                   struct btrfs_root *root,
7179                                   u64 root_objectid, u64 owner, u64 offset,
7180                                   struct btrfs_key *ins)
7181{
7182        int ret;
7183        struct btrfs_block_group_cache *block_group;
7184
7185        /*
7186         * Mixed block groups will exclude before processing the log so we only
7187         * need to do the exclude dance if this fs isn't mixed.
7188         */
7189        if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
7190                ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
7191                if (ret)
7192                        return ret;
7193        }
7194
7195        block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
7196        if (!block_group)
7197                return -EINVAL;
7198
7199        ret = btrfs_update_reserved_bytes(block_group, ins->offset,
7200                                          RESERVE_ALLOC_NO_ACCOUNT, 0);
7201        BUG_ON(ret); /* logic error */
7202        ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
7203                                         0, owner, offset, ins, 1);
7204        btrfs_put_block_group(block_group);
7205        return ret;
7206}
7207
7208static struct extent_buffer *
7209btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
7210                      u64 bytenr, int level)
7211{
7212        struct extent_buffer *buf;
7213
7214        buf = btrfs_find_create_tree_block(root, bytenr);
7215        if (!buf)
7216                return ERR_PTR(-ENOMEM);
7217        btrfs_set_header_generation(buf, trans->transid);
7218        btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
7219        btrfs_tree_lock(buf);
7220        clean_tree_block(trans, root, buf);
7221        clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
7222
7223        btrfs_set_lock_blocking(buf);
7224        btrfs_set_buffer_uptodate(buf);
7225
7226        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
7227                buf->log_index = root->log_transid % 2;
7228                /*
7229                 * we allow two log transactions at a time, so use a different
7230                 * EXTENT bit to differentiate dirty pages.
7231                 */
7232                if (buf->log_index == 0)
7233                        set_extent_dirty(&root->dirty_log_pages, buf->start,
7234                                        buf->start + buf->len - 1, GFP_NOFS);
7235                else
7236                        set_extent_new(&root->dirty_log_pages, buf->start,
7237                                        buf->start + buf->len - 1, GFP_NOFS);
7238        } else {
7239                buf->log_index = -1;
7240                set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
7241                         buf->start + buf->len - 1, GFP_NOFS);
7242        }
7243        trans->blocks_used++;
7244        /* this returns a buffer locked for blocking */
7245        return buf;
7246}
7247
7248static struct btrfs_block_rsv *
7249use_block_rsv(struct btrfs_trans_handle *trans,
7250              struct btrfs_root *root, u32 blocksize)
7251{
7252        struct btrfs_block_rsv *block_rsv;
7253        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
7254        int ret;
7255        bool global_updated = false;
7256
7257        block_rsv = get_block_rsv(trans, root);
7258
7259        if (unlikely(block_rsv->size == 0))
7260                goto try_reserve;
7261again:
7262        ret = block_rsv_use_bytes(block_rsv, blocksize);
7263        if (!ret)
7264                return block_rsv;
7265
7266        if (block_rsv->failfast)
7267                return ERR_PTR(ret);
7268
7269        if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
7270                global_updated = true;
7271                update_global_block_rsv(root->fs_info);
7272                goto again;
7273        }
7274
7275        if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
7276                static DEFINE_RATELIMIT_STATE(_rs,
7277                                DEFAULT_RATELIMIT_INTERVAL * 10,
7278                                /*DEFAULT_RATELIMIT_BURST*/ 1);
7279                if (__ratelimit(&_rs))
7280                        WARN(1, KERN_DEBUG
7281                                "BTRFS: block rsv returned %d\n", ret);
7282        }
7283try_reserve:
7284        ret = reserve_metadata_bytes(root, block_rsv, blocksize,
7285                                     BTRFS_RESERVE_NO_FLUSH);
7286        if (!ret)
7287                return block_rsv;
7288        /*
7289         * If we couldn't reserve metadata bytes, try to use some from
7290         * the global reserve if its space info is the same as the global
7291         * reservation's.
7292         */
7293        if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
7294            block_rsv->space_info == global_rsv->space_info) {
7295                ret = block_rsv_use_bytes(global_rsv, blocksize);
7296                if (!ret)
7297                        return global_rsv;
7298        }
7299        return ERR_PTR(ret);
7300}
7301
7302static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
7303                            struct btrfs_block_rsv *block_rsv, u32 blocksize)
7304{
7305        block_rsv_add_bytes(block_rsv, blocksize, 0);
7306        block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
7307}
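
/*
 * The order of attempts in use_block_rsv() above, reduced to a sketch
 * with hypothetical callbacks.  They stand in for block_rsv_use_bytes()
 * and reserve_metadata_bytes(); the retry after refreshing the global
 * rsv is folded into the first callback, and the error code here is
 * illustrative.
 */
static int sketch_pick_rsv(bool (*use_local)(void),
			   bool (*reserve_no_flush)(void),
			   bool (*steal_from_global)(void))
{
	if (use_local())		/* the transaction's own reserve */
		return 0;
	if (reserve_no_flush())		/* BTRFS_RESERVE_NO_FLUSH attempt */
		return 0;
	if (steal_from_global())	/* same space_info? take global bytes */
		return 0;
	return -ENOSPC;			/* caller gets ERR_PTR(ret) */
}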
7308
7309/*
7310 * finds a free extent and does all the dirty work required for allocation.
7311 * returns the key for the extent through ins, and a tree buffer for
7312 * the first block of the extent through buf.
7313 *
7314 * returns the tree buffer or an ERR_PTR on error.
7315 */
7316struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
7317                                        struct btrfs_root *root,
7318                                        u64 parent, u64 root_objectid,
7319                                        struct btrfs_disk_key *key, int level,
7320                                        u64 hint, u64 empty_size)
7321{
7322        struct btrfs_key ins;
7323        struct btrfs_block_rsv *block_rsv;
7324        struct extent_buffer *buf;
7325        u64 flags = 0;
7326        int ret;
7327        u32 blocksize = root->nodesize;
7328        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
7329                                                 SKINNY_METADATA);
7330
7331        if (btrfs_test_is_dummy_root(root)) {
7332                buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
7333                                            level);
7334                if (!IS_ERR(buf))
7335                        root->alloc_bytenr += blocksize;
7336                return buf;
7337        }
7338
7339        block_rsv = use_block_rsv(trans, root, blocksize);
7340        if (IS_ERR(block_rsv))
7341                return ERR_CAST(block_rsv);
7342
7343        ret = btrfs_reserve_extent(root, blocksize, blocksize,
7344                                   empty_size, hint, &ins, 0, 0);
7345        if (ret) {
7346                unuse_block_rsv(root->fs_info, block_rsv, blocksize);
7347                return ERR_PTR(ret);
7348        }
7349
7350        buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
7351        BUG_ON(IS_ERR(buf)); /* -ENOMEM */
7352
7353        if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
7354                if (parent == 0)
7355                        parent = ins.objectid;
7356                flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
7357        } else
7358                BUG_ON(parent > 0);
7359
7360        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
7361                struct btrfs_delayed_extent_op *extent_op;
7362                extent_op = btrfs_alloc_delayed_extent_op();
7363                BUG_ON(!extent_op); /* -ENOMEM */
7364                if (key)
7365                        memcpy(&extent_op->key, key, sizeof(extent_op->key));
7366                else
7367                        memset(&extent_op->key, 0, sizeof(extent_op->key));
7368                extent_op->flags_to_set = flags;
7369                if (skinny_metadata)
7370                        extent_op->update_key = 0;
7371                else
7372                        extent_op->update_key = 1;
7373                extent_op->update_flags = 1;
7374                extent_op->is_data = 0;
7375                extent_op->level = level;
7376
7377                ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
7378                                        ins.objectid,
7379                                        ins.offset, parent, root_objectid,
7380                                        level, BTRFS_ADD_DELAYED_EXTENT,
7381                                        extent_op, 0);
7382                BUG_ON(ret); /* -ENOMEM */
7383        }
7384        return buf;
7385}
7386
7387struct walk_control {
7388        u64 refs[BTRFS_MAX_LEVEL];
7389        u64 flags[BTRFS_MAX_LEVEL];
7390        struct btrfs_key update_progress;
7391        int stage;
7392        int level;
7393        int shared_level;
7394        int update_ref;
7395        int keep_locks;
7396        int reada_slot;
7397        int reada_count;
7398        int for_reloc;
7399};
7400
7401#define DROP_REFERENCE  1
7402#define UPDATE_BACKREF  2
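
/*
 * A minimal sketch (not from the original file) of how these two
 * stages interact in the walk below: do_walk_down() flips a shared
 * subtree into UPDATE_BACKREF, and walk_up_proc() flips back to
 * DROP_REFERENCE once the shared level has been fully processed.
 */
static inline int sketch_next_stage(int stage, bool found_shared_to_update,
				    bool finished_shared_subtree)
{
	if (stage == DROP_REFERENCE && found_shared_to_update)
		return UPDATE_BACKREF;	/* see do_walk_down() */
	if (stage == UPDATE_BACKREF && finished_shared_subtree)
		return DROP_REFERENCE;	/* see walk_up_proc() */
	return stage;
}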
7403
7404static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
7405                                     struct btrfs_root *root,
7406                                     struct walk_control *wc,
7407                                     struct btrfs_path *path)
7408{
7409        u64 bytenr;
7410        u64 generation;
7411        u64 refs;
7412        u64 flags;
7413        u32 nritems;
7414        u32 blocksize;
7415        struct btrfs_key key;
7416        struct extent_buffer *eb;
7417        int ret;
7418        int slot;
7419        int nread = 0;
7420
7421        if (path->slots[wc->level] < wc->reada_slot) {
7422                wc->reada_count = wc->reada_count * 2 / 3;
7423                wc->reada_count = max(wc->reada_count, 2);
7424        } else {
7425                wc->reada_count = wc->reada_count * 3 / 2;
7426                wc->reada_count = min_t(int, wc->reada_count,
7427                                        BTRFS_NODEPTRS_PER_BLOCK(root));
7428        }
7429
7430        eb = path->nodes[wc->level];
7431        nritems = btrfs_header_nritems(eb);
7432        blocksize = root->nodesize;
7433
7434        for (slot = path->slots[wc->level]; slot < nritems; slot++) {
7435                if (nread >= wc->reada_count)
7436                        break;
7437
7438                cond_resched();
7439                bytenr = btrfs_node_blockptr(eb, slot);
7440                generation = btrfs_node_ptr_generation(eb, slot);
7441
7442                if (slot == path->slots[wc->level])
7443                        goto reada;
7444
7445                if (wc->stage == UPDATE_BACKREF &&
7446                    generation <= root->root_key.offset)
7447                        continue;
7448
7449                /* We don't lock the tree block, it's OK to be racy here */
7450                ret = btrfs_lookup_extent_info(trans, root, bytenr,
7451                                               wc->level - 1, 1, &refs,
7452                                               &flags);
7453                /* We don't care about errors in readahead. */
7454                if (ret < 0)
7455                        continue;
7456                BUG_ON(refs == 0);
7457
7458                if (wc->stage == DROP_REFERENCE) {
7459                        if (refs == 1)
7460                                goto reada;
7461
7462                        if (wc->level == 1 &&
7463                            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7464                                continue;
7465                        if (!wc->update_ref ||
7466                            generation <= root->root_key.offset)
7467                                continue;
7468                        btrfs_node_key_to_cpu(eb, &key, slot);
7469                        ret = btrfs_comp_cpu_keys(&key,
7470                                                  &wc->update_progress);
7471                        if (ret < 0)
7472                                continue;
7473                } else {
7474                        if (wc->level == 1 &&
7475                            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7476                                continue;
7477                }
7478reada:
7479                readahead_tree_block(root, bytenr);
7480                nread++;
7481        }
7482        wc->reada_slot = slot;
7483}
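
/*
 * The readahead window above adapts multiplicatively: shrink to 2/3
 * while the walker is still inside the previous window, grow by 3/2
 * once it has passed it, clamped to [2, pointers-per-block].  The
 * bare arithmetic, as a self-contained sketch:
 */
static inline int sketch_adapt_reada_count(int count, bool still_behind,
					   int max_ptrs)
{
	if (still_behind) {
		count = count * 2 / 3;	/* walker hasn't consumed the window */
		if (count < 2)
			count = 2;
	} else {
		count = count * 3 / 2;	/* window exhausted: widen it */
		if (count > max_ptrs)
			count = max_ptrs;
	}
	return count;
}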
7484
7485static int account_leaf_items(struct btrfs_trans_handle *trans,
7486                              struct btrfs_root *root,
7487                              struct extent_buffer *eb)
7488{
7489        int nr = btrfs_header_nritems(eb);
7490        int i, extent_type, ret;
7491        struct btrfs_key key;
7492        struct btrfs_file_extent_item *fi;
7493        u64 bytenr, num_bytes;
7494
7495        for (i = 0; i < nr; i++) {
7496                btrfs_item_key_to_cpu(eb, &key, i);
7497
7498                if (key.type != BTRFS_EXTENT_DATA_KEY)
7499                        continue;
7500
7501                fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
7502                /* filter out non-qgroup-accountable extents */
7503                extent_type = btrfs_file_extent_type(eb, fi);
7504
7505                if (extent_type == BTRFS_FILE_EXTENT_INLINE)
7506                        continue;
7507
7508                bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
7509                if (!bytenr)
7510                        continue;
7511
7512                num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
7513
7514                ret = btrfs_qgroup_record_ref(trans, root->fs_info,
7515                                              root->objectid,
7516                                              bytenr, num_bytes,
7517                                              BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
7518                if (ret)
7519                        return ret;
7520        }
7521        return 0;
7522}
7523
7524/*
7525 * Walk up the tree from the bottom, freeing leaves and any interior
7526 * nodes which have had all slots visited. If a node (leaf or
7527 * interior) is freed, the node above it will have its slot
7528 * incremented. The root node will never be freed.
7529 *
7530 * At the end of this function, we should have a path which has all
7531 * slots incremented to the next position for a search. If we need to
7532 * read a new node it will be NULL and the node above it will have the
7533 * correct slot selected for a later read.
7534 *
7535 * If we increment the root node's slot counter past the number of
7536 * elements, 1 is returned to signal completion of the search.
7537 */
7538static int adjust_slots_upwards(struct btrfs_root *root,
7539                                struct btrfs_path *path, int root_level)
7540{
7541        int level = 0;
7542        int nr, slot;
7543        struct extent_buffer *eb;
7544
7545        if (root_level == 0)
7546                return 1;
7547
7548        while (level <= root_level) {
7549                eb = path->nodes[level];
7550                nr = btrfs_header_nritems(eb);
7551                path->slots[level]++;
7552                slot = path->slots[level];
7553                if (slot >= nr || level == 0) {
7554                        /*
7555                         * Don't free the root - we will detect this
7556                         * condition after our loop and return a
7557                         * positive value for the caller to stop walking the tree.
7558                         */
7559                        if (level != root_level) {
7560                                btrfs_tree_unlock_rw(eb, path->locks[level]);
7561                                path->locks[level] = 0;
7562
7563                                free_extent_buffer(eb);
7564                                path->nodes[level] = NULL;
7565                                path->slots[level] = 0;
7566                        }
7567                } else {
7568                        /*
7569                         * We have a valid slot to walk back down
7570                         * from. Stop here so caller can process these
7571                         * new nodes.
7572                         */
7573                        break;
7574                }
7575
7576                level++;
7577        }
7578
7579        eb = path->nodes[root_level];
7580        if (path->slots[root_level] >= btrfs_header_nritems(eb))
7581                return 1;
7582
7583        return 0;
7584}
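
/*
 * Shape of the iterative walk that account_shared_subtree() below
 * builds from adjust_slots_upwards(): descend and account one leaf,
 * then let the slot adjustment decide whether to restart the descent
 * or stop.  The callbacks are hypothetical stand-ins for the btrfs
 * read/account helpers.
 */
static int sketch_walk_subtree(int (*descend_and_account)(void *ctx),
			       int (*advance)(void *ctx), void *ctx)
{
	for (;;) {
		int ret = descend_and_account(ctx);	/* walk_down body */

		if (ret)
			return ret;	/* error */
		if (advance(ctx))	/* adjust_slots_upwards() != 0 */
			return 0;	/* search complete */
	}
}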
7585
7586/*
7587 * root_eb is the subtree root and is locked before this function is called.
7588 */
7589static int account_shared_subtree(struct btrfs_trans_handle *trans,
7590                                  struct btrfs_root *root,
7591                                  struct extent_buffer *root_eb,
7592                                  u64 root_gen,
7593                                  int root_level)
7594{
7595        int ret = 0;
7596        int level;
7597        struct extent_buffer *eb = root_eb;
7598        struct btrfs_path *path = NULL;
7599
7600        BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
7601        BUG_ON(root_eb == NULL);
7602
7603        if (!root->fs_info->quota_enabled)
7604                return 0;
7605
7606        if (!extent_buffer_uptodate(root_eb)) {
7607                ret = btrfs_read_buffer(root_eb, root_gen);
7608                if (ret)
7609                        goto out;
7610        }
7611
7612        if (root_level == 0) {
7613                ret = account_leaf_items(trans, root, root_eb);
7614                goto out;
7615        }
7616
7617        path = btrfs_alloc_path();
7618        if (!path)
7619                return -ENOMEM;
7620
7621        /*
7622         * Walk down the tree.  Missing extent blocks are filled in as
7623         * we go. Metadata is accounted every time we read a new
7624         * extent block.
7625         *
7626         * When we reach a leaf, we account for file extent items in it,
7627         * walk back up the tree (adjusting slot pointers as we go)
7628         * and restart the search process.
7629         */
7630        extent_buffer_get(root_eb); /* For path */
7631        path->nodes[root_level] = root_eb;
7632        path->slots[root_level] = 0;
7633        path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
7634walk_down:
7635        level = root_level;
7636        while (level >= 0) {
7637                if (path->nodes[level] == NULL) {
7638                        int parent_slot;
7639                        u64 child_gen;
7640                        u64 child_bytenr;
7641
7642                        /* We need to get child blockptr/gen from
7643                         * parent before we can read it. */
7644                        eb = path->nodes[level + 1];
7645                        parent_slot = path->slots[level + 1];
7646                        child_bytenr = btrfs_node_blockptr(eb, parent_slot);
7647                        child_gen = btrfs_node_ptr_generation(eb, parent_slot);
7648
7649                        eb = read_tree_block(root, child_bytenr, child_gen);
7650                        if (!eb || !extent_buffer_uptodate(eb)) {
7651                                ret = -EIO;
7652                                goto out;
7653                        }
7654
7655                        path->nodes[level] = eb;
7656                        path->slots[level] = 0;
7657
7658                        btrfs_tree_read_lock(eb);
7659                        btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
7660                        path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
7661
7662                        ret = btrfs_qgroup_record_ref(trans, root->fs_info,
7663                                                root->objectid,
7664                                                child_bytenr,
7665                                                root->nodesize,
7666                                                BTRFS_QGROUP_OPER_SUB_SUBTREE,
7667                                                0);
7668                        if (ret)
7669                                goto out;
7670
7671                }
7672
7673                if (level == 0) {
7674                        ret = account_leaf_items(trans, root, path->nodes[level]);
7675                        if (ret)
7676                                goto out;
7677
7678                        /* Nonzero return here means we completed our search */
7679                        ret = adjust_slots_upwards(root, path, root_level);
7680                        if (ret)
7681                                break;
7682
7683                        /* Restart search with new slots */
7684                        goto walk_down;
7685                }
7686
7687                level--;
7688        }
7689
7690        ret = 0;
7691out:
7692        btrfs_free_path(path);
7693
7694        return ret;
7695}
7696
7697/*
7698 * helper to process tree block while walking down the tree.
7699 *
7700 * when wc->stage == UPDATE_BACKREF, this function updates
7701 * back refs for pointers in the block.
7702 *
7703 * NOTE: return value 1 means we should stop walking down.
7704 */
7705static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
7706                                   struct btrfs_root *root,
7707                                   struct btrfs_path *path,
7708                                   struct walk_control *wc, int lookup_info)
7709{
7710        int level = wc->level;
7711        struct extent_buffer *eb = path->nodes[level];
7712        u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
7713        int ret;
7714
7715        if (wc->stage == UPDATE_BACKREF &&
7716            btrfs_header_owner(eb) != root->root_key.objectid)
7717                return 1;
7718
7719        /*
7720         * when the reference count of a tree block is 1, it won't increase
7721         * again. once the full backref flag is set, we never clear it.
7722         */
7723        if (lookup_info &&
7724            ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
7725             (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
7726                BUG_ON(!path->locks[level]);
7727                ret = btrfs_lookup_extent_info(trans, root,
7728                                               eb->start, level, 1,
7729                                               &wc->refs[level],
7730                                               &wc->flags[level]);
7731                BUG_ON(ret == -ENOMEM);
7732                if (ret)
7733                        return ret;
7734                BUG_ON(wc->refs[level] == 0);
7735        }
7736
7737        if (wc->stage == DROP_REFERENCE) {
7738                if (wc->refs[level] > 1)
7739                        return 1;
7740
7741                if (path->locks[level] && !wc->keep_locks) {
7742                        btrfs_tree_unlock_rw(eb, path->locks[level]);
7743                        path->locks[level] = 0;
7744                }
7745                return 0;
7746        }
7747
7748        /* wc->stage == UPDATE_BACKREF */
7749        if (!(wc->flags[level] & flag)) {
7750                BUG_ON(!path->locks[level]);
7751                ret = btrfs_inc_ref(trans, root, eb, 1);
7752                BUG_ON(ret); /* -ENOMEM */
7753                ret = btrfs_dec_ref(trans, root, eb, 0);
7754                BUG_ON(ret); /* -ENOMEM */
7755                ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
7756                                                  eb->len, flag,
7757                                                  btrfs_header_level(eb), 0);
7758                BUG_ON(ret); /* -ENOMEM */
7759                wc->flags[level] |= flag;
7760        }
7761
7762        /*
7763         * the block is shared by multiple trees, so it's not good to
7764         * keep the tree lock
7765         */
7766        if (path->locks[level] && level > 0) {
7767                btrfs_tree_unlock_rw(eb, path->locks[level]);
7768                path->locks[level] = 0;
7769        }
7770        return 0;
7771}
7772
7773/*
7774 * helper to process tree block pointer.
7775 *
7776 * when wc->stage == DROP_REFERENCE, this function checks
7777 * reference count of the block pointed to. if the block
7778 * is shared and we need update back refs for the subtree
7779 * rooted at the block, this function changes wc->stage to
7780 * UPDATE_BACKREF. if the block is shared and there is no
7781 * need to update back, this function drops the reference
7782 * to the block.
7783 *
7784 * NOTE: return value 1 means we should stop walking down.
7785 */
7786static noinline int do_walk_down(struct btrfs_trans_handle *trans,
7787                                 struct btrfs_root *root,
7788                                 struct btrfs_path *path,
7789                                 struct walk_control *wc, int *lookup_info)
7790{
7791        u64 bytenr;
7792        u64 generation;
7793        u64 parent;
7794        u32 blocksize;
7795        struct btrfs_key key;
7796        struct extent_buffer *next;
7797        int level = wc->level;
7798        int reada = 0;
7799        int ret = 0;
7800        bool need_account = false;
7801
7802        generation = btrfs_node_ptr_generation(path->nodes[level],
7803                                               path->slots[level]);
7804        /*
7805         * if the lower level block was created before the snapshot
7806         * was created, we know there is no need to update back refs
7807         * for the subtree
7808         */
7809        if (wc->stage == UPDATE_BACKREF &&
7810            generation <= root->root_key.offset) {
7811                *lookup_info = 1;
7812                return 1;
7813        }
7814
7815        bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
7816        blocksize = root->nodesize;
7817
7818        next = btrfs_find_tree_block(root, bytenr);
7819        if (!next) {
7820                next = btrfs_find_create_tree_block(root, bytenr);
7821                if (!next)
7822                        return -ENOMEM;
7823                btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
7824                                               level - 1);
7825                reada = 1;
7826        }
7827        btrfs_tree_lock(next);
7828        btrfs_set_lock_blocking(next);
7829
7830        ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
7831                                       &wc->refs[level - 1],
7832                                       &wc->flags[level - 1]);
7833        if (ret < 0) {
7834                btrfs_tree_unlock(next);
7835                return ret;
7836        }
7837
7838        if (unlikely(wc->refs[level - 1] == 0)) {
7839                btrfs_err(root->fs_info, "Missing references.");
7840                BUG();
7841        }
7842        *lookup_info = 0;
7843
7844        if (wc->stage == DROP_REFERENCE) {
7845                if (wc->refs[level - 1] > 1) {
7846                        need_account = true;
7847                        if (level == 1 &&
7848                            (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7849                                goto skip;
7850
7851                        if (!wc->update_ref ||
7852                            generation <= root->root_key.offset)
7853                                goto skip;
7854
7855                        btrfs_node_key_to_cpu(path->nodes[level], &key,
7856                                              path->slots[level]);
7857                        ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
7858                        if (ret < 0)
7859                                goto skip;
7860
7861                        wc->stage = UPDATE_BACKREF;
7862                        wc->shared_level = level - 1;
7863                }
7864        } else {
7865                if (level == 1 &&
7866                    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7867                        goto skip;
7868        }
7869
7870        if (!btrfs_buffer_uptodate(next, generation, 0)) {
7871                btrfs_tree_unlock(next);
7872                free_extent_buffer(next);
7873                next = NULL;
7874                *lookup_info = 1;
7875        }
7876
7877        if (!next) {
7878                if (reada && level == 1)
7879                        reada_walk_down(trans, root, wc, path);
7880                next = read_tree_block(root, bytenr, generation);
7881                if (!next || !extent_buffer_uptodate(next)) {
7882                        free_extent_buffer(next);
7883                        return -EIO;
7884                }
7885                btrfs_tree_lock(next);
7886                btrfs_set_lock_blocking(next);
7887        }
7888
7889        level--;
7890        BUG_ON(level != btrfs_header_level(next));
7891        path->nodes[level] = next;
7892        path->slots[level] = 0;
7893        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7894        wc->level = level;
7895        if (wc->level == 1)
7896                wc->reada_slot = 0;
7897        return 0;
7898skip:
7899        wc->refs[level - 1] = 0;
7900        wc->flags[level - 1] = 0;
7901        if (wc->stage == DROP_REFERENCE) {
7902                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
7903                        parent = path->nodes[level]->start;
7904                } else {
7905                        BUG_ON(root->root_key.objectid !=
7906                               btrfs_header_owner(path->nodes[level]));
7907                        parent = 0;
7908                }
7909
7910                if (need_account) {
7911                        ret = account_shared_subtree(trans, root, next,
7912                                                     generation, level - 1);
7913                        if (ret) {
7914                                printk_ratelimited(KERN_ERR "BTRFS: %s Error "
7915                                        "%d accounting shared subtree. Quota "
7916                                        "is out of sync, rescan required.\n",
7917                                        root->fs_info->sb->s_id, ret);
7918                        }
7919                }
7920                ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
7921                                root->root_key.objectid, level - 1, 0, 0);
7922                BUG_ON(ret); /* -ENOMEM */
7923        }
7924        btrfs_tree_unlock(next);
7925        free_extent_buffer(next);
7926        *lookup_info = 1;
7927        return 1;
7928}
7929
7930/*
7931 * helper to process tree block while walking up the tree.
7932 *
7933 * when wc->stage == DROP_REFERENCE, this function drops
7934 * reference count on the block.
7935 *
7936 * when wc->stage == UPDATE_BACKREF, this function changes
7937 * wc->stage back to DROP_REFERENCE if we changed wc->stage
7938 * to UPDATE_BACKREF previously while processing the block.
7939 *
7940 * NOTE: return value 1 means we should stop walking up.
7941 */
7942static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
7943                                 struct btrfs_root *root,
7944                                 struct btrfs_path *path,
7945                                 struct walk_control *wc)
7946{
7947        int ret;
7948        int level = wc->level;
7949        struct extent_buffer *eb = path->nodes[level];
7950        u64 parent = 0;
7951
7952        if (wc->stage == UPDATE_BACKREF) {
7953                BUG_ON(wc->shared_level < level);
7954                if (level < wc->shared_level)
7955                        goto out;
7956
7957                ret = find_next_key(path, level + 1, &wc->update_progress);
7958                if (ret > 0)
7959                        wc->update_ref = 0;
7960
7961                wc->stage = DROP_REFERENCE;
7962                wc->shared_level = -1;
7963                path->slots[level] = 0;
7964
7965                /*
7966                 * check reference count again if the block isn't locked.
7967                 * we should start walking down the tree again if reference
7968                 * count is one.
7969                 */
7970                if (!path->locks[level]) {
7971                        BUG_ON(level == 0);
7972                        btrfs_tree_lock(eb);
7973                        btrfs_set_lock_blocking(eb);
7974                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7975
7976                        ret = btrfs_lookup_extent_info(trans, root,
7977                                                       eb->start, level, 1,
7978                                                       &wc->refs[level],
7979                                                       &wc->flags[level]);
7980                        if (ret < 0) {
7981                                btrfs_tree_unlock_rw(eb, path->locks[level]);
7982                                path->locks[level] = 0;
7983                                return ret;
7984                        }
7985                        BUG_ON(wc->refs[level] == 0);
7986                        if (wc->refs[level] == 1) {
7987                                btrfs_tree_unlock_rw(eb, path->locks[level]);
7988                                path->locks[level] = 0;
7989                                return 1;
7990                        }
7991                }
7992        }
7993
7994        /* wc->stage == DROP_REFERENCE */
7995        BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
7996
7997        if (wc->refs[level] == 1) {
7998                if (level == 0) {
7999                        if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8000                                ret = btrfs_dec_ref(trans, root, eb, 1);
8001                        else
8002                                ret = btrfs_dec_ref(trans, root, eb, 0);
8003                        BUG_ON(ret); /* -ENOMEM */
8004                        ret = account_leaf_items(trans, root, eb);
8005                        if (ret) {
8006                                printk_ratelimited(KERN_ERR "BTRFS: %s Error "
8007                                        "%d accounting leaf items. Quota "
8008                                        "is out of sync, rescan required.\n",
8009                                        root->fs_info->sb->s_id, ret);
8010                        }
8011                }
8012                /* make block locked assertion in clean_tree_block happy */
8013                if (!path->locks[level] &&
8014                    btrfs_header_generation(eb) == trans->transid) {
8015                        btrfs_tree_lock(eb);
8016                        btrfs_set_lock_blocking(eb);
8017                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8018                }
8019                clean_tree_block(trans, root, eb);
8020        }
8021
8022        if (eb == root->node) {
8023                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8024                        parent = eb->start;
8025                else
8026                        BUG_ON(root->root_key.objectid !=
8027                               btrfs_header_owner(eb));
8028        } else {
8029                if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8030                        parent = path->nodes[level + 1]->start;
8031                else
8032                        BUG_ON(root->root_key.objectid !=
8033                               btrfs_header_owner(path->nodes[level + 1]));
8034        }
8035
8036        btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
8037out:
8038        wc->refs[level] = 0;
8039        wc->flags[level] = 0;
8040        return 0;
8041}
8042
8043static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
8044                                   struct btrfs_root *root,
8045                                   struct btrfs_path *path,
8046                                   struct walk_control *wc)
8047{
8048        int level = wc->level;
8049        int lookup_info = 1;
8050        int ret;
8051
8052        while (level >= 0) {
8053                ret = walk_down_proc(trans, root, path, wc, lookup_info);
8054                if (ret > 0)
8055                        break;
8056
8057                if (level == 0)
8058                        break;
8059
8060                if (path->slots[level] >=
8061                    btrfs_header_nritems(path->nodes[level]))
8062                        break;
8063
8064                ret = do_walk_down(trans, root, path, wc, &lookup_info);
8065                if (ret > 0) {
8066                        path->slots[level]++;
8067                        continue;
8068                } else if (ret < 0)
8069                        return ret;
8070                level = wc->level;
8071        }
8072        return 0;
8073}
8074
8075static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
8076                                 struct btrfs_root *root,
8077                                 struct btrfs_path *path,
8078                                 struct walk_control *wc, int max_level)
8079{
8080        int level = wc->level;
8081        int ret;
8082
8083        path->slots[level] = btrfs_header_nritems(path->nodes[level]);
8084        while (level < max_level && path->nodes[level]) {
8085                wc->level = level;
8086                if (path->slots[level] + 1 <
8087                    btrfs_header_nritems(path->nodes[level])) {
8088                        path->slots[level]++;
8089                        return 0;
8090                } else {
8091                        ret = walk_up_proc(trans, root, path, wc);
8092                        if (ret > 0)
8093                                return 0;
8094
8095                        if (path->locks[level]) {
8096                                btrfs_tree_unlock_rw(path->nodes[level],
8097                                                     path->locks[level]);
8098                                path->locks[level] = 0;
8099                        }
8100                        free_extent_buffer(path->nodes[level]);
8101                        path->nodes[level] = NULL;
8102                        level++;
8103                }
8104        }
8105        return 1;
8106}
8107
8108/*
8109 * drop a subvolume tree.
8110 *
8111 * this function traverses the tree, freeing any blocks that are only
8112 * referenced by the tree.
8113 *
8114 * when a shared tree block is found, this function decreases its
8115 * reference count by one. if update_ref is true, this function
8116 * also makes sure backrefs for the shared block and all lower level
8117 * blocks are properly updated.
8118 *
8119 * If called with for_reloc == 0, may exit early with -EAGAIN
8120 */
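    /*
     * Illustrative sketch, not part of the original file: the cleaner
     * thread (see btrfs_clean_one_deleted_snapshot() in transaction.c)
     * drops a dead root roughly like this, relying on the dead root list
     * to retry after an -EAGAIN early exit:
     *
     *	if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV)
     *		ret = btrfs_drop_snapshot(root, NULL, 0, 0);
     *	else
     *		ret = btrfs_drop_snapshot(root, NULL, 1, 0);
     */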
8121int btrfs_drop_snapshot(struct btrfs_root *root,
8122                         struct btrfs_block_rsv *block_rsv, int update_ref,
8123                         int for_reloc)
8124{
8125        struct btrfs_path *path;
8126        struct btrfs_trans_handle *trans;
8127        struct btrfs_root *tree_root = root->fs_info->tree_root;
8128        struct btrfs_root_item *root_item = &root->root_item;
8129        struct walk_control *wc;
8130        struct btrfs_key key;
8131        int err = 0;
8132        int ret;
8133        int level;
8134        bool root_dropped = false;
8135
8136        btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
8137
8138        path = btrfs_alloc_path();
8139        if (!path) {
8140                err = -ENOMEM;
8141                goto out;
8142        }
8143
8144        wc = kzalloc(sizeof(*wc), GFP_NOFS);
8145        if (!wc) {
8146                btrfs_free_path(path);
8147                err = -ENOMEM;
8148                goto out;
8149        }
8150
8151        trans = btrfs_start_transaction(tree_root, 0);
8152        if (IS_ERR(trans)) {
8153                err = PTR_ERR(trans);
8154                goto out_free;
8155        }
8156
8157        if (block_rsv)
8158                trans->block_rsv = block_rsv;
8159
8160        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
8161                level = btrfs_header_level(root->node);
8162                path->nodes[level] = btrfs_lock_root_node(root);
8163                btrfs_set_lock_blocking(path->nodes[level]);
8164                path->slots[level] = 0;
8165                path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8166                memset(&wc->update_progress, 0,
8167                       sizeof(wc->update_progress));
8168        } else {
8169                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
8170                memcpy(&wc->update_progress, &key,
8171                       sizeof(wc->update_progress));
8172
8173                level = root_item->drop_level;
8174                BUG_ON(level == 0);
8175                path->lowest_level = level;
8176                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
8177                path->lowest_level = 0;
8178                if (ret < 0) {
8179                        err = ret;
8180                        goto out_end_trans;
8181                }
8182                WARN_ON(ret > 0);
8183
8184                /*
8185                 * unlock our path, this is safe because only this
8186                 * function is allowed to delete this snapshot
8187                 */
8188                btrfs_unlock_up_safe(path, 0);
8189
8190                level = btrfs_header_level(root->node);
8191                while (1) {
8192                        btrfs_tree_lock(path->nodes[level]);
8193                        btrfs_set_lock_blocking(path->nodes[level]);
8194                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8195
8196                        ret = btrfs_lookup_extent_info(trans, root,
8197                                                path->nodes[level]->start,
8198                                                level, 1, &wc->refs[level],
8199                                                &wc->flags[level]);
8200                        if (ret < 0) {
8201                                err = ret;
8202                                goto out_end_trans;
8203                        }
8204                        BUG_ON(wc->refs[level] == 0);
8205
8206                        if (level == root_item->drop_level)
8207                                break;
8208
8209                        btrfs_tree_unlock(path->nodes[level]);
8210                        path->locks[level] = 0;
8211                        WARN_ON(wc->refs[level] != 1);
8212                        level--;
8213                }
8214        }
8215
8216        wc->level = level;
8217        wc->shared_level = -1;
8218        wc->stage = DROP_REFERENCE;
8219        wc->update_ref = update_ref;
8220        wc->keep_locks = 0;
8221        wc->for_reloc = for_reloc;
8222        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
8223
8224        while (1) {
8225
8226                ret = walk_down_tree(trans, root, path, wc);
8227                if (ret < 0) {
8228                        err = ret;
8229                        break;
8230                }
8231
8232                ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
8233                if (ret < 0) {
8234                        err = ret;
8235                        break;
8236                }
8237
8238                if (ret > 0) {
8239                        BUG_ON(wc->stage != DROP_REFERENCE);
8240                        break;
8241                }
8242
8243                if (wc->stage == DROP_REFERENCE) {
8244                        level = wc->level;
8245                        btrfs_node_key(path->nodes[level],
8246                                       &root_item->drop_progress,
8247                                       path->slots[level]);
8248                        root_item->drop_level = level;
8249                }
8250
8251                BUG_ON(wc->level == 0);
8252                if (btrfs_should_end_transaction(trans, tree_root) ||
8253                    (!for_reloc && btrfs_need_cleaner_sleep(root))) {
8254                        ret = btrfs_update_root(trans, tree_root,
8255                                                &root->root_key,
8256                                                root_item);
8257                        if (ret) {
8258                                btrfs_abort_transaction(trans, tree_root, ret);
8259                                err = ret;
8260                                goto out_end_trans;
8261                        }
8262
8263                        /*
8264                         * Qgroup update accounting is run from
8265                         * delayed ref handling. This usually works
8266                         * out because delayed refs are normally the
8267                         * only way qgroup updates are added. However,
8268                         * we may have added updates during our tree
8269                         * walk so run qgroups here to make sure we
8270                         * don't lose any updates.
8271                         */
8272                        ret = btrfs_delayed_qgroup_accounting(trans,
8273                                                              root->fs_info);
8274                        if (ret)
8275                                printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
8276                                                   "running qgroup updates "
8277                                                   "during snapshot delete. "
8278                                                   "Quota is out of sync, "
8279                                                   "rescan required.\n", ret);
8280
8281                        btrfs_end_transaction_throttle(trans, tree_root);
8282                        if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
8283                                pr_debug("BTRFS: drop snapshot early exit\n");
8284                                err = -EAGAIN;
8285                                goto out_free;
8286                        }
8287
8288                        trans = btrfs_start_transaction(tree_root, 0);
8289                        if (IS_ERR(trans)) {
8290                                err = PTR_ERR(trans);
8291                                goto out_free;
8292                        }
8293                        if (block_rsv)
8294                                trans->block_rsv = block_rsv;
8295                }
8296        }
8297        btrfs_release_path(path);
8298        if (err)
8299                goto out_end_trans;
8300
8301        ret = btrfs_del_root(trans, tree_root, &root->root_key);
8302        if (ret) {
8303                btrfs_abort_transaction(trans, tree_root, ret);
8304                goto out_end_trans;
8305        }
8306
8307        if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
8308                ret = btrfs_find_root(tree_root, &root->root_key, path,
8309                                      NULL, NULL);
8310                if (ret < 0) {
8311                        btrfs_abort_transaction(trans, tree_root, ret);
8312                        err = ret;
8313                        goto out_end_trans;
8314                } else if (ret > 0) {
8315                        /* if we fail to delete the orphan item this time
8316                         * around, it'll get picked up the next time.
8317                         *
8318                         * The most common failure here is just -ENOENT.
8319                         */
8320                        btrfs_del_orphan_item(trans, tree_root,
8321                                              root->root_key.objectid);
8322                }
8323        }
8324
8325        if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
8326                btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
8327        } else {
8328                free_extent_buffer(root->node);
8329                free_extent_buffer(root->commit_root);
8330                btrfs_put_fs_root(root);
8331        }
8332        root_dropped = true;
8333out_end_trans:
8334        ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
8335        if (ret)
8336                printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
8337                                   "running qgroup updates "
8338                                   "during snapshot delete. "
8339                                   "Quota is out of sync, "
8340                                   "rescan required.\n", ret);
8341
8342        btrfs_end_transaction_throttle(trans, tree_root);
8343out_free:
8344        kfree(wc);
8345        btrfs_free_path(path);
8346out:
8347        /*
8348         * If we had to stop dropping the snapshot for whatever reason, make
8349         * sure to add it back to the dead root list so that we keep trying
8350         * to do the work later.  This also cleans up roots if we don't have
8351         * them in the radix (like when we recover after a power failure or
8352         * unmount) so we don't leak memory.
8353         */
8354        if (!for_reloc && root_dropped == false)
8355                btrfs_add_dead_root(root);
8356        if (err && err != -EAGAIN)
8357                btrfs_std_error(root->fs_info, err);
8358        return err;
8359}
8360
8361/*
8362 * drop subtree rooted at tree block 'node'.
8363 *
8364 * NOTE: this function will unlock and release tree block 'node'.
8365 * It is only used by the relocation code.
8366 */
8367int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
8368                        struct btrfs_root *root,
8369                        struct extent_buffer *node,
8370                        struct extent_buffer *parent)
8371{
8372        struct btrfs_path *path;
8373        struct walk_control *wc;
8374        int level;
8375        int parent_level;
8376        int ret = 0;
8377        int wret;
8378
8379        BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
8380
8381        path = btrfs_alloc_path();
8382        if (!path)
8383                return -ENOMEM;
8384
8385        wc = kzalloc(sizeof(*wc), GFP_NOFS);
8386        if (!wc) {
8387                btrfs_free_path(path);
8388                return -ENOMEM;
8389        }
8390
8391        btrfs_assert_tree_locked(parent);
8392        parent_level = btrfs_header_level(parent);
8393        extent_buffer_get(parent);
8394        path->nodes[parent_level] = parent;
8395        path->slots[parent_level] = btrfs_header_nritems(parent);
8396
8397        btrfs_assert_tree_locked(node);
8398        level = btrfs_header_level(node);
8399        path->nodes[level] = node;
8400        path->slots[level] = 0;
8401        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8402
8403        wc->refs[parent_level] = 1;
8404        wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8405        wc->level = level;
8406        wc->shared_level = -1;
8407        wc->stage = DROP_REFERENCE;
8408        wc->update_ref = 0;
8409        wc->keep_locks = 1;
8410        wc->for_reloc = 1;
8411        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
8412
8413        while (1) {
8414                wret = walk_down_tree(trans, root, path, wc);
8415                if (wret < 0) {
8416                        ret = wret;
8417                        break;
8418                }
8419
8420                wret = walk_up_tree(trans, root, path, wc, parent_level);
8421                if (wret < 0)
8422                        ret = wret;
8423                if (wret != 0)
8424                        break;
8425        }
8426
8427        kfree(wc);
8428        btrfs_free_path(path);
8429        return ret;
8430}
8431
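    /*
     * Work out the profile a chunk of the given type should be converted to
     * when its current profile can no longer be satisfied by the available
     * read-write devices (e.g. raid1 -> dup once only one device is left),
     * or return the restripe target if a conversion balance is in progress.
     */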
8432static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
8433{
8434        u64 num_devices;
8435        u64 stripped;
8436
8437        /*
8438         * if restripe for this chunk_type is on, pick the target profile
8439         * and return, otherwise do the usual balance
8440         */
8441        stripped = get_restripe_target(root->fs_info, flags);
8442        if (stripped)
8443                return extended_to_chunk(stripped);
8444
8445        num_devices = root->fs_info->fs_devices->rw_devices;
8446
8447        stripped = BTRFS_BLOCK_GROUP_RAID0 |
8448                BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
8449                BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
8450
8451        if (num_devices == 1) {
8452                stripped |= BTRFS_BLOCK_GROUP_DUP;
8453                stripped = flags & ~stripped;
8454
8455                /* turn raid0 into single device chunks */
8456                if (flags & BTRFS_BLOCK_GROUP_RAID0)
8457                        return stripped;
8458
8459                /* turn mirroring into duplication */
8460                if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
8461                             BTRFS_BLOCK_GROUP_RAID10))
8462                        return stripped | BTRFS_BLOCK_GROUP_DUP;
8463        } else {
8464                /* they already had raid on here, just return */
8465                if (flags & stripped)
8466                        return flags;
8467
8468                stripped |= BTRFS_BLOCK_GROUP_DUP;
8469                stripped = flags & ~stripped;
8470
8471                /* switch duplicated blocks with raid1 */
8472                if (flags & BTRFS_BLOCK_GROUP_DUP)
8473                        return stripped | BTRFS_BLOCK_GROUP_RAID1;
8474
8475                /* this is drive concat, leave it alone */
8476        }
8477
8478        return flags;
8479}
8480
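    /*
     * Try to mark a block group read-only.  This only succeeds if the
     * space_info can still satisfy every existing reservation once the
     * group's unused bytes are moved over to bytes_readonly, roughly:
     *
     *	used + reserved + pinned + may_use + readonly +
     *		unused_in_group + min_allocable <= total_bytes
     *
     * Returns 0 on success and -ENOSPC otherwise.
     */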
8481static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
8482{
8483        struct btrfs_space_info *sinfo = cache->space_info;
8484        u64 num_bytes;
8485        u64 min_allocable_bytes;
8486        int ret = -ENOSPC;
8487
8489        /*
8490         * We need to keep some metadata and system chunk space available
8491         * for allocating chunks in some corner cases, unless we are forced
8492         * to set the block group readonly.
8493         */
8494        if ((sinfo->flags &
8495             (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
8496            !force)
8497                min_allocable_bytes = 1 * 1024 * 1024;
8498        else
8499                min_allocable_bytes = 0;
8500
8501        spin_lock(&sinfo->lock);
8502        spin_lock(&cache->lock);
8503
8504        if (cache->ro) {
8505                ret = 0;
8506                goto out;
8507        }
8508
8509        num_bytes = cache->key.offset - cache->reserved - cache->pinned -
8510                    cache->bytes_super - btrfs_block_group_used(&cache->item);
8511
8512        if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
8513            sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
8514            min_allocable_bytes <= sinfo->total_bytes) {
8515                sinfo->bytes_readonly += num_bytes;
8516                cache->ro = 1;
8517                list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
8518                ret = 0;
8519        }
8520out:
8521        spin_unlock(&cache->lock);
8522        spin_unlock(&sinfo->lock);
8523        return ret;
8524}
8525
8526int btrfs_set_block_group_ro(struct btrfs_root *root,
8527                             struct btrfs_block_group_cache *cache)
8528
8529{
8530        struct btrfs_trans_handle *trans;
8531        u64 alloc_flags;
8532        int ret;
8533
8534        BUG_ON(cache->ro);
8535
8536        trans = btrfs_join_transaction(root);
8537        if (IS_ERR(trans))
8538                return PTR_ERR(trans);
8539
8540        ret = set_block_group_ro(cache, 0);
8541        if (!ret)
8542                goto out;
8543        alloc_flags = get_alloc_profile(root, cache->space_info->flags);
8544        ret = do_chunk_alloc(trans, root, alloc_flags,
8545                             CHUNK_ALLOC_FORCE);
8546        if (ret < 0)
8547                goto out;
8548        ret = set_block_group_ro(cache, 0);
8549out:
8550        if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
8551                alloc_flags = update_block_group_flags(root, cache->flags);
8552                check_system_chunk(trans, root, alloc_flags);
8553        }
8554
8555        btrfs_end_transaction(trans, root);
8556        return ret;
8557}
8558
8559int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
8560                            struct btrfs_root *root, u64 type)
8561{
8562        u64 alloc_flags = get_alloc_profile(root, type);
8563        return do_chunk_alloc(trans, root, alloc_flags,
8564                              CHUNK_ALLOC_FORCE);
8565}
8566
8567/*
8568 * helper to account the unused space of all the readonly block groups in
8569 * the space_info. takes mirrors into account.
8570 */
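    /*
     * For example (illustrative numbers only): a 1GiB RAID1 block group
     * with 200MiB used contributes (1GiB - 200MiB) * 2 bytes here, since
     * every byte of that group occupies raw space on two devices.
     */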
8571u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
8572{
8573        struct btrfs_block_group_cache *block_group;
8574        u64 free_bytes = 0;
8575        int factor;
8576
8577        /* It's df, we don't care if it's racy */
8578        if (list_empty(&sinfo->ro_bgs))
8579                return 0;
8580
8581        spin_lock(&sinfo->lock);
8582        list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
8583                spin_lock(&block_group->lock);
8584
8585                if (!block_group->ro) {
8586                        spin_unlock(&block_group->lock);
8587                        continue;
8588                }
8589
8590                if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
8591                                          BTRFS_BLOCK_GROUP_RAID10 |
8592                                          BTRFS_BLOCK_GROUP_DUP))
8593                        factor = 2;
8594                else
8595                        factor = 1;
8596
8597                free_bytes += (block_group->key.offset -
8598                               btrfs_block_group_used(&block_group->item)) *
8599                               factor;
8600
8601                spin_unlock(&block_group->lock);
8602        }
8603        spin_unlock(&sinfo->lock);
8604
8605        return free_bytes;
8606}
8607
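    /*
     * Undo set_block_group_ro(): give the group's unused bytes back to the
     * space_info and take the group off the read-only list.
     */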
8608void btrfs_set_block_group_rw(struct btrfs_root *root,
8609                              struct btrfs_block_group_cache *cache)
8610{
8611        struct btrfs_space_info *sinfo = cache->space_info;
8612        u64 num_bytes;
8613
8614        BUG_ON(!cache->ro);
8615
8616        spin_lock(&sinfo->lock);
8617        spin_lock(&cache->lock);
8618        num_bytes = cache->key.offset - cache->reserved - cache->pinned -
8619                    cache->bytes_super - btrfs_block_group_used(&cache->item);
8620        sinfo->bytes_readonly -= num_bytes;
8621        cache->ro = 0;
8622        list_del_init(&cache->ro_list);
8623        spin_unlock(&cache->lock);
8624        spin_unlock(&sinfo->lock);
8625}
8626
8627/*
8628 * checks to see if it's even possible to relocate this block group.
8629 *
8630 * @return - -1 if it's not a good idea to relocate this block group, 0 if
8631 * it's ok to go ahead and try.
8632 */
8633int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
8634{
8635        struct btrfs_block_group_cache *block_group;
8636        struct btrfs_space_info *space_info;
8637        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
8638        struct btrfs_device *device;
8639        struct btrfs_trans_handle *trans;
8640        u64 min_free;
8641        u64 dev_min = 1;
8642        u64 dev_nr = 0;
8643        u64 target;
8644        int index;
8645        int full = 0;
8646        int ret = 0;
8647
8648        block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
8649
8650        /* odd, couldn't find the block group, leave it alone */
8651        if (!block_group)
8652                return -1;
8653
8654        min_free = btrfs_block_group_used(&block_group->item);
8655
8656        /* no bytes used, we're good */
8657        if (!min_free)
8658                goto out;
8659
8660        space_info = block_group->space_info;
8661        spin_lock(&space_info->lock);
8662
8663        full = space_info->full;
8664
8665        /*
8666         * if this is the last block group we have in this space, we can't
8667         * relocate it unless we're able to allocate a new chunk below.
8668         *
8669         * Otherwise, we need to make sure we have room in the space to handle
8670         * all of the extents from this block group.  If we can, we're good.
8671         */
8672        if ((space_info->total_bytes != block_group->key.offset) &&
8673            (space_info->bytes_used + space_info->bytes_reserved +
8674             space_info->bytes_pinned + space_info->bytes_readonly +
8675             min_free < space_info->total_bytes)) {
8676                spin_unlock(&space_info->lock);
8677                goto out;
8678        }
8679        spin_unlock(&space_info->lock);
8680
8681        /*
8682         * ok we don't have enough space, but maybe we have free space on our
8683         * devices to allocate new chunks for relocation, so loop through our
8684         * alloc devices and guess if we have enough space.  if this block
8685         * group is going to be restriped, run checks against the target
8686         * profile instead of the current one.
8687         */
8688        ret = -1;
8689
8690        /*
8691         * index:
8692         *      0: raid10
8693         *      1: raid1
8694         *      2: dup
8695         *      3: raid0
8696         *      4: single
8697         */
8698        target = get_restripe_target(root->fs_info, block_group->flags);
8699        if (target) {
8700                index = __get_raid_index(extended_to_chunk(target));
8701        } else {
8702                /*
8703                 * this is just a balance, so if we were marked as full
8704                 * we know there is no space for a new chunk
8705                 */
8706                if (full)
8707                        goto out;
8708
8709                index = get_block_group_index(block_group);
8710        }
8711
8712        if (index == BTRFS_RAID_RAID10) {
8713                dev_min = 4;
8714                /* Divide by 2 */
8715                min_free >>= 1;
8716        } else if (index == BTRFS_RAID_RAID1) {
8717                dev_min = 2;
8718        } else if (index == BTRFS_RAID_DUP) {
8719                /* Multiply by 2 */
8720                min_free <<= 1;
8721        } else if (index == BTRFS_RAID_RAID0) {
8722                dev_min = fs_devices->rw_devices;
8723                do_div(min_free, dev_min);
8724        }
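            /*
             * Illustrative numbers: relocating a raid1 group with 600MiB used
             * needs two devices that can each hold a 600MiB dev extent, while
             * a raid10 group of the same size needs four devices with 300MiB
             * of free space each, since the data is striped across the
             * mirrored pairs.
             */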
8725
8726        /* We need to do this so that we can look at pending chunks */
8727        trans = btrfs_join_transaction(root);
8728        if (IS_ERR(trans)) {
8729                ret = PTR_ERR(trans);
8730                goto out;
8731        }
8732
8733        mutex_lock(&root->fs_info->chunk_mutex);
8734        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
8735                u64 dev_offset;
8736
8737                /*
8738                 * check to make sure we can actually find a chunk with enough
8739                 * space to fit our block group in.
8740                 */
8741                if (device->total_bytes > device->bytes_used + min_free &&
8742                    !device->is_tgtdev_for_dev_replace) {
8743                        ret = find_free_dev_extent(trans, device, min_free,
8744                                                   &dev_offset, NULL);
8745                        if (!ret)
8746                                dev_nr++;
8747
8748                        if (dev_nr >= dev_min)
8749                                break;
8750
8751                        ret = -1;
8752                }
8753        }
8754        mutex_unlock(&root->fs_info->chunk_mutex);
8755        btrfs_end_transaction(trans, root);
8756out:
8757        btrfs_put_block_group(block_group);
8758        return ret;
8759}
8760
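    /*
     * Position @path at the first BTRFS_BLOCK_GROUP_ITEM_KEY item whose
     * objectid is >= key->objectid.  Returns 0 when one is found, a
     * positive value when the extent tree is exhausted, or a negative
     * error.
     */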
8761static int find_first_block_group(struct btrfs_root *root,
8762                struct btrfs_path *path, struct btrfs_key *key)
8763{
8764        int ret = 0;
8765        struct btrfs_key found_key;
8766        struct extent_buffer *leaf;
8767        int slot;
8768
8769        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
8770        if (ret < 0)
8771                goto out;
8772
8773        while (1) {
8774                slot = path->slots[0];
8775                leaf = path->nodes[0];
8776                if (slot >= btrfs_header_nritems(leaf)) {
8777                        ret = btrfs_next_leaf(root, path);
8778                        if (ret == 0)
8779                                continue;
8780                        if (ret < 0)
8781                                goto out;
8782                        break;
8783                }
8784                btrfs_item_key_to_cpu(leaf, &found_key, slot);
8785
8786                if (found_key.objectid >= key->objectid &&
8787                    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
8788                        ret = 0;
8789                        goto out;
8790                }
8791                path->slots[0]++;
8792        }
8793out:
8794        return ret;
8795}
8796
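    /*
     * Drop the inode reference each block group may hold on its free space
     * cache inode (iref), typically during unmount, so those inodes can be
     * evicted.
     */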
8797void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
8798{
8799        struct btrfs_block_group_cache *block_group;
8800        u64 last = 0;
8801
8802        while (1) {
8803                struct inode *inode;
8804
8805                block_group = btrfs_lookup_first_block_group(info, last);
8806                while (block_group) {
8807                        spin_lock(&block_group->lock);
8808                        if (block_group->iref)
8809                                break;
8810                        spin_unlock(&block_group->lock);
8811                        block_group = next_block_group(info->tree_root,
8812                                                       block_group);
8813                }
8814                if (!block_group) {
8815                        if (last == 0)
8816                                break;
8817                        last = 0;
8818                        continue;
8819                }
8820
8821                inode = block_group->inode;
8822                block_group->iref = 0;
8823                block_group->inode = NULL;
8824                spin_unlock(&block_group->lock);
8825                iput(inode);
8826                last = block_group->key.objectid + block_group->key.offset;
8827                btrfs_put_block_group(block_group);
8828        }
8829}
8830
8831int btrfs_free_block_groups(struct btrfs_fs_info *info)
8832{
8833        struct btrfs_block_group_cache *block_group;
8834        struct btrfs_space_info *space_info;
8835        struct btrfs_caching_control *caching_ctl;
8836        struct rb_node *n;
8837
8838        down_write(&info->commit_root_sem);
8839        while (!list_empty(&info->caching_block_groups)) {
8840                caching_ctl = list_entry(info->caching_block_groups.next,
8841                                         struct btrfs_caching_control, list);
8842                list_del(&caching_ctl->list);
8843                put_caching_control(caching_ctl);
8844        }
8845        up_write(&info->commit_root_sem);
8846
8847        spin_lock(&info->unused_bgs_lock);
8848        while (!list_empty(&info->unused_bgs)) {
8849                block_group = list_first_entry(&info->unused_bgs,
8850                                               struct btrfs_block_group_cache,
8851                                               bg_list);
8852                list_del_init(&block_group->bg_list);
8853                btrfs_put_block_group(block_group);
8854        }
8855        spin_unlock(&info->unused_bgs_lock);
8856
8857        spin_lock(&info->block_group_cache_lock);
8858        while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
8859                block_group = rb_entry(n, struct btrfs_block_group_cache,
8860                                       cache_node);
8861                rb_erase(&block_group->cache_node,
8862                         &info->block_group_cache_tree);
8863                RB_CLEAR_NODE(&block_group->cache_node);
8864                spin_unlock(&info->block_group_cache_lock);
8865
8866                down_write(&block_group->space_info->groups_sem);
8867                list_del(&block_group->list);
8868                up_write(&block_group->space_info->groups_sem);
8869
8870                if (block_group->cached == BTRFS_CACHE_STARTED)
8871                        wait_block_group_cache_done(block_group);
8872
8873                /*
8874                 * We haven't cached this block group, which means we could
8875                 * possibly have excluded extents on this block group.
8876                 */
8877                if (block_group->cached == BTRFS_CACHE_NO ||
8878                    block_group->cached == BTRFS_CACHE_ERROR)
8879                        free_excluded_extents(info->extent_root, block_group);
8880
8881                btrfs_remove_free_space_cache(block_group);
8882                btrfs_put_block_group(block_group);
8883
8884                spin_lock(&info->block_group_cache_lock);
8885        }
8886        spin_unlock(&info->block_group_cache_lock);
8887
8888        /* now that all the block groups are freed, go through and
8889         * free all the space_info structs.  This is only called during
8890         * the final stages of unmount, and so we know nobody is
8891         * using them.  We call synchronize_rcu() once before we start,
8892         * just to be on the safe side.
8893         */
8894        synchronize_rcu();
8895
8896        release_global_block_rsv(info);
8897
8898        while (!list_empty(&info->space_info)) {
8899                int i;
8900
8901                space_info = list_entry(info->space_info.next,
8902                                        struct btrfs_space_info,
8903                                        list);
8904                if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
8905                        if (WARN_ON(space_info->bytes_pinned > 0 ||
8906                            space_info->bytes_reserved > 0 ||
8907                            space_info->bytes_may_use > 0)) {
8908                                dump_space_info(space_info, 0, 0);
8909                        }
8910                }
8911                list_del(&space_info->list);
8912                for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
8913                        struct kobject *kobj;
8914                        kobj = space_info->block_group_kobjs[i];
8915                        space_info->block_group_kobjs[i] = NULL;
8916                        if (kobj) {
8917                                kobject_del(kobj);
8918                                kobject_put(kobj);
8919                        }
8920                }
8921                kobject_del(&space_info->kobj);
8922                kobject_put(&space_info->kobj);
8923        }
8924        return 0;
8925}
8926
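    /*
     * Add a block group to its space_info's list for the group's raid
     * index.  The first group of a given index also gets a sysfs kobject;
     * the exact path is a detail of sysfs.c, but it ends up looking like
     * /sys/fs/btrfs/<fsid>/allocation/<type>/<raid level>.
     */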
8927static void __link_block_group(struct btrfs_space_info *space_info,
8928                               struct btrfs_block_group_cache *cache)
8929{
8930        int index = get_block_group_index(cache);
8931        bool first = false;
8932
8933        down_write(&space_info->groups_sem);
8934        if (list_empty(&space_info->block_groups[index]))
8935                first = true;
8936        list_add_tail(&cache->list, &space_info->block_groups[index]);
8937        up_write(&space_info->groups_sem);
8938
8939        if (first) {
8940                struct raid_kobject *rkobj;
8941                int ret;
8942
8943                rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
8944                if (!rkobj)
8945                        goto out_err;
8946                rkobj->raid_type = index;
8947                kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
8948                ret = kobject_add(&rkobj->kobj, &space_info->kobj,
8949                                  "%s", get_raid_name(index));
8950                if (ret) {
8951                        kobject_put(&rkobj->kobj);
8952                        goto out_err;
8953                }
8954                space_info->block_group_kobjs[index] = &rkobj->kobj;
8955        }
8956
8957        return;
8958out_err:
8959        pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
8960}
8961
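    /*
     * Allocate and initialize the in-memory state for a block group
     * starting at @start and spanning @size bytes.  Returns NULL on
     * allocation failure; the caller fills in the on-disk item and flags.
     */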
8962static struct btrfs_block_group_cache *
8963btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
8964{
8965        struct btrfs_block_group_cache *cache;
8966
8967        cache = kzalloc(sizeof(*cache), GFP_NOFS);
8968        if (!cache)
8969                return NULL;
8970
8971        cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
8972                                        GFP_NOFS);
8973        if (!cache->free_space_ctl) {
8974                kfree(cache);
8975                return NULL;
8976        }
8977
8978        cache->key.objectid = start;
8979        cache->key.offset = size;
8980        cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8981
8982        cache->sectorsize = root->sectorsize;
8983        cache->fs_info = root->fs_info;
8984        cache->full_stripe_len = btrfs_full_stripe_len(root,
8985                                               &root->fs_info->mapping_tree,
8986                                               start);
8987        atomic_set(&cache->count, 1);
8988        spin_lock_init(&cache->lock);
8989        init_rwsem(&cache->data_rwsem);
8990        INIT_LIST_HEAD(&cache->list);
8991        INIT_LIST_HEAD(&cache->cluster_list);
8992        INIT_LIST_HEAD(&cache->bg_list);
8993        INIT_LIST_HEAD(&cache->ro_list);
8994        INIT_LIST_HEAD(&cache->dirty_list);
8995        btrfs_init_free_space_ctl(cache);
8996        atomic_set(&cache->trimming, 0);
8997
8998        return cache;
8999}
9000
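    /*
     * Called at mount time: walk every block group item in the extent tree
     * and rebuild the in-memory block group cache, the space_info
     * accounting and the read-only state from them.
     */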
9001int btrfs_read_block_groups(struct btrfs_root *root)
9002{
9003        struct btrfs_path *path;
9004        int ret;
9005        struct btrfs_block_group_cache *cache;
9006        struct btrfs_fs_info *info = root->fs_info;
9007        struct btrfs_space_info *space_info;
9008        struct btrfs_key key;
9009        struct btrfs_key found_key;
9010        struct extent_buffer *leaf;
9011        int need_clear = 0;
9012        u64 cache_gen;
9013
9014        root = info->extent_root;
9015        key.objectid = 0;
9016        key.offset = 0;
9017        key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
9018        path = btrfs_alloc_path();
9019        if (!path)
9020                return -ENOMEM;
9021        path->reada = 1;
9022
9023        cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
9024        if (btrfs_test_opt(root, SPACE_CACHE) &&
9025            btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
9026                need_clear = 1;
9027        if (btrfs_test_opt(root, CLEAR_CACHE))
9028                need_clear = 1;
9029
9030        while (1) {
9031                ret = find_first_block_group(root, path, &key);
9032                if (ret > 0)
9033                        break;
9034                if (ret != 0)
9035                        goto error;
9036
9037                leaf = path->nodes[0];
9038                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
9039
9040                cache = btrfs_create_block_group_cache(root, found_key.objectid,
9041                                                       found_key.offset);
9042                if (!cache) {
9043                        ret = -ENOMEM;
9044                        goto error;
9045                }
9046
9047                if (need_clear) {
9048                        /*
9049                         * When we mount with old space cache, we need to
9050                         * set BTRFS_DC_CLEAR and set dirty flag.
9051                         *
9052                         * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
9053                         *    truncate the old free space cache inode and
9054                         *    set up a new one.
9055                         * b) Setting 'dirty flag' makes sure that we flush
9056                         *    the new space cache info onto disk.
9057                         */
9058                        if (btrfs_test_opt(root, SPACE_CACHE))
9059                                cache->disk_cache_state = BTRFS_DC_CLEAR;
9060                }
9061
9062                read_extent_buffer(leaf, &cache->item,
9063                                   btrfs_item_ptr_offset(leaf, path->slots[0]),
9064                                   sizeof(cache->item));
9065                cache->flags = btrfs_block_group_flags(&cache->item);
9066
9067                key.objectid = found_key.objectid + found_key.offset;
9068                btrfs_release_path(path);
9069
9070                /*
9071                 * We need to exclude the super stripes now so that the space
9072                 * info has super bytes accounted for, otherwise we'll think
9073                 * we have more space than we actually do.
9074                 */
9075                ret = exclude_super_stripes(root, cache);
9076                if (ret) {
9077                        /*
9078                         * We may have excluded something, so call this just in
9079                         * case.
9080                         */
9081                        free_excluded_extents(root, cache);
9082                        btrfs_put_block_group(cache);
9083                        goto error;
9084                }
9085
9086                /*
9087                 * check for two cases, either we are full, and therefore
9088                 * don't need to bother with the caching work since we won't
9089                 * find any space, or we are empty, and we can just add all
9090                 * the space in and be done with it.  This saves us a lot of
9091                 * time, particularly in the full case.
9092                 */
9093                if (found_key.offset == btrfs_block_group_used(&cache->item)) {
9094                        cache->last_byte_to_unpin = (u64)-1;
9095                        cache->cached = BTRFS_CACHE_FINISHED;
9096                        free_excluded_extents(root, cache);
9097                } else if (btrfs_block_group_used(&cache->item) == 0) {
9098                        cache->last_byte_to_unpin = (u64)-1;
9099                        cache->cached = BTRFS_CACHE_FINISHED;
9100                        add_new_free_space(cache, root->fs_info,
9101                                           found_key.objectid,
9102                                           found_key.objectid +
9103                                           found_key.offset);
9104                        free_excluded_extents(root, cache);
9105                }
9106
9107                ret = btrfs_add_block_group_cache(root->fs_info, cache);
9108                if (ret) {
9109                        btrfs_remove_free_space_cache(cache);
9110                        btrfs_put_block_group(cache);
9111                        goto error;
9112                }
9113
9114                ret = update_space_info(info, cache->flags, found_key.offset,
9115                                        btrfs_block_group_used(&cache->item),
9116                                        &space_info);
9117                if (ret) {
9118                        btrfs_remove_free_space_cache(cache);
9119                        spin_lock(&info->block_group_cache_lock);
9120                        rb_erase(&cache->cache_node,
9121                                 &info->block_group_cache_tree);
9122                        RB_CLEAR_NODE(&cache->cache_node);
9123                        spin_unlock(&info->block_group_cache_lock);
9124                        btrfs_put_block_group(cache);
9125                        goto error;
9126                }
9127
9128                cache->space_info = space_info;
9129                spin_lock(&cache->space_info->lock);
9130                cache->space_info->bytes_readonly += cache->bytes_super;
9131                spin_unlock(&cache->space_info->lock);
9132
9133                __link_block_group(space_info, cache);
9134
9135                set_avail_alloc_bits(root->fs_info, cache->flags);
9136                if (btrfs_chunk_readonly(root, cache->key.objectid)) {
9137                        set_block_group_ro(cache, 1);
9138                } else if (btrfs_block_group_used(&cache->item) == 0) {
9139                        spin_lock(&info->unused_bgs_lock);
9140                        /* Should always be true but just in case. */
9141                        if (list_empty(&cache->bg_list)) {
9142                                btrfs_get_block_group(cache);
9143                                list_add_tail(&cache->bg_list,
9144                                              &info->unused_bgs);
9145                        }
9146                        spin_unlock(&info->unused_bgs_lock);
9147                }
9148        }
9149
9150        list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
9151                if (!(get_alloc_profile(root, space_info->flags) &
9152                      (BTRFS_BLOCK_GROUP_RAID10 |
9153                       BTRFS_BLOCK_GROUP_RAID1 |
9154                       BTRFS_BLOCK_GROUP_RAID5 |
9155                       BTRFS_BLOCK_GROUP_RAID6 |
9156                       BTRFS_BLOCK_GROUP_DUP)))
9157                        continue;
9158                /*
9159                 * avoid allocating from un-mirrored block groups if there are
9160                 * mirrored block groups.
9161                 */
9162                list_for_each_entry(cache,
9163                                &space_info->block_groups[BTRFS_RAID_RAID0],
9164                                list)
9165                        set_block_group_ro(cache, 1);
9166                list_for_each_entry(cache,
9167                                &space_info->block_groups[BTRFS_RAID_SINGLE],
9168                                list)
9169                        set_block_group_ro(cache, 1);
9170        }
9171
9172        init_global_block_rsv(info);
9173        ret = 0;
9174error:
9175        btrfs_free_path(path);
9176        return ret;
9177}
9178
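    /*
     * Insert the block group items for every block group created earlier
     * in this transaction (trans->new_bgs) into the extent tree and finish
     * their chunk allocation.
     */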
9179void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
9180                                       struct btrfs_root *root)
9181{
9182        struct btrfs_block_group_cache *block_group, *tmp;
9183        struct btrfs_root *extent_root = root->fs_info->extent_root;
9184        struct btrfs_block_group_item item;
9185        struct btrfs_key key;
9186        int ret = 0;
9187
9188        list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
9189                if (ret)
9190                        goto next;
9191
9192                spin_lock(&block_group->lock);
9193                memcpy(&item, &block_group->item, sizeof(item));
9194                memcpy(&key, &block_group->key, sizeof(key));
9195                spin_unlock(&block_group->lock);
9196
9197                ret = btrfs_insert_item(trans, extent_root, &key, &item,
9198                                        sizeof(item));
9199                if (ret)
9200                        btrfs_abort_transaction(trans, extent_root, ret);
9201                ret = btrfs_finish_chunk_alloc(trans, extent_root,
9202                                               key.objectid, key.offset);
9203                if (ret)
9204                        btrfs_abort_transaction(trans, extent_root, ret);
9205next:
9206                list_del_init(&block_group->bg_list);
9207        }
9208}
9209
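    /*
     * Create the in-memory state for a block group backing a freshly
     * allocated chunk and queue it on trans->new_bgs; the corresponding
     * items reach the extent tree later via
     * btrfs_create_pending_block_groups().
     */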
9210int btrfs_make_block_group(struct btrfs_trans_handle *trans,
9211                           struct btrfs_root *root, u64 bytes_used,
9212                           u64 type, u64 chunk_objectid, u64 chunk_offset,
9213                           u64 size)
9214{
9215        int ret;
9216        struct btrfs_root *extent_root;
9217        struct btrfs_block_group_cache *cache;
9218
9219        extent_root = root->fs_info->extent_root;
9220
9221        btrfs_set_log_full_commit(root->fs_info, trans);
9222
9223        cache = btrfs_create_block_group_cache(root, chunk_offset, size);
9224        if (!cache)
9225                return -ENOMEM;
9226
9227        btrfs_set_block_group_used(&cache->item, bytes_used);
9228        btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
9229        btrfs_set_block_group_flags(&cache->item, type);
9230
9231        cache->flags = type;
9232        cache->last_byte_to_unpin = (u64)-1;
9233        cache->cached = BTRFS_CACHE_FINISHED;
9234        ret = exclude_super_stripes(root, cache);
9235        if (ret) {
9236                /*
9237                 * We may have excluded something, so call this just in
9238                 * case.
9239                 */
9240                free_excluded_extents(root, cache);
9241                btrfs_put_block_group(cache);
9242                return ret;
9243        }
9244
9245        add_new_free_space(cache, root->fs_info, chunk_offset,
9246                           chunk_offset + size);
9247
9248        free_excluded_extents(root, cache);
9249
9250        ret = btrfs_add_block_group_cache(root->fs_info, cache);
9251        if (ret) {
9252                btrfs_remove_free_space_cache(cache);
9253                btrfs_put_block_group(cache);
9254                return ret;
9255        }
9256
9257        ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
9258                                &cache->space_info);
9259        if (ret) {
9260                btrfs_remove_free_space_cache(cache);
9261                spin_lock(&root->fs_info->block_group_cache_lock);
9262                rb_erase(&cache->cache_node,
9263                         &root->fs_info->block_group_cache_tree);
9264                RB_CLEAR_NODE(&cache->cache_node);
9265                spin_unlock(&root->fs_info->block_group_cache_lock);
9266                btrfs_put_block_group(cache);
9267                return ret;
9268        }
9269        update_global_block_rsv(root->fs_info);
9270
9271        spin_lock(&cache->space_info->lock);
9272        cache->space_info->bytes_readonly += cache->bytes_super;
9273        spin_unlock(&cache->space_info->lock);
9274
9275        __link_block_group(cache->space_info, cache);
9276
9277        list_add_tail(&cache->bg_list, &trans->new_bgs);
9278
9279        set_avail_alloc_bits(extent_root->fs_info, type);
9280
9281        return 0;
9282}
9283
9284static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
9285{
9286        u64 extra_flags = chunk_to_extended(flags) &
9287                                BTRFS_EXTENDED_PROFILE_MASK;
9288
9289        write_seqlock(&fs_info->profiles_lock);
9290        if (flags & BTRFS_BLOCK_GROUP_DATA)
9291                fs_info->avail_data_alloc_bits &= ~extra_flags;
9292        if (flags & BTRFS_BLOCK_GROUP_METADATA)
9293                fs_info->avail_metadata_alloc_bits &= ~extra_flags;
9294        if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
9295                fs_info->avail_system_alloc_bits &= ~extra_flags;
9296        write_sequnlock(&fs_info->profiles_lock);
9297}
9298
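    /*
     * Remove a read-only block group that is no longer in use: drop its
     * free space cache inode, delete the cache item from the tree root,
     * unlink the group from every list it is on and, unless a trim is
     * still running against it, remove its chunk mapping as well.
     */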
9299int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9300                             struct btrfs_root *root, u64 group_start,
9301                             struct extent_map *em)
9302{
9303        struct btrfs_path *path;
9304        struct btrfs_block_group_cache *block_group;
9305        struct btrfs_free_cluster *cluster;
9306        struct btrfs_root *tree_root = root->fs_info->tree_root;
9307        struct btrfs_key key;
9308        struct inode *inode;
9309        struct kobject *kobj = NULL;
9310        int ret;
9311        int index;
9312        int factor;
9313        struct btrfs_caching_control *caching_ctl = NULL;
9314        bool remove_em;
9315
9316        root = root->fs_info->extent_root;
9317
9318        block_group = btrfs_lookup_block_group(root->fs_info, group_start);
9319        BUG_ON(!block_group);
9320        BUG_ON(!block_group->ro);
9321
9322        /*
9323         * Free the reserved super bytes from this block group before
9324         * removing it.
9325         */
9326        free_excluded_extents(root, block_group);
9327
9328        memcpy(&key, &block_group->key, sizeof(key));
9329        index = get_block_group_index(block_group);
9330        if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
9331                                  BTRFS_BLOCK_GROUP_RAID1 |
9332                                  BTRFS_BLOCK_GROUP_RAID10))
9333                factor = 2;
9334        else
9335                factor = 1;
9336
9337        /* make sure this block group isn't part of an allocation cluster */
9338        cluster = &root->fs_info->data_alloc_cluster;
9339        spin_lock(&cluster->refill_lock);
9340        btrfs_return_cluster_to_free_space(block_group, cluster);
9341        spin_unlock(&cluster->refill_lock);
9342
9343        /*
9344         * make sure this block group isn't part of a metadata
9345         * allocation cluster
9346         */
9347        cluster = &root->fs_info->meta_alloc_cluster;
9348        spin_lock(&cluster->refill_lock);
9349        btrfs_return_cluster_to_free_space(block_group, cluster);
9350        spin_unlock(&cluster->refill_lock);
9351
9352        path = btrfs_alloc_path();
9353        if (!path) {
9354                ret = -ENOMEM;
9355                goto out;
9356        }
9357
9358        inode = lookup_free_space_inode(tree_root, block_group, path);
9359        if (!IS_ERR(inode)) {
9360                ret = btrfs_orphan_add(trans, inode);
9361                if (ret) {
9362                        btrfs_add_delayed_iput(inode);
9363                        goto out;
9364                }
9365                clear_nlink(inode);
9366                /* One for the block groups ref */
9367                spin_lock(&block_group->lock);
9368                if (block_group->iref) {
9369                        block_group->iref = 0;
9370                        block_group->inode = NULL;
9371                        spin_unlock(&block_group->lock);
9372                        iput(inode);
9373                } else {
9374                        spin_unlock(&block_group->lock);
9375                }
9376                /* One for our lookup ref */
9377                btrfs_add_delayed_iput(inode);
9378        }
9379
9380        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
9381        key.offset = block_group->key.objectid;
9382        key.type = 0;
9383
9384        ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
9385        if (ret < 0)
9386                goto out;
9387        if (ret > 0)
9388                btrfs_release_path(path);
9389        if (ret == 0) {
9390                ret = btrfs_del_item(trans, tree_root, path);
9391                if (ret)
9392                        goto out;
9393                btrfs_release_path(path);
9394        }
9395
9396        spin_lock(&root->fs_info->block_group_cache_lock);
9397        rb_erase(&block_group->cache_node,
9398                 &root->fs_info->block_group_cache_tree);
9399        RB_CLEAR_NODE(&block_group->cache_node);
9400
9401        if (root->fs_info->first_logical_byte == block_group->key.objectid)
9402                root->fs_info->first_logical_byte = (u64)-1;
9403        spin_unlock(&root->fs_info->block_group_cache_lock);
9404
9405        down_write(&block_group->space_info->groups_sem);
9406        /*
9407         * we must use list_del_init so people can check to see if they
9408         * are still on the list after taking the semaphore
9409         */
9410        list_del_init(&block_group->list);
9411        if (list_empty(&block_group->space_info->block_groups[index])) {
9412                kobj = block_group->space_info->block_group_kobjs[index];
9413                block_group->space_info->block_group_kobjs[index] = NULL;
9414                clear_avail_alloc_bits(root->fs_info, block_group->flags);
9415        }
9416        up_write(&block_group->space_info->groups_sem);
9417        if (kobj) {
9418                kobject_del(kobj);
9419                kobject_put(kobj);
9420        }
9421
9422        if (block_group->has_caching_ctl)
9423                caching_ctl = get_caching_control(block_group);
9424        if (block_group->cached == BTRFS_CACHE_STARTED)
9425                wait_block_group_cache_done(block_group);
9426        if (block_group->has_caching_ctl) {
9427                down_write(&root->fs_info->commit_root_sem);
9428                if (!caching_ctl) {
9429                        struct btrfs_caching_control *ctl;
9430
9431                        list_for_each_entry(ctl,
9432                                    &root->fs_info->caching_block_groups, list)
9433                                if (ctl->block_group == block_group) {
9434                                        caching_ctl = ctl;
9435                                        atomic_inc(&caching_ctl->count);
9436                                        break;
9437                                }
9438                }
9439                if (caching_ctl)
9440                        list_del_init(&caching_ctl->list);
9441                up_write(&root->fs_info->commit_root_sem);
9442                if (caching_ctl) {
9443                        /* Once for the caching bgs list and once for us. */
9444                        put_caching_control(caching_ctl);
9445                        put_caching_control(caching_ctl);
9446                }
9447        }

        spin_lock(&trans->transaction->dirty_bgs_lock);
        if (!list_empty(&block_group->dirty_list)) {
                list_del_init(&block_group->dirty_list);
                btrfs_put_block_group(block_group);
        }
        spin_unlock(&trans->transaction->dirty_bgs_lock);

        btrfs_remove_free_space_cache(block_group);

        spin_lock(&block_group->space_info->lock);
        list_del_init(&block_group->ro_list);
        block_group->space_info->total_bytes -= block_group->key.offset;
        block_group->space_info->bytes_readonly -= block_group->key.offset;
        block_group->space_info->disk_total -= block_group->key.offset * factor;
        spin_unlock(&block_group->space_info->lock);

        memcpy(&key, &block_group->key, sizeof(key));

        lock_chunks(root);
        if (!list_empty(&em->list)) {
                /* We're in the transaction->pending_chunks list. */
                free_extent_map(em);
        }
        spin_lock(&block_group->lock);
        block_group->removed = 1;
        /*
         * At this point trimming can't start on this block group, because we
         * removed the block group from fs_info->block_group_cache_tree, so
         * no one can find it anymore.  Even if someone already got this
         * block group before we removed it from the rbtree, they have already
         * incremented block_group->trimming - and if they didn't, they won't
         * find any free space entries because we already removed them all
         * when we called btrfs_remove_free_space_cache().
         *
         * And we must not remove the extent map from fs_info->mapping_tree,
         * to prevent the same logical address range and physical device space
         * ranges from being reused for a new block group. This is because our
         * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
         * completely transactionless, so while it is trimming a range the
         * currently running transaction might finish and a new one start,
         * allowing for new block groups to be created that can reuse the same
         * physical device locations unless we take this special care.
         */
        remove_em = (atomic_read(&block_group->trimming) == 0);
        /*
         * Make sure a trimmer task always sees the em in the pinned_chunks list
         * if it sees block_group->removed == 1 (needs to lock block_group->lock
         * before checking block_group->removed).
         */
        if (!remove_em) {
                /*
                 * Our em might be in trans->transaction->pending_chunks which
                 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
                 * and so is the fs_info->pinned_chunks list.
                 *
                 * So at this point we must be holding the chunk_mutex to avoid
                 * any races with chunk allocation (more specifically at
                 * volumes.c:contains_pending_extent()), to ensure it always
                 * sees the em, either in the pending_chunks list or in the
                 * pinned_chunks list.
                 */
                list_move_tail(&em->list, &root->fs_info->pinned_chunks);
        }
        spin_unlock(&block_group->lock);

        if (remove_em) {
                struct extent_map_tree *em_tree;

                em_tree = &root->fs_info->mapping_tree.map_tree;
                write_lock(&em_tree->lock);
                /*
                 * The em might be in the pending_chunks list, so make sure the
                 * chunk mutex is locked, since remove_extent_mapping() will
                 * delete us from that list.
                 */
                remove_extent_mapping(em_tree, em);
                write_unlock(&em_tree->lock);
                /* once for the tree */
                free_extent_map(em);
        }

        unlock_chunks(root);

        /*
         * Once for the reference we took when looking up the block group at
         * the start of this function, and once for the reference held since
         * the block group was created and inserted in the rbtree.
         */
        btrfs_put_block_group(block_group);
        btrfs_put_block_group(block_group);

        /*
         * Finally, delete the block group item from the extent tree.  A
         * return value > 0 means the item was unexpectedly missing, which we
         * treat as corruption.
         */
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0)
                ret = -EIO;
        if (ret < 0)
                goto out;

        ret = btrfs_del_item(trans, root, path);
out:
        btrfs_free_path(path);
        return ret;
}
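
/*
 * Illustrative sketch (standalone userspace program, not kernel code): the
 * teardown above ends with two btrfs_put_block_group() calls because the
 * block group carries one reference for living in the rbtree and the caller
 * took another when looking it up.  The program below shows that discipline
 * with C11 atomics; every name in it (bg_demo, bg_demo_get, ...) is
 * hypothetical and exists only for this illustration.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct bg_demo {
        atomic_int refs;                /* starts at 1: the "registry" ref */
};

static struct bg_demo *bg_demo_new(void)
{
        struct bg_demo *bg = malloc(sizeof(*bg));

        atomic_init(&bg->refs, 1);
        return bg;
}

static void bg_demo_get(struct bg_demo *bg)
{
        atomic_fetch_add(&bg->refs, 1);
}

static void bg_demo_put(struct bg_demo *bg)
{
        /* Free on the 1 -> 0 transition, like btrfs_put_block_group(). */
        if (atomic_fetch_sub(&bg->refs, 1) == 1) {
                printf("last reference dropped, freeing\n");
                free(bg);
        }
}

int main(void)
{
        struct bg_demo *bg = bg_demo_new();     /* registry reference */

        bg_demo_get(bg);                        /* lookup reference */
        /* ...erase from the registry, then drop both references: */
        bg_demo_put(bg);                        /* once for the registry */
        bg_demo_put(bg);                        /* once for the lookup */
        return 0;
}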

/*
 * Process the unused_bgs list and remove any that don't have any allocated
 * space inside of them.
 */
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
        struct btrfs_block_group_cache *block_group;
        struct btrfs_space_info *space_info;
        struct btrfs_root *root = fs_info->extent_root;
        struct btrfs_trans_handle *trans;
        int ret = 0;

        if (!fs_info->open)
                return;

        spin_lock(&fs_info->unused_bgs_lock);
        while (!list_empty(&fs_info->unused_bgs)) {
                u64 start, end;

                block_group = list_first_entry(&fs_info->unused_bgs,
                                               struct btrfs_block_group_cache,
                                               bg_list);
                space_info = block_group->space_info;
                list_del_init(&block_group->bg_list);
                /*
                 * Skip this block group if a previous removal attempt failed
                 * (ret != 0) or if the filesystem uses mixed block groups.
                 */
                if (ret || btrfs_mixed_space_info(space_info)) {
                        btrfs_put_block_group(block_group);
                        continue;
                }
                spin_unlock(&fs_info->unused_bgs_lock);

                /* Don't want to race with allocators so take the groups_sem */
                down_write(&space_info->groups_sem);
                spin_lock(&block_group->lock);
                if (block_group->reserved ||
                    btrfs_block_group_used(&block_group->item) ||
                    block_group->ro) {
                        /*
                         * We want to bail if we made new allocations or have
                         * outstanding allocations in this block group.  We do
                         * the ro check in case balance is currently acting on
                         * this block group.
                         */
                        spin_unlock(&block_group->lock);
                        up_write(&space_info->groups_sem);
                        goto next;
                }
                spin_unlock(&block_group->lock);

                /* We don't want to force the issue, only flip if it's ok. */
                ret = set_block_group_ro(block_group, 0);
                up_write(&space_info->groups_sem);
                if (ret < 0) {
                        ret = 0;
                        goto next;
                }

                /*
                 * Want to do this before we do anything else so we can recover
                 * properly if we fail to join the transaction.
                 */
                /* 1 for btrfs_orphan_reserve_metadata() */
                trans = btrfs_start_transaction(root, 1);
                if (IS_ERR(trans)) {
                        btrfs_set_block_group_rw(root, block_group);
                        ret = PTR_ERR(trans);
                        goto next;
                }

                /*
                 * We could have pending pinned extents for this block group,
                 * just delete them, we don't care about them anymore.
                 */
                start = block_group->key.objectid;
                end = start + block_group->key.offset - 1;
                /*
                 * Hold the unused_bg_unpin_mutex lock to avoid racing with
                 * btrfs_finish_extent_commit(). If we are at transaction N,
                 * another task might be running finish_extent_commit() for the
                 * previous transaction N - 1, and have seen a range belonging
                 * to the block group in freed_extents[] before we were able to
                 * clear the whole block group range from freed_extents[]. This
                 * means that task can look up the block group after we
                 * unpinned it from freed_extents[] and removed it, leading to
                 * a BUG_ON() at btrfs_unpin_extent_range().
                 */
                mutex_lock(&fs_info->unused_bg_unpin_mutex);
                ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
                                  EXTENT_DIRTY, GFP_NOFS);
                if (ret) {
                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                        btrfs_set_block_group_rw(root, block_group);
                        goto end_trans;
                }
                ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
                                  EXTENT_DIRTY, GFP_NOFS);
                if (ret) {
                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                        btrfs_set_block_group_rw(root, block_group);
                        goto end_trans;
                }
                mutex_unlock(&fs_info->unused_bg_unpin_mutex);

                /* Reset pinned so btrfs_put_block_group doesn't complain */
                block_group->pinned = 0;

                /*
                 * btrfs_remove_chunk() will abort the transaction if things go
                 * horribly wrong.
                 */
                ret = btrfs_remove_chunk(trans, root,
                                         block_group->key.objectid);
end_trans:
                btrfs_end_transaction(trans, root);
next:
                btrfs_put_block_group(block_group);
                spin_lock(&fs_info->unused_bgs_lock);
        }
        spin_unlock(&fs_info->unused_bgs_lock);
}
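
/*
 * Illustrative sketch (standalone userspace program, not kernel code): the
 * loop above follows a common pattern - pop an entry while holding
 * unused_bgs_lock, drop the lock for the slow work, retake it before testing
 * the list again.  Below is a minimal analogue using a pthread mutex; all
 * names (demo_node, demo_drain, ...) are hypothetical.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_node {
        struct demo_node *next;
        int id;
};

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
static struct demo_node *demo_head;

static void demo_drain(void)
{
        pthread_mutex_lock(&demo_lock);
        while (demo_head) {
                struct demo_node *n = demo_head;

                demo_head = n->next;              /* unlink under the lock */
                pthread_mutex_unlock(&demo_lock);

                printf("processing %d\n", n->id); /* slow work, lock dropped */
                free(n);

                pthread_mutex_lock(&demo_lock);   /* retake before the test */
        }
        pthread_mutex_unlock(&demo_lock);
}

int main(void)
{
        for (int i = 0; i < 3; i++) {
                struct demo_node *n = malloc(sizeof(*n));

                n->id = i;
                n->next = demo_head;
                demo_head = n;
        }
        demo_drain();
        return 0;
}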

int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
        struct btrfs_space_info *space_info;
        struct btrfs_super_block *disk_super;
        u64 features;
        u64 flags;
        int mixed = 0;
        int ret;

        disk_super = fs_info->super_copy;
        if (!btrfs_super_root(disk_super))
                return 1;

        features = btrfs_super_incompat_flags(disk_super);
        if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
                mixed = 1;

        flags = BTRFS_BLOCK_GROUP_SYSTEM;
        ret = update_space_info(fs_info, flags, 0, 0, &space_info);
        if (ret)
                goto out;

        if (mixed) {
                flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
                ret = update_space_info(fs_info, flags, 0, 0, &space_info);
        } else {
                flags = BTRFS_BLOCK_GROUP_METADATA;
                ret = update_space_info(fs_info, flags, 0, 0, &space_info);
                if (ret)
                        goto out;

                flags = BTRFS_BLOCK_GROUP_DATA;
                ret = update_space_info(fs_info, flags, 0, 0, &space_info);
        }
out:
        return ret;
}
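
/*
 * Illustrative sketch (standalone userspace program, not kernel code):
 * btrfs_init_space_info() registers one combined data+metadata space info on
 * mixed filesystems and separate ones otherwise.  The program below only
 * shows how the flag bits combine; the DEMO_* values mirror the
 * BTRFS_BLOCK_GROUP_* bits from the on-disk format, and the DEMO_ prefix is
 * ours.
 */
#include <stdio.h>

#define DEMO_BLOCK_GROUP_DATA           (1ULL << 0)
#define DEMO_BLOCK_GROUP_SYSTEM         (1ULL << 1)
#define DEMO_BLOCK_GROUP_METADATA       (1ULL << 2)

int main(void)
{
        unsigned long long mixed = DEMO_BLOCK_GROUP_METADATA |
                                   DEMO_BLOCK_GROUP_DATA;

        printf("system:         0x%llx\n", DEMO_BLOCK_GROUP_SYSTEM);
        printf("mixed data+md:  0x%llx\n", mixed);
        printf("metadata alone: 0x%llx\n", DEMO_BLOCK_GROUP_METADATA);
        printf("data alone:     0x%llx\n", DEMO_BLOCK_GROUP_DATA);
        return 0;
}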

int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
        return unpin_extent_range(root, start, end, false);
}

int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_group_cache *cache = NULL;
        u64 group_trimmed;
        u64 start;
        u64 end;
        u64 trimmed = 0;
        u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
        int ret = 0;

        /*
         * Try to trim all FS space; note that our first block group may
         * start at a non-zero offset.
         */
        if (range->len == total_bytes)
                cache = btrfs_lookup_first_block_group(fs_info, range->start);
        else
                cache = btrfs_lookup_block_group(fs_info, range->start);

        while (cache) {
                if (cache->key.objectid >= (range->start + range->len)) {
                        btrfs_put_block_group(cache);
                        break;
                }

                start = max(range->start, cache->key.objectid);
                end = min(range->start + range->len,
                                cache->key.objectid + cache->key.offset);

                if (end - start >= range->minlen) {
                        if (!block_group_cache_done(cache)) {
                                ret = cache_block_group(cache, 0);
                                if (ret) {
                                        btrfs_put_block_group(cache);
                                        break;
                                }
                                ret = wait_block_group_cache_done(cache);
                                if (ret) {
                                        btrfs_put_block_group(cache);
                                        break;
                                }
                        }
                        ret = btrfs_trim_block_group(cache,
                                                     &group_trimmed,
                                                     start,
                                                     end,
                                                     range->minlen);

                        trimmed += group_trimmed;
                        if (ret) {
                                btrfs_put_block_group(cache);
                                break;
                        }
                }

                cache = next_block_group(fs_info->tree_root, cache);
        }

        range->len = trimmed;
        return ret;
}
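
/*
 * Illustrative sketch (standalone userspace program, not kernel code):
 * btrfs_trim_fs() is what ultimately services the FITRIM ioctl (see
 * btrfs_ioctl_fitrim() referenced above).  The program below exercises it
 * the same way fstrim(8) does: fill in a struct fstrim_range, issue FITRIM
 * on any fd within the mounted filesystem, and read back range.len for the
 * number of bytes trimmed.
 */
#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        struct fstrim_range range;
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <mount point>\n", argv[0]);
                return 1;
        }

        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        memset(&range, 0, sizeof(range));
        range.start = 0;
        range.len = (__u64)-1;          /* trim the whole filesystem */
        range.minlen = 0;               /* no minimum extent length */

        if (ioctl(fd, FITRIM, &range) < 0) {
                perror("FITRIM");
                close(fd);
                return 1;
        }

        /* The kernel writes the trimmed byte count back into range.len. */
        printf("trimmed %llu bytes\n", (unsigned long long)range.len);
        close(fd);
        return 0;
}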

/*
 * btrfs_{start,end}_write_no_snapshoting() are similar to
 * mnt_{want,drop}_write(): they are used to prevent tasks from writing data
 * into the page cache through nocow before the subvolume is snapshotted but
 * flushing that data to disk only after the snapshot creation, and to
 * prevent operations, while a snapshot is being taken, that would leave the
 * snapshot inconsistent (writes followed by expanding truncates, for
 * example).
 */
void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
{
        percpu_counter_dec(&root->subv_writers->counter);
        /*
         * Make sure counter is updated before we wake up
         * waiters.
         */
        smp_mb();
        if (waitqueue_active(&root->subv_writers->wait))
                wake_up(&root->subv_writers->wait);
}

int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
{
        if (atomic_read(&root->will_be_snapshoted))
                return 0;

        percpu_counter_inc(&root->subv_writers->counter);
        /*
         * Make sure counter is updated before we check for snapshot creation.
         */
        smp_mb();
        if (atomic_read(&root->will_be_snapshoted)) {
                btrfs_end_write_no_snapshoting(root);
                return 0;
        }
        return 1;
}

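/*
 * Illustrative sketch (standalone userspace program, not kernel code): the
 * two functions above implement a counter/flag handshake - writers increment
 * a counter and then re-check the flag, snapshot creation sets the flag and
 * then waits for the counter to drain, with a full barrier (smp_mb()) on
 * both sides so neither can miss the other.  Below is a minimal analogue
 * using C11 fences as stand-ins for smp_mb() and a busy-wait instead of the
 * waitqueue; all names are hypothetical.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int writers;
static atomic_bool will_snapshot;

static bool start_write(void)
{
        if (atomic_load(&will_snapshot))
                return false;
        atomic_fetch_add(&writers, 1);
        /* Publish the increment before re-checking the flag. */
        atomic_thread_fence(memory_order_seq_cst);
        if (atomic_load(&will_snapshot)) {
                atomic_fetch_sub(&writers, 1);
                return false;
        }
        return true;
}

static void end_write(void)
{
        atomic_fetch_sub(&writers, 1);
}

static void begin_snapshot(void)
{
        atomic_store(&will_snapshot, true);
        /* Publish the flag before reading the counter. */
        atomic_thread_fence(memory_order_seq_cst);
        while (atomic_load(&writers) > 0)
                ;       /* the kernel sleeps on a waitqueue instead */
}

int main(void)
{
        if (start_write()) {
                printf("writer admitted\n");
                end_write();
        }
        begin_snapshot();
        printf("snapshot may proceed\n");
        return 0;
}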