linux/fs/btrfs/extent-tree.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2007 Oracle.  All rights reserved.
   4 */
   5
   6#include <linux/sched.h>
   7#include <linux/sched/signal.h>
   8#include <linux/pagemap.h>
   9#include <linux/writeback.h>
  10#include <linux/blkdev.h>
  11#include <linux/sort.h>
  12#include <linux/rcupdate.h>
  13#include <linux/kthread.h>
  14#include <linux/slab.h>
  15#include <linux/ratelimit.h>
  16#include <linux/percpu_counter.h>
  17#include <linux/lockdep.h>
  18#include <linux/crc32c.h>
  19#include "tree-log.h"
  20#include "disk-io.h"
  21#include "print-tree.h"
  22#include "volumes.h"
  23#include "raid56.h"
  24#include "locking.h"
  25#include "free-space-cache.h"
  26#include "free-space-tree.h"
  27#include "math.h"
  28#include "sysfs.h"
  29#include "qgroup.h"
  30#include "ref-verify.h"
  31
  32#undef SCRAMBLE_DELAYED_REFS
  33
  34/*
  35 * control flags for do_chunk_alloc's force field
  36 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
  37 * if we really need one.
  38 *
  39 * CHUNK_ALLOC_LIMITED means to only try and allocate one
  40 * if we have very few chunks already allocated.  This is
  41 * used as part of the clustering code to help make sure
  42 * we have a good pool of storage to cluster in, without
  43 * filling the FS with empty chunks
  44 *
  45 * CHUNK_ALLOC_FORCE means it must try to allocate one
  46 *
  47 */
  48enum {
  49        CHUNK_ALLOC_NO_FORCE = 0,
  50        CHUNK_ALLOC_LIMITED = 1,
  51        CHUNK_ALLOC_FORCE = 2,
  52};
  53
  54static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
  55                               struct btrfs_fs_info *fs_info,
  56                                struct btrfs_delayed_ref_node *node, u64 parent,
  57                                u64 root_objectid, u64 owner_objectid,
  58                                u64 owner_offset, int refs_to_drop,
  59                                struct btrfs_delayed_extent_op *extra_op);
  60static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
  61                                    struct extent_buffer *leaf,
  62                                    struct btrfs_extent_item *ei);
  63static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
  64                                      struct btrfs_fs_info *fs_info,
  65                                      u64 parent, u64 root_objectid,
  66                                      u64 flags, u64 owner, u64 offset,
  67                                      struct btrfs_key *ins, int ref_mod);
  68static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
  69                                     struct btrfs_fs_info *fs_info,
  70                                     u64 parent, u64 root_objectid,
  71                                     u64 flags, struct btrfs_disk_key *key,
  72                                     int level, struct btrfs_key *ins);
  73static int do_chunk_alloc(struct btrfs_trans_handle *trans,
  74                          struct btrfs_fs_info *fs_info, u64 flags,
  75                          int force);
  76static int find_next_key(struct btrfs_path *path, int level,
  77                         struct btrfs_key *key);
  78static void dump_space_info(struct btrfs_fs_info *fs_info,
  79                            struct btrfs_space_info *info, u64 bytes,
  80                            int dump_block_groups);
  81static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
  82                               u64 num_bytes);
  83static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
  84                                     struct btrfs_space_info *space_info,
  85                                     u64 num_bytes);
  86static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
  87                                     struct btrfs_space_info *space_info,
  88                                     u64 num_bytes);
  89
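/*
 * Return non-zero once caching of this block group has completed, either
 * successfully or with an error.
 */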
  90static noinline int
  91block_group_cache_done(struct btrfs_block_group_cache *cache)
  92{
  93        smp_mb();
  94        return cache->cached == BTRFS_CACHE_FINISHED ||
  95                cache->cached == BTRFS_CACHE_ERROR;
  96}
  97
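/* Return non-zero if the block group's flags contain all of the given bits. */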
  98static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
  99{
 100        return (cache->flags & bits) == bits;
 101}
 102
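/* Take a reference on the block group; paired with btrfs_put_block_group(). */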
 103void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 104{
 105        atomic_inc(&cache->count);
 106}
 107
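/* Drop a reference on the block group; the last put frees it. */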
 108void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 109{
 110        if (atomic_dec_and_test(&cache->count)) {
 111                WARN_ON(cache->pinned > 0);
 112                WARN_ON(cache->reserved > 0);
 113
  114                /*
  115                 * If the tree is not empty, someone is still holding a full
  116                 * stripe lock, which can only be released by that caller, and
  117                 * freeing the block group now would cause a use-after-free
  118                 * when the caller tries to release the lock.
  119                 *
  120                 * There is no better way to resolve this than to warn.
  121                 */
 122                WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
 123                kfree(cache->free_space_ctl);
 124                kfree(cache);
 125        }
 126}
 127
 128/*
 129 * this adds the block group to the fs_info rb tree for the block group
 130 * cache
 131 */
 132static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 133                                struct btrfs_block_group_cache *block_group)
 134{
 135        struct rb_node **p;
 136        struct rb_node *parent = NULL;
 137        struct btrfs_block_group_cache *cache;
 138
 139        spin_lock(&info->block_group_cache_lock);
 140        p = &info->block_group_cache_tree.rb_node;
 141
 142        while (*p) {
 143                parent = *p;
 144                cache = rb_entry(parent, struct btrfs_block_group_cache,
 145                                 cache_node);
 146                if (block_group->key.objectid < cache->key.objectid) {
 147                        p = &(*p)->rb_left;
 148                } else if (block_group->key.objectid > cache->key.objectid) {
 149                        p = &(*p)->rb_right;
 150                } else {
 151                        spin_unlock(&info->block_group_cache_lock);
 152                        return -EEXIST;
 153                }
 154        }
 155
 156        rb_link_node(&block_group->cache_node, parent, p);
 157        rb_insert_color(&block_group->cache_node,
 158                        &info->block_group_cache_tree);
 159
 160        if (info->first_logical_byte > block_group->key.objectid)
 161                info->first_logical_byte = block_group->key.objectid;
 162
 163        spin_unlock(&info->block_group_cache_lock);
 164
 165        return 0;
 166}
 167
 168/*
 169 * This will return the block group at or after bytenr if contains is 0, else
 170 * it will return the block group that contains the bytenr
 171 */
 172static struct btrfs_block_group_cache *
 173block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 174                              int contains)
 175{
 176        struct btrfs_block_group_cache *cache, *ret = NULL;
 177        struct rb_node *n;
 178        u64 end, start;
 179
 180        spin_lock(&info->block_group_cache_lock);
 181        n = info->block_group_cache_tree.rb_node;
 182
 183        while (n) {
 184                cache = rb_entry(n, struct btrfs_block_group_cache,
 185                                 cache_node);
 186                end = cache->key.objectid + cache->key.offset - 1;
 187                start = cache->key.objectid;
 188
 189                if (bytenr < start) {
 190                        if (!contains && (!ret || start < ret->key.objectid))
 191                                ret = cache;
 192                        n = n->rb_left;
 193                } else if (bytenr > start) {
 194                        if (contains && bytenr <= end) {
 195                                ret = cache;
 196                                break;
 197                        }
 198                        n = n->rb_right;
 199                } else {
 200                        ret = cache;
 201                        break;
 202                }
 203        }
 204        if (ret) {
 205                btrfs_get_block_group(ret);
 206                if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
 207                        info->first_logical_byte = ret->key.objectid;
 208        }
 209        spin_unlock(&info->block_group_cache_lock);
 210
 211        return ret;
 212}
 213
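/*
 * Mark the range as excluded (EXTENT_UPTODATE) in both freed_extents trees so
 * that free space caching does not treat it as free space.
 */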
 214static int add_excluded_extent(struct btrfs_fs_info *fs_info,
 215                               u64 start, u64 num_bytes)
 216{
 217        u64 end = start + num_bytes - 1;
 218        set_extent_bits(&fs_info->freed_extents[0],
 219                        start, end, EXTENT_UPTODATE);
 220        set_extent_bits(&fs_info->freed_extents[1],
 221                        start, end, EXTENT_UPTODATE);
 222        return 0;
 223}
 224
 225static void free_excluded_extents(struct btrfs_fs_info *fs_info,
 226                                  struct btrfs_block_group_cache *cache)
 227{
 228        u64 start, end;
 229
 230        start = cache->key.objectid;
 231        end = start + cache->key.offset - 1;
 232
 233        clear_extent_bits(&fs_info->freed_extents[0],
 234                          start, end, EXTENT_UPTODATE);
 235        clear_extent_bits(&fs_info->freed_extents[1],
 236                          start, end, EXTENT_UPTODATE);
 237}
 238
 239static int exclude_super_stripes(struct btrfs_fs_info *fs_info,
 240                                 struct btrfs_block_group_cache *cache)
 241{
 242        u64 bytenr;
 243        u64 *logical;
 244        int stripe_len;
 245        int i, nr, ret;
 246
 247        if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
 248                stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
 249                cache->bytes_super += stripe_len;
 250                ret = add_excluded_extent(fs_info, cache->key.objectid,
 251                                          stripe_len);
 252                if (ret)
 253                        return ret;
 254        }
 255
 256        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 257                bytenr = btrfs_sb_offset(i);
 258                ret = btrfs_rmap_block(fs_info, cache->key.objectid,
 259                                       bytenr, 0, &logical, &nr, &stripe_len);
 260                if (ret)
 261                        return ret;
 262
 263                while (nr--) {
 264                        u64 start, len;
 265
 266                        if (logical[nr] > cache->key.objectid +
 267                            cache->key.offset)
 268                                continue;
 269
 270                        if (logical[nr] + stripe_len <= cache->key.objectid)
 271                                continue;
 272
 273                        start = logical[nr];
 274                        if (start < cache->key.objectid) {
 275                                start = cache->key.objectid;
 276                                len = (logical[nr] + stripe_len) - start;
 277                        } else {
 278                                len = min_t(u64, stripe_len,
 279                                            cache->key.objectid +
 280                                            cache->key.offset - start);
 281                        }
 282
 283                        cache->bytes_super += len;
 284                        ret = add_excluded_extent(fs_info, start, len);
 285                        if (ret) {
 286                                kfree(logical);
 287                                return ret;
 288                        }
 289                }
 290
 291                kfree(logical);
 292        }
 293        return 0;
 294}
 295
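/*
 * Return the block group's caching control with an extra reference held, or
 * NULL if no caching is in progress.  Drop the reference with
 * put_caching_control().
 */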
 296static struct btrfs_caching_control *
 297get_caching_control(struct btrfs_block_group_cache *cache)
 298{
 299        struct btrfs_caching_control *ctl;
 300
 301        spin_lock(&cache->lock);
 302        if (!cache->caching_ctl) {
 303                spin_unlock(&cache->lock);
 304                return NULL;
 305        }
 306
 307        ctl = cache->caching_ctl;
 308        refcount_inc(&ctl->count);
 309        spin_unlock(&cache->lock);
 310        return ctl;
 311}
 312
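/* Drop a caching control reference; the structure is freed on the last put. */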
 313static void put_caching_control(struct btrfs_caching_control *ctl)
 314{
 315        if (refcount_dec_and_test(&ctl->count))
 316                kfree(ctl);
 317}
 318
 319#ifdef CONFIG_BTRFS_DEBUG
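/*
 * Debug helper: remove every other chunk-sized range of free space from the
 * block group so allocations are forced to deal with fragmented space.
 */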
 320static void fragment_free_space(struct btrfs_block_group_cache *block_group)
 321{
 322        struct btrfs_fs_info *fs_info = block_group->fs_info;
 323        u64 start = block_group->key.objectid;
 324        u64 len = block_group->key.offset;
 325        u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
 326                fs_info->nodesize : fs_info->sectorsize;
 327        u64 step = chunk << 1;
 328
 329        while (len > chunk) {
 330                btrfs_remove_free_space(block_group, start, chunk);
 331                start += step;
 332                if (len < step)
 333                        len = 0;
 334                else
 335                        len -= step;
 336        }
 337}
 338#endif
 339
  340/*
  341 * This is only called by cache_block_group.  Since we could have freed
  342 * extents, we must check the pinned_extents for any extents that can't be
  343 * used yet, as their free space is released only when the transaction commits.
  344 */
 345u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 346                       struct btrfs_fs_info *info, u64 start, u64 end)
 347{
 348        u64 extent_start, extent_end, size, total_added = 0;
 349        int ret;
 350
 351        while (start < end) {
 352                ret = find_first_extent_bit(info->pinned_extents, start,
 353                                            &extent_start, &extent_end,
 354                                            EXTENT_DIRTY | EXTENT_UPTODATE,
 355                                            NULL);
 356                if (ret)
 357                        break;
 358
 359                if (extent_start <= start) {
 360                        start = extent_end + 1;
 361                } else if (extent_start > start && extent_start < end) {
 362                        size = extent_start - start;
 363                        total_added += size;
 364                        ret = btrfs_add_free_space(block_group, start,
 365                                                   size);
 366                        BUG_ON(ret); /* -ENOMEM or logic error */
 367                        start = extent_end + 1;
 368                } else {
 369                        break;
 370                }
 371        }
 372
 373        if (start < end) {
 374                size = end - start;
 375                total_added += size;
 376                ret = btrfs_add_free_space(block_group, start, size);
 377                BUG_ON(ret); /* -ENOMEM or logic error */
 378        }
 379
 380        return total_added;
 381}
 382
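/*
 * Walk the extent tree (commit root) for this block group and record the gaps
 * between allocated extents as free space.
 */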
 383static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 384{
 385        struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
 386        struct btrfs_fs_info *fs_info = block_group->fs_info;
 387        struct btrfs_root *extent_root = fs_info->extent_root;
 388        struct btrfs_path *path;
 389        struct extent_buffer *leaf;
 390        struct btrfs_key key;
 391        u64 total_found = 0;
 392        u64 last = 0;
 393        u32 nritems;
 394        int ret;
 395        bool wakeup = true;
 396
 397        path = btrfs_alloc_path();
 398        if (!path)
 399                return -ENOMEM;
 400
 401        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 402
 403#ifdef CONFIG_BTRFS_DEBUG
 404        /*
 405         * If we're fragmenting we don't want to make anybody think we can
 406         * allocate from this block group until we've had a chance to fragment
 407         * the free space.
 408         */
 409        if (btrfs_should_fragment_free_space(block_group))
 410                wakeup = false;
 411#endif
  412        /*
  413         * We don't want to deadlock with somebody trying to allocate a new
  414         * extent for the extent root while we are also trying to search the
  415         * extent root to add free space.  So we skip locking and search the
  416         * commit root, since it's read-only.
  417         */
 418        path->skip_locking = 1;
 419        path->search_commit_root = 1;
 420        path->reada = READA_FORWARD;
 421
 422        key.objectid = last;
 423        key.offset = 0;
 424        key.type = BTRFS_EXTENT_ITEM_KEY;
 425
 426next:
 427        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 428        if (ret < 0)
 429                goto out;
 430
 431        leaf = path->nodes[0];
 432        nritems = btrfs_header_nritems(leaf);
 433
 434        while (1) {
 435                if (btrfs_fs_closing(fs_info) > 1) {
 436                        last = (u64)-1;
 437                        break;
 438                }
 439
 440                if (path->slots[0] < nritems) {
 441                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 442                } else {
 443                        ret = find_next_key(path, 0, &key);
 444                        if (ret)
 445                                break;
 446
 447                        if (need_resched() ||
 448                            rwsem_is_contended(&fs_info->commit_root_sem)) {
 449                                if (wakeup)
 450                                        caching_ctl->progress = last;
 451                                btrfs_release_path(path);
 452                                up_read(&fs_info->commit_root_sem);
 453                                mutex_unlock(&caching_ctl->mutex);
 454                                cond_resched();
 455                                mutex_lock(&caching_ctl->mutex);
 456                                down_read(&fs_info->commit_root_sem);
 457                                goto next;
 458                        }
 459
 460                        ret = btrfs_next_leaf(extent_root, path);
 461                        if (ret < 0)
 462                                goto out;
 463                        if (ret)
 464                                break;
 465                        leaf = path->nodes[0];
 466                        nritems = btrfs_header_nritems(leaf);
 467                        continue;
 468                }
 469
 470                if (key.objectid < last) {
 471                        key.objectid = last;
 472                        key.offset = 0;
 473                        key.type = BTRFS_EXTENT_ITEM_KEY;
 474
 475                        if (wakeup)
 476                                caching_ctl->progress = last;
 477                        btrfs_release_path(path);
 478                        goto next;
 479                }
 480
 481                if (key.objectid < block_group->key.objectid) {
 482                        path->slots[0]++;
 483                        continue;
 484                }
 485
 486                if (key.objectid >= block_group->key.objectid +
 487                    block_group->key.offset)
 488                        break;
 489
 490                if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 491                    key.type == BTRFS_METADATA_ITEM_KEY) {
 492                        total_found += add_new_free_space(block_group,
 493                                                          fs_info, last,
 494                                                          key.objectid);
 495                        if (key.type == BTRFS_METADATA_ITEM_KEY)
 496                                last = key.objectid +
 497                                        fs_info->nodesize;
 498                        else
 499                                last = key.objectid + key.offset;
 500
 501                        if (total_found > CACHING_CTL_WAKE_UP) {
 502                                total_found = 0;
 503                                if (wakeup)
 504                                        wake_up(&caching_ctl->wait);
 505                        }
 506                }
 507                path->slots[0]++;
 508        }
 509        ret = 0;
 510
 511        total_found += add_new_free_space(block_group, fs_info, last,
 512                                          block_group->key.objectid +
 513                                          block_group->key.offset);
 514        caching_ctl->progress = (u64)-1;
 515
 516out:
 517        btrfs_free_path(path);
 518        return ret;
 519}
 520
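/*
 * Background worker that populates a block group's free space, either from
 * the free space tree or by walking the extent tree, and then marks caching
 * as finished (or errored).
 */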
 521static noinline void caching_thread(struct btrfs_work *work)
 522{
 523        struct btrfs_block_group_cache *block_group;
 524        struct btrfs_fs_info *fs_info;
 525        struct btrfs_caching_control *caching_ctl;
 526        int ret;
 527
 528        caching_ctl = container_of(work, struct btrfs_caching_control, work);
 529        block_group = caching_ctl->block_group;
 530        fs_info = block_group->fs_info;
 531
 532        mutex_lock(&caching_ctl->mutex);
 533        down_read(&fs_info->commit_root_sem);
 534
 535        if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
 536                ret = load_free_space_tree(caching_ctl);
 537        else
 538                ret = load_extent_tree_free(caching_ctl);
 539
 540        spin_lock(&block_group->lock);
 541        block_group->caching_ctl = NULL;
 542        block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
 543        spin_unlock(&block_group->lock);
 544
 545#ifdef CONFIG_BTRFS_DEBUG
 546        if (btrfs_should_fragment_free_space(block_group)) {
 547                u64 bytes_used;
 548
 549                spin_lock(&block_group->space_info->lock);
 550                spin_lock(&block_group->lock);
 551                bytes_used = block_group->key.offset -
 552                        btrfs_block_group_used(&block_group->item);
 553                block_group->space_info->bytes_used += bytes_used >> 1;
 554                spin_unlock(&block_group->lock);
 555                spin_unlock(&block_group->space_info->lock);
 556                fragment_free_space(block_group);
 557        }
 558#endif
 559
 560        caching_ctl->progress = (u64)-1;
 561
 562        up_read(&fs_info->commit_root_sem);
 563        free_excluded_extents(fs_info, block_group);
 564        mutex_unlock(&caching_ctl->mutex);
 565
 566        wake_up(&caching_ctl->wait);
 567
 568        put_caching_control(caching_ctl);
 569        btrfs_put_block_group(block_group);
 570}
 571
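/*
 * Start caching the block group's free space.  If @load_cache_only is set, at
 * most the fast load from the on-disk space cache is attempted and no
 * background caching thread is queued.
 */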
 572static int cache_block_group(struct btrfs_block_group_cache *cache,
 573                             int load_cache_only)
 574{
 575        DEFINE_WAIT(wait);
 576        struct btrfs_fs_info *fs_info = cache->fs_info;
 577        struct btrfs_caching_control *caching_ctl;
 578        int ret = 0;
 579
 580        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 581        if (!caching_ctl)
 582                return -ENOMEM;
 583
 584        INIT_LIST_HEAD(&caching_ctl->list);
 585        mutex_init(&caching_ctl->mutex);
 586        init_waitqueue_head(&caching_ctl->wait);
 587        caching_ctl->block_group = cache;
 588        caching_ctl->progress = cache->key.objectid;
 589        refcount_set(&caching_ctl->count, 1);
 590        btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
 591                        caching_thread, NULL, NULL);
 592
 593        spin_lock(&cache->lock);
  594        /*
  595         * This should be a rare occasion, but it can happen when one thread
  596         * starts to load the space cache info and some other thread then
  597         * starts a transaction commit that tries to do an allocation while
  598         * the first thread is still loading the space cache info.  The
  599         * previous loop should have kept us from choosing this block group,
  600         * but if we've moved to the state where we will wait on caching
  601         * block groups, we need to first check whether we're doing a fast
  602         * load here, so we can wait for it to finish; otherwise we could
  603         * end up allocating from a block group whose cache gets evicted for
  604         * one reason or another.
  605         */
 606        while (cache->cached == BTRFS_CACHE_FAST) {
 607                struct btrfs_caching_control *ctl;
 608
 609                ctl = cache->caching_ctl;
 610                refcount_inc(&ctl->count);
 611                prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
 612                spin_unlock(&cache->lock);
 613
 614                schedule();
 615
 616                finish_wait(&ctl->wait, &wait);
 617                put_caching_control(ctl);
 618                spin_lock(&cache->lock);
 619        }
 620
 621        if (cache->cached != BTRFS_CACHE_NO) {
 622                spin_unlock(&cache->lock);
 623                kfree(caching_ctl);
 624                return 0;
 625        }
 626        WARN_ON(cache->caching_ctl);
 627        cache->caching_ctl = caching_ctl;
 628        cache->cached = BTRFS_CACHE_FAST;
 629        spin_unlock(&cache->lock);
 630
 631        if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
 632                mutex_lock(&caching_ctl->mutex);
 633                ret = load_free_space_cache(fs_info, cache);
 634
 635                spin_lock(&cache->lock);
 636                if (ret == 1) {
 637                        cache->caching_ctl = NULL;
 638                        cache->cached = BTRFS_CACHE_FINISHED;
 639                        cache->last_byte_to_unpin = (u64)-1;
 640                        caching_ctl->progress = (u64)-1;
 641                } else {
 642                        if (load_cache_only) {
 643                                cache->caching_ctl = NULL;
 644                                cache->cached = BTRFS_CACHE_NO;
 645                        } else {
 646                                cache->cached = BTRFS_CACHE_STARTED;
 647                                cache->has_caching_ctl = 1;
 648                        }
 649                }
 650                spin_unlock(&cache->lock);
 651#ifdef CONFIG_BTRFS_DEBUG
 652                if (ret == 1 &&
 653                    btrfs_should_fragment_free_space(cache)) {
 654                        u64 bytes_used;
 655
 656                        spin_lock(&cache->space_info->lock);
 657                        spin_lock(&cache->lock);
 658                        bytes_used = cache->key.offset -
 659                                btrfs_block_group_used(&cache->item);
 660                        cache->space_info->bytes_used += bytes_used >> 1;
 661                        spin_unlock(&cache->lock);
 662                        spin_unlock(&cache->space_info->lock);
 663                        fragment_free_space(cache);
 664                }
 665#endif
 666                mutex_unlock(&caching_ctl->mutex);
 667
 668                wake_up(&caching_ctl->wait);
 669                if (ret == 1) {
 670                        put_caching_control(caching_ctl);
 671                        free_excluded_extents(fs_info, cache);
 672                        return 0;
 673                }
 674        } else {
 675                /*
 676                 * We're either using the free space tree or no caching at all.
 677                 * Set cached to the appropriate value and wakeup any waiters.
 678                 */
 679                spin_lock(&cache->lock);
 680                if (load_cache_only) {
 681                        cache->caching_ctl = NULL;
 682                        cache->cached = BTRFS_CACHE_NO;
 683                } else {
 684                        cache->cached = BTRFS_CACHE_STARTED;
 685                        cache->has_caching_ctl = 1;
 686                }
 687                spin_unlock(&cache->lock);
 688                wake_up(&caching_ctl->wait);
 689        }
 690
 691        if (load_cache_only) {
 692                put_caching_control(caching_ctl);
 693                return 0;
 694        }
 695
 696        down_write(&fs_info->commit_root_sem);
 697        refcount_inc(&caching_ctl->count);
 698        list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 699        up_write(&fs_info->commit_root_sem);
 700
 701        btrfs_get_block_group(cache);
 702
 703        btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 704
 705        return ret;
 706}
 707
 708/*
 709 * return the block group that starts at or after bytenr
 710 */
 711static struct btrfs_block_group_cache *
 712btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
 713{
 714        return block_group_cache_tree_search(info, bytenr, 0);
 715}
 716
 717/*
 718 * return the block group that contains the given bytenr
 719 */
 720struct btrfs_block_group_cache *btrfs_lookup_block_group(
 721                                                 struct btrfs_fs_info *info,
 722                                                 u64 bytenr)
 723{
 724        return block_group_cache_tree_search(info, bytenr, 1);
 725}
 726
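/*
 * Return the space_info whose flags cover the given block group type bits, or
 * NULL if no such space_info exists yet.
 */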
 727static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 728                                                  u64 flags)
 729{
 730        struct list_head *head = &info->space_info;
 731        struct btrfs_space_info *found;
 732
 733        flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
 734
 735        rcu_read_lock();
 736        list_for_each_entry_rcu(found, head, list) {
 737                if (found->flags & flags) {
 738                        rcu_read_unlock();
 739                        return found;
 740                }
 741        }
 742        rcu_read_unlock();
 743        return NULL;
 744}
 745
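/*
 * Adjust the total_bytes_pinned counter of the space_info matching the block
 * group type implied by @owner and @root_objectid; @num_bytes may be negative.
 */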
 746static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
 747                             u64 owner, u64 root_objectid)
 748{
 749        struct btrfs_space_info *space_info;
 750        u64 flags;
 751
 752        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
 753                if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
 754                        flags = BTRFS_BLOCK_GROUP_SYSTEM;
 755                else
 756                        flags = BTRFS_BLOCK_GROUP_METADATA;
 757        } else {
 758                flags = BTRFS_BLOCK_GROUP_DATA;
 759        }
 760
 761        space_info = __find_space_info(fs_info, flags);
 762        ASSERT(space_info);
 763        percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
 764}
 765
 766/*
 767 * after adding space to the filesystem, we need to clear the full flags
 768 * on all the space infos.
 769 */
 770void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
 771{
 772        struct list_head *head = &info->space_info;
 773        struct btrfs_space_info *found;
 774
 775        rcu_read_lock();
 776        list_for_each_entry_rcu(found, head, list)
 777                found->full = 0;
 778        rcu_read_unlock();
 779}
 780
 781/* simple helper to search for an existing data extent at a given offset */
 782int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
 783{
 784        int ret;
 785        struct btrfs_key key;
 786        struct btrfs_path *path;
 787
 788        path = btrfs_alloc_path();
 789        if (!path)
 790                return -ENOMEM;
 791
 792        key.objectid = start;
 793        key.offset = len;
 794        key.type = BTRFS_EXTENT_ITEM_KEY;
 795        ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
 796        btrfs_free_path(path);
 797        return ret;
 798}
 799
  800/*
  801 * Helper function to look up the reference count and flags of a tree block.
  802 *
  803 * The head node for a delayed ref is used to store the sum of all the
  804 * reference count modifications queued up in the rbtree.  The head node
  805 * may also store the extent flags to set.  This way you can check what
  806 * the reference count and extent flags would be once all of the delayed
  807 * refs are processed.
  808 */
 809int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 810                             struct btrfs_fs_info *fs_info, u64 bytenr,
 811                             u64 offset, int metadata, u64 *refs, u64 *flags)
 812{
 813        struct btrfs_delayed_ref_head *head;
 814        struct btrfs_delayed_ref_root *delayed_refs;
 815        struct btrfs_path *path;
 816        struct btrfs_extent_item *ei;
 817        struct extent_buffer *leaf;
 818        struct btrfs_key key;
 819        u32 item_size;
 820        u64 num_refs;
 821        u64 extent_flags;
 822        int ret;
 823
 824        /*
 825         * If we don't have skinny metadata, don't bother doing anything
 826         * different
 827         */
 828        if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
 829                offset = fs_info->nodesize;
 830                metadata = 0;
 831        }
 832
 833        path = btrfs_alloc_path();
 834        if (!path)
 835                return -ENOMEM;
 836
 837        if (!trans) {
 838                path->skip_locking = 1;
 839                path->search_commit_root = 1;
 840        }
 841
 842search_again:
 843        key.objectid = bytenr;
 844        key.offset = offset;
 845        if (metadata)
 846                key.type = BTRFS_METADATA_ITEM_KEY;
 847        else
 848                key.type = BTRFS_EXTENT_ITEM_KEY;
 849
 850        ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
 851        if (ret < 0)
 852                goto out_free;
 853
 854        if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
 855                if (path->slots[0]) {
 856                        path->slots[0]--;
 857                        btrfs_item_key_to_cpu(path->nodes[0], &key,
 858                                              path->slots[0]);
 859                        if (key.objectid == bytenr &&
 860                            key.type == BTRFS_EXTENT_ITEM_KEY &&
 861                            key.offset == fs_info->nodesize)
 862                                ret = 0;
 863                }
 864        }
 865
 866        if (ret == 0) {
 867                leaf = path->nodes[0];
 868                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 869                if (item_size >= sizeof(*ei)) {
 870                        ei = btrfs_item_ptr(leaf, path->slots[0],
 871                                            struct btrfs_extent_item);
 872                        num_refs = btrfs_extent_refs(leaf, ei);
 873                        extent_flags = btrfs_extent_flags(leaf, ei);
 874                } else {
 875#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 876                        struct btrfs_extent_item_v0 *ei0;
 877                        BUG_ON(item_size != sizeof(*ei0));
 878                        ei0 = btrfs_item_ptr(leaf, path->slots[0],
 879                                             struct btrfs_extent_item_v0);
 880                        num_refs = btrfs_extent_refs_v0(leaf, ei0);
 881                        /* FIXME: this isn't correct for data */
 882                        extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
 883#else
 884                        BUG();
 885#endif
 886                }
 887                BUG_ON(num_refs == 0);
 888        } else {
 889                num_refs = 0;
 890                extent_flags = 0;
 891                ret = 0;
 892        }
 893
 894        if (!trans)
 895                goto out;
 896
 897        delayed_refs = &trans->transaction->delayed_refs;
 898        spin_lock(&delayed_refs->lock);
 899        head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
 900        if (head) {
 901                if (!mutex_trylock(&head->mutex)) {
 902                        refcount_inc(&head->refs);
 903                        spin_unlock(&delayed_refs->lock);
 904
 905                        btrfs_release_path(path);
 906
 907                        /*
 908                         * Mutex was contended, block until it's released and try
 909                         * again
 910                         */
 911                        mutex_lock(&head->mutex);
 912                        mutex_unlock(&head->mutex);
 913                        btrfs_put_delayed_ref_head(head);
 914                        goto search_again;
 915                }
 916                spin_lock(&head->lock);
 917                if (head->extent_op && head->extent_op->update_flags)
 918                        extent_flags |= head->extent_op->flags_to_set;
 919                else
 920                        BUG_ON(num_refs == 0);
 921
 922                num_refs += head->ref_mod;
 923                spin_unlock(&head->lock);
 924                mutex_unlock(&head->mutex);
 925        }
 926        spin_unlock(&delayed_refs->lock);
 927out:
 928        WARN_ON(num_refs == 0);
 929        if (refs)
 930                *refs = num_refs;
 931        if (flags)
 932                *flags = extent_flags;
 933out_free:
 934        btrfs_free_path(path);
 935        return ret;
 936}
 937
 938/*
 939 * Back reference rules.  Back refs have three main goals:
 940 *
 941 * 1) differentiate between all holders of references to an extent so that
 942 *    when a reference is dropped we can make sure it was a valid reference
 943 *    before freeing the extent.
 944 *
 945 * 2) Provide enough information to quickly find the holders of an extent
 946 *    if we notice a given block is corrupted or bad.
 947 *
 948 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 949 *    maintenance.  This is actually the same as #2, but with a slightly
 950 *    different use case.
 951 *
  952 * There are two kinds of back refs. Implicit back refs are optimized
  953 * for pointers in non-shared tree blocks. For a given pointer in a block,
  954 * back refs of this kind provide information about the block's owner tree
  955 * and the pointer's key. This information allows us to find the block by
  956 * b-tree searching. Full back refs are for pointers in tree blocks not
  957 * referenced by their owner trees. The location of the tree block is
  958 * recorded in the back ref. The full back ref is actually generic and can
  959 * be used wherever the implicit back ref is used. Its major shortcoming
  960 * is overhead: every time a tree block gets COWed, we have to update the
  961 * back ref entries for all pointers in it.
 962 *
  963 * For a newly allocated tree block, we use implicit back refs for the
  964 * pointers in it. This means most tree related operations only involve
  965 * implicit back refs. For a tree block created in an old transaction, the
  966 * only way to drop a reference to it is to COW it. So we can detect the
  967 * event that a tree block loses its owner tree's reference and do the
  968 * back ref conversion.
 969 *
 970 * When a tree block is COWed through a tree, there are four cases:
 971 *
 972 * The reference count of the block is one and the tree is the block's
 973 * owner tree. Nothing to do in this case.
 974 *
  975 * The reference count of the block is one and the tree is not the
  976 * block's owner tree. In this case, full back refs are used for the
  977 * pointers in the block. Remove these full back refs and add implicit
  978 * back refs for every pointer in the new block.
  979 *
  980 * The reference count of the block is greater than one and the tree is
  981 * the block's owner tree. In this case, implicit back refs are used for
  982 * the pointers in the block. Add full back refs for every pointer in the
  983 * block and increase the lower level extents' reference counts. The
  984 * original implicit back refs are carried over to the new block.
  985 *
  986 * The reference count of the block is greater than one and the tree is
  987 * not the block's owner tree. Add implicit back refs for every pointer in
  988 * the new block and increase the lower level extents' reference counts.
 989 *
 990 * Back Reference Key composing:
 991 *
 992 * The key objectid corresponds to the first byte in the extent,
 993 * The key type is used to differentiate between types of back refs.
 994 * There are different meanings of the key offset for different types
 995 * of back refs.
 996 *
 997 * File extents can be referenced by:
 998 *
 999 * - multiple snapshots, subvolumes, or different generations in one subvol
1000 * - different files inside a single subvolume
1001 * - different offsets inside a file (bookend extents in file.c)
1002 *
1003 * The extent ref structure for the implicit back refs has fields for:
1004 *
1005 * - Objectid of the subvolume root
1006 * - objectid of the file holding the reference
1007 * - original offset in the file
1008 * - how many bookend extents
1009 *
1010 * The key offset for the implicit back refs is hash of the first
1011 * three fields.
1012 *
 1013 * The extent ref structure for the full back refs has a field for:
 1014 *
 1015 * - number of pointers in the tree leaf
 1016 *
 1017 * The key offset for the full back refs is the first byte of
 1018 * the tree leaf
 1019 *
 1020 * When a file extent is allocated, the implicit back refs are used.
 1021 * The fields are filled in:
 1022 *
 1023 *     (root_key.objectid, inode objectid, offset in file, 1)
 1024 *
 1025 * When a file extent is removed by file truncation, we find the
 1026 * corresponding implicit back refs and check the following fields:
1027 *
1028 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
1029 *
1030 * Btree extents can be referenced by:
1031 *
1032 * - Different subvolumes
1033 *
 1034 * Both the implicit back refs and the full back refs for tree blocks
 1035 * consist only of a key. The key offset for the implicit back refs is
 1036 * the objectid of the block's owner tree. The key offset for the full
 1037 * back refs is the first byte of the parent block.
 1038 *
 1039 * When implicit back refs are used, information about the lowest key and
 1040 * the level of the tree block is required. This information is stored in
 1041 * the tree block info structure.
1042 */
1043
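/*
 * For example, a data extent referenced once by inode 257 at file offset 0 in
 * the subvolume whose root objectid is 5 carries an implicit back ref whose
 * key is (extent start, BTRFS_EXTENT_DATA_REF_KEY, hash(5, 257, 0)).
 */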
1044#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1045static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
1046                                  struct btrfs_fs_info *fs_info,
1047                                  struct btrfs_path *path,
1048                                  u64 owner, u32 extra_size)
1049{
1050        struct btrfs_root *root = fs_info->extent_root;
1051        struct btrfs_extent_item *item;
1052        struct btrfs_extent_item_v0 *ei0;
1053        struct btrfs_extent_ref_v0 *ref0;
1054        struct btrfs_tree_block_info *bi;
1055        struct extent_buffer *leaf;
1056        struct btrfs_key key;
1057        struct btrfs_key found_key;
1058        u32 new_size = sizeof(*item);
1059        u64 refs;
1060        int ret;
1061
1062        leaf = path->nodes[0];
1063        BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
1064
1065        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1066        ei0 = btrfs_item_ptr(leaf, path->slots[0],
1067                             struct btrfs_extent_item_v0);
1068        refs = btrfs_extent_refs_v0(leaf, ei0);
1069
1070        if (owner == (u64)-1) {
1071                while (1) {
1072                        if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1073                                ret = btrfs_next_leaf(root, path);
1074                                if (ret < 0)
1075                                        return ret;
1076                                BUG_ON(ret > 0); /* Corruption */
1077                                leaf = path->nodes[0];
1078                        }
1079                        btrfs_item_key_to_cpu(leaf, &found_key,
1080                                              path->slots[0]);
1081                        BUG_ON(key.objectid != found_key.objectid);
1082                        if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
1083                                path->slots[0]++;
1084                                continue;
1085                        }
1086                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
1087                                              struct btrfs_extent_ref_v0);
1088                        owner = btrfs_ref_objectid_v0(leaf, ref0);
1089                        break;
1090                }
1091        }
1092        btrfs_release_path(path);
1093
1094        if (owner < BTRFS_FIRST_FREE_OBJECTID)
1095                new_size += sizeof(*bi);
1096
1097        new_size -= sizeof(*ei0);
1098        ret = btrfs_search_slot(trans, root, &key, path,
1099                                new_size + extra_size, 1);
1100        if (ret < 0)
1101                return ret;
1102        BUG_ON(ret); /* Corruption */
1103
1104        btrfs_extend_item(fs_info, path, new_size);
1105
1106        leaf = path->nodes[0];
1107        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1108        btrfs_set_extent_refs(leaf, item, refs);
1109        /* FIXME: get real generation */
1110        btrfs_set_extent_generation(leaf, item, 0);
1111        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1112                btrfs_set_extent_flags(leaf, item,
1113                                       BTRFS_EXTENT_FLAG_TREE_BLOCK |
1114                                       BTRFS_BLOCK_FLAG_FULL_BACKREF);
1115                bi = (struct btrfs_tree_block_info *)(item + 1);
1116                /* FIXME: get first key of the block */
1117                memzero_extent_buffer(leaf, (unsigned long)bi, sizeof(*bi));
1118                btrfs_set_tree_block_level(leaf, bi, (int)owner);
1119        } else {
1120                btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1121        }
1122        btrfs_mark_buffer_dirty(leaf);
1123        return 0;
1124}
1125#endif
1126
1127/*
1128 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 1129 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
1130 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
1131 */
1132int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
1133                                     struct btrfs_extent_inline_ref *iref,
1134                                     enum btrfs_inline_ref_type is_data)
1135{
1136        int type = btrfs_extent_inline_ref_type(eb, iref);
1137        u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
1138
1139        if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1140            type == BTRFS_SHARED_BLOCK_REF_KEY ||
1141            type == BTRFS_SHARED_DATA_REF_KEY ||
1142            type == BTRFS_EXTENT_DATA_REF_KEY) {
1143                if (is_data == BTRFS_REF_TYPE_BLOCK) {
1144                        if (type == BTRFS_TREE_BLOCK_REF_KEY)
1145                                return type;
1146                        if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1147                                ASSERT(eb->fs_info);
 1148                                /*
 1149                                 * Every shared ref has a parent tree
 1150                                 * block, whose start must be aligned
 1151                                 * to nodesize.
 1152                                 */
1153                                if (offset &&
1154                                    IS_ALIGNED(offset, eb->fs_info->nodesize))
1155                                        return type;
1156                        }
1157                } else if (is_data == BTRFS_REF_TYPE_DATA) {
1158                        if (type == BTRFS_EXTENT_DATA_REF_KEY)
1159                                return type;
1160                        if (type == BTRFS_SHARED_DATA_REF_KEY) {
1161                                ASSERT(eb->fs_info);
 1162                                /*
 1163                                 * Every shared ref has a parent tree
 1164                                 * block, whose start must be aligned
 1165                                 * to nodesize.
 1166                                 */
1167                                if (offset &&
1168                                    IS_ALIGNED(offset, eb->fs_info->nodesize))
1169                                        return type;
1170                        }
1171                } else {
1172                        ASSERT(is_data == BTRFS_REF_TYPE_ANY);
1173                        return type;
1174                }
1175        }
1176
1177        btrfs_print_leaf((struct extent_buffer *)eb);
1178        btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
1179                  eb->start, type);
1180        WARN_ON(1);
1181
1182        return BTRFS_REF_TYPE_INVALID;
1183}
1184
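/*
 * Hash (root objectid, inode objectid, file offset) into the 64-bit key
 * offset used for extent data ref items by folding two crc32c sums together.
 */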
1185static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1186{
1187        u32 high_crc = ~(u32)0;
1188        u32 low_crc = ~(u32)0;
1189        __le64 lenum;
1190
1191        lenum = cpu_to_le64(root_objectid);
1192        high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1193        lenum = cpu_to_le64(owner);
1194        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1195        lenum = cpu_to_le64(offset);
1196        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1197
1198        return ((u64)high_crc << 31) ^ (u64)low_crc;
1199}
1200
1201static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1202                                     struct btrfs_extent_data_ref *ref)
1203{
1204        return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1205                                    btrfs_extent_data_ref_objectid(leaf, ref),
1206                                    btrfs_extent_data_ref_offset(leaf, ref));
1207}
1208
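/* Return 1 if the on-disk extent data ref matches the given root, owner and offset. */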
1209static int match_extent_data_ref(struct extent_buffer *leaf,
1210                                 struct btrfs_extent_data_ref *ref,
1211                                 u64 root_objectid, u64 owner, u64 offset)
1212{
1213        if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1214            btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1215            btrfs_extent_data_ref_offset(leaf, ref) != offset)
1216                return 0;
1217        return 1;
1218}
1219
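/*
 * Find the data back ref item for @bytenr: a shared data ref keyed by @parent
 * if one is given, otherwise the extent data ref matching the root/owner/offset
 * triple.  Returns 0 with the path positioned at the item, -ENOENT if none is
 * found, or another negative error.
 */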
1220static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1221                                           struct btrfs_fs_info *fs_info,
1222                                           struct btrfs_path *path,
1223                                           u64 bytenr, u64 parent,
1224                                           u64 root_objectid,
1225                                           u64 owner, u64 offset)
1226{
1227        struct btrfs_root *root = fs_info->extent_root;
1228        struct btrfs_key key;
1229        struct btrfs_extent_data_ref *ref;
1230        struct extent_buffer *leaf;
1231        u32 nritems;
1232        int ret;
1233        int recow;
1234        int err = -ENOENT;
1235
1236        key.objectid = bytenr;
1237        if (parent) {
1238                key.type = BTRFS_SHARED_DATA_REF_KEY;
1239                key.offset = parent;
1240        } else {
1241                key.type = BTRFS_EXTENT_DATA_REF_KEY;
1242                key.offset = hash_extent_data_ref(root_objectid,
1243                                                  owner, offset);
1244        }
1245again:
1246        recow = 0;
1247        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1248        if (ret < 0) {
1249                err = ret;
1250                goto fail;
1251        }
1252
1253        if (parent) {
1254                if (!ret)
1255                        return 0;
1256#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1257                key.type = BTRFS_EXTENT_REF_V0_KEY;
1258                btrfs_release_path(path);
1259                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1260                if (ret < 0) {
1261                        err = ret;
1262                        goto fail;
1263                }
1264                if (!ret)
1265                        return 0;
1266#endif
1267                goto fail;
1268        }
1269
1270        leaf = path->nodes[0];
1271        nritems = btrfs_header_nritems(leaf);
1272        while (1) {
1273                if (path->slots[0] >= nritems) {
1274                        ret = btrfs_next_leaf(root, path);
1275                        if (ret < 0)
1276                                err = ret;
1277                        if (ret)
1278                                goto fail;
1279
1280                        leaf = path->nodes[0];
1281                        nritems = btrfs_header_nritems(leaf);
1282                        recow = 1;
1283                }
1284
1285                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1286                if (key.objectid != bytenr ||
1287                    key.type != BTRFS_EXTENT_DATA_REF_KEY)
1288                        goto fail;
1289
1290                ref = btrfs_item_ptr(leaf, path->slots[0],
1291                                     struct btrfs_extent_data_ref);
1292
1293                if (match_extent_data_ref(leaf, ref, root_objectid,
1294                                          owner, offset)) {
1295                        if (recow) {
1296                                btrfs_release_path(path);
1297                                goto again;
1298                        }
1299                        err = 0;
1300                        break;
1301                }
1302                path->slots[0]++;
1303        }
1304fail:
1305        return err;
1306}
1307
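/*
 * Insert a new data back ref item for @bytenr (shared if @parent is given,
 * implicit otherwise), or bump the count of an existing one by @refs_to_add.
 */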
1308static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1309                                           struct btrfs_fs_info *fs_info,
1310                                           struct btrfs_path *path,
1311                                           u64 bytenr, u64 parent,
1312                                           u64 root_objectid, u64 owner,
1313                                           u64 offset, int refs_to_add)
1314{
1315        struct btrfs_root *root = fs_info->extent_root;
1316        struct btrfs_key key;
1317        struct extent_buffer *leaf;
1318        u32 size;
1319        u32 num_refs;
1320        int ret;
1321
1322        key.objectid = bytenr;
1323        if (parent) {
1324                key.type = BTRFS_SHARED_DATA_REF_KEY;
1325                key.offset = parent;
1326                size = sizeof(struct btrfs_shared_data_ref);
1327        } else {
1328                key.type = BTRFS_EXTENT_DATA_REF_KEY;
1329                key.offset = hash_extent_data_ref(root_objectid,
1330                                                  owner, offset);
1331                size = sizeof(struct btrfs_extent_data_ref);
1332        }
1333
1334        ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1335        if (ret && ret != -EEXIST)
1336                goto fail;
1337
1338        leaf = path->nodes[0];
1339        if (parent) {
1340                struct btrfs_shared_data_ref *ref;
1341                ref = btrfs_item_ptr(leaf, path->slots[0],
1342                                     struct btrfs_shared_data_ref);
1343                if (ret == 0) {
1344                        btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1345                } else {
1346                        num_refs = btrfs_shared_data_ref_count(leaf, ref);
1347                        num_refs += refs_to_add;
1348                        btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1349                }
1350        } else {
1351                struct btrfs_extent_data_ref *ref;
1352                while (ret == -EEXIST) {
1353                        ref = btrfs_item_ptr(leaf, path->slots[0],
1354                                             struct btrfs_extent_data_ref);
1355                        if (match_extent_data_ref(leaf, ref, root_objectid,
1356                                                  owner, offset))
1357                                break;
1358                        btrfs_release_path(path);
1359                        key.offset++;
1360                        ret = btrfs_insert_empty_item(trans, root, path, &key,
1361                                                      size);
1362                        if (ret && ret != -EEXIST)
1363                                goto fail;
1364
1365                        leaf = path->nodes[0];
1366                }
1367                ref = btrfs_item_ptr(leaf, path->slots[0],
1368                                     struct btrfs_extent_data_ref);
1369                if (ret == 0) {
1370                        btrfs_set_extent_data_ref_root(leaf, ref,
1371                                                       root_objectid);
1372                        btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1373                        btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1374                        btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1375                } else {
1376                        num_refs = btrfs_extent_data_ref_count(leaf, ref);
1377                        num_refs += refs_to_add;
1378                        btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1379                }
1380        }
1381        btrfs_mark_buffer_dirty(leaf);
1382        ret = 0;
1383fail:
1384        btrfs_release_path(path);
1385        return ret;
1386}
1387
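    /*
     * Drop @refs_to_drop references from the data back ref item the path
     * currently points to.  The item is deleted and *last_ref is set once
     * the count reaches zero.
     */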
1388static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1389                                           struct btrfs_fs_info *fs_info,
1390                                           struct btrfs_path *path,
1391                                           int refs_to_drop, int *last_ref)
1392{
1393        struct btrfs_key key;
1394        struct btrfs_extent_data_ref *ref1 = NULL;
1395        struct btrfs_shared_data_ref *ref2 = NULL;
1396        struct extent_buffer *leaf;
1397        u32 num_refs = 0;
1398        int ret = 0;
1399
1400        leaf = path->nodes[0];
1401        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1402
1403        if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1404                ref1 = btrfs_item_ptr(leaf, path->slots[0],
1405                                      struct btrfs_extent_data_ref);
1406                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1407        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1408                ref2 = btrfs_item_ptr(leaf, path->slots[0],
1409                                      struct btrfs_shared_data_ref);
1410                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1411#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1412        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1413                struct btrfs_extent_ref_v0 *ref0;
1414                ref0 = btrfs_item_ptr(leaf, path->slots[0],
1415                                      struct btrfs_extent_ref_v0);
1416                num_refs = btrfs_ref_count_v0(leaf, ref0);
1417#endif
1418        } else {
1419                BUG();
1420        }
1421
1422        BUG_ON(num_refs < refs_to_drop);
1423        num_refs -= refs_to_drop;
1424
1425        if (num_refs == 0) {
1426                ret = btrfs_del_item(trans, fs_info->extent_root, path);
1427                *last_ref = 1;
1428        } else {
1429                if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1430                        btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1431                else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1432                        btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1433#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1434                else {
1435                        struct btrfs_extent_ref_v0 *ref0;
1436                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
1437                                        struct btrfs_extent_ref_v0);
1438                        btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1439                }
1440#endif
1441                btrfs_mark_buffer_dirty(leaf);
1442        }
1443        return ret;
1444}
1445
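    /*
     * Return the reference count of a data back ref, taken either from the
     * inline ref @iref or, when @iref is NULL, from the keyed item the path
     * points to.
     */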
1446static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1447                                          struct btrfs_extent_inline_ref *iref)
1448{
1449        struct btrfs_key key;
1450        struct extent_buffer *leaf;
1451        struct btrfs_extent_data_ref *ref1;
1452        struct btrfs_shared_data_ref *ref2;
1453        u32 num_refs = 0;
1454        int type;
1455
1456        leaf = path->nodes[0];
1457        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1458        if (iref) {
1459                /*
1460                 * If type is invalid, we should have bailed out earlier than
1461                 * this call.
1462                 */
1463                type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
1464                ASSERT(type != BTRFS_REF_TYPE_INVALID);
1465                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1466                        ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1467                        num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1468                } else {
1469                        ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1470                        num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1471                }
1472        } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1473                ref1 = btrfs_item_ptr(leaf, path->slots[0],
1474                                      struct btrfs_extent_data_ref);
1475                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1476        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1477                ref2 = btrfs_item_ptr(leaf, path->slots[0],
1478                                      struct btrfs_shared_data_ref);
1479                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1480#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1481        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1482                struct btrfs_extent_ref_v0 *ref0;
1483                ref0 = btrfs_item_ptr(leaf, path->slots[0],
1484                                      struct btrfs_extent_ref_v0);
1485                num_refs = btrfs_ref_count_v0(leaf, ref0);
1486#endif
1487        } else {
1488                WARN_ON(1);
1489        }
1490        return num_refs;
1491}
1492
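    /*
     * Look up a keyed tree block back ref: SHARED_BLOCK_REF keyed by @parent
     * when set, otherwise TREE_BLOCK_REF keyed by @root_objectid.  Returns 0
     * if the item exists and -ENOENT if it does not.
     */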
1493static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1494                                          struct btrfs_fs_info *fs_info,
1495                                          struct btrfs_path *path,
1496                                          u64 bytenr, u64 parent,
1497                                          u64 root_objectid)
1498{
1499        struct btrfs_root *root = fs_info->extent_root;
1500        struct btrfs_key key;
1501        int ret;
1502
1503        key.objectid = bytenr;
1504        if (parent) {
1505                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1506                key.offset = parent;
1507        } else {
1508                key.type = BTRFS_TREE_BLOCK_REF_KEY;
1509                key.offset = root_objectid;
1510        }
1511
1512        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1513        if (ret > 0)
1514                ret = -ENOENT;
1515#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1516        if (ret == -ENOENT && parent) {
1517                btrfs_release_path(path);
1518                key.type = BTRFS_EXTENT_REF_V0_KEY;
1519                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1520                if (ret > 0)
1521                        ret = -ENOENT;
1522        }
1523#endif
1524        return ret;
1525}
1526
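    /*
     * Insert a keyed tree block back ref item.  Tree block back refs carry
     * no payload (their count is implicitly 1), so the item size is zero.
     */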
1527static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1528                                          struct btrfs_fs_info *fs_info,
1529                                          struct btrfs_path *path,
1530                                          u64 bytenr, u64 parent,
1531                                          u64 root_objectid)
1532{
1533        struct btrfs_key key;
1534        int ret;
1535
1536        key.objectid = bytenr;
1537        if (parent) {
1538                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1539                key.offset = parent;
1540        } else {
1541                key.type = BTRFS_TREE_BLOCK_REF_KEY;
1542                key.offset = root_objectid;
1543        }
1544
1545        ret = btrfs_insert_empty_item(trans, fs_info->extent_root,
1546                                      path, &key, 0);
1547        btrfs_release_path(path);
1548        return ret;
1549}
1550
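    /*
     * Map (parent, owner) to a back ref key type: metadata owners (below
     * BTRFS_FIRST_FREE_OBJECTID) get the tree block ref types, data owners
     * get the data ref types, and a non-zero parent selects the shared
     * variant keyed by the parent block.  For example, a file extent
     * referenced directly by a subvolume tree uses BTRFS_EXTENT_DATA_REF_KEY,
     * while the same extent referenced via a shared leaf at bytenr @parent
     * uses BTRFS_SHARED_DATA_REF_KEY.
     */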
1551static inline int extent_ref_type(u64 parent, u64 owner)
1552{
1553        int type;
1554        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1555                if (parent > 0)
1556                        type = BTRFS_SHARED_BLOCK_REF_KEY;
1557                else
1558                        type = BTRFS_TREE_BLOCK_REF_KEY;
1559        } else {
1560                if (parent > 0)
1561                        type = BTRFS_SHARED_DATA_REF_KEY;
1562                else
1563                        type = BTRFS_EXTENT_DATA_REF_KEY;
1564        }
1565        return type;
1566}
1567
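    /*
     * Starting at @level, walk up the path and store in @key the key that
     * immediately follows the current position, i.e. the next key to the
     * right.  Returns 0 on success and 1 if the path already points at the
     * last key of the tree.
     */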
1568static int find_next_key(struct btrfs_path *path, int level,
1569                         struct btrfs_key *key)
1570
1571{
1572        for (; level < BTRFS_MAX_LEVEL; level++) {
1573                if (!path->nodes[level])
1574                        break;
1575                if (path->slots[level] + 1 >=
1576                    btrfs_header_nritems(path->nodes[level]))
1577                        continue;
1578                if (level == 0)
1579                        btrfs_item_key_to_cpu(path->nodes[level], key,
1580                                              path->slots[level] + 1);
1581                else
1582                        btrfs_node_key_to_cpu(path->nodes[level], key,
1583                                              path->slots[level] + 1);
1584                return 0;
1585        }
1586        return 1;
1587}
1588
1589/*
1590 * Look for an inline back ref.  If the back ref is found, *ref_ret is set
1591 * to the address of the inline back ref, and 0 is returned.
1592 *
1593 * If the back ref isn't found, *ref_ret is set to the address where it
1594 * should be inserted, and -ENOENT is returned.
1595 *
1596 * If insert is true and there are too many inline back refs, the path
1597 * points to the extent item, and -EAGAIN is returned.
1598 *
1599 * NOTE: inline back refs are ordered in the same way that back ref
1600 *       items in the tree are ordered.
1601 */
1602static noinline_for_stack
1603int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1604                                 struct btrfs_fs_info *fs_info,
1605                                 struct btrfs_path *path,
1606                                 struct btrfs_extent_inline_ref **ref_ret,
1607                                 u64 bytenr, u64 num_bytes,
1608                                 u64 parent, u64 root_objectid,
1609                                 u64 owner, u64 offset, int insert)
1610{
1611        struct btrfs_root *root = fs_info->extent_root;
1612        struct btrfs_key key;
1613        struct extent_buffer *leaf;
1614        struct btrfs_extent_item *ei;
1615        struct btrfs_extent_inline_ref *iref;
1616        u64 flags;
1617        u64 item_size;
1618        unsigned long ptr;
1619        unsigned long end;
1620        int extra_size;
1621        int type;
1622        int want;
1623        int ret;
1624        int err = 0;
1625        bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
1626        int needed;
1627
1628        key.objectid = bytenr;
1629        key.type = BTRFS_EXTENT_ITEM_KEY;
1630        key.offset = num_bytes;
1631
1632        want = extent_ref_type(parent, owner);
1633        if (insert) {
1634                extra_size = btrfs_extent_inline_ref_size(want);
1635                path->keep_locks = 1;
1636        } else
1637                extra_size = -1;
1638
1639        /*
1640         * Owner is our parent level, so we can just add one to get the level
1641         * for the block we are interested in.
1642         */
1643        if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1644                key.type = BTRFS_METADATA_ITEM_KEY;
1645                key.offset = owner;
1646        }
1647
1648again:
1649        ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1650        if (ret < 0) {
1651                err = ret;
1652                goto out;
1653        }
1654
1655        /*
1656         * We may be a newly converted file system which still has the old fat
1657         * extent entries for metadata, so try and see if we have one of those.
1658         */
1659        if (ret > 0 && skinny_metadata) {
1660                skinny_metadata = false;
1661                if (path->slots[0]) {
1662                        path->slots[0]--;
1663                        btrfs_item_key_to_cpu(path->nodes[0], &key,
1664                                              path->slots[0]);
1665                        if (key.objectid == bytenr &&
1666                            key.type == BTRFS_EXTENT_ITEM_KEY &&
1667                            key.offset == num_bytes)
1668                                ret = 0;
1669                }
1670                if (ret) {
1671                        key.objectid = bytenr;
1672                        key.type = BTRFS_EXTENT_ITEM_KEY;
1673                        key.offset = num_bytes;
1674                        btrfs_release_path(path);
1675                        goto again;
1676                }
1677        }
1678
1679        if (ret && !insert) {
1680                err = -ENOENT;
1681                goto out;
1682        } else if (WARN_ON(ret)) {
1683                err = -EIO;
1684                goto out;
1685        }
1686
1687        leaf = path->nodes[0];
1688        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1689#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1690        if (item_size < sizeof(*ei)) {
1691                if (!insert) {
1692                        err = -ENOENT;
1693                        goto out;
1694                }
1695                ret = convert_extent_item_v0(trans, fs_info, path, owner,
1696                                             extra_size);
1697                if (ret < 0) {
1698                        err = ret;
1699                        goto out;
1700                }
1701                leaf = path->nodes[0];
1702                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1703        }
1704#endif
1705        BUG_ON(item_size < sizeof(*ei));
1706
1707        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1708        flags = btrfs_extent_flags(leaf, ei);
1709
1710        ptr = (unsigned long)(ei + 1);
1711        end = (unsigned long)ei + item_size;
1712
1713        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1714                ptr += sizeof(struct btrfs_tree_block_info);
1715                BUG_ON(ptr > end);
1716        }
1717
1718        if (owner >= BTRFS_FIRST_FREE_OBJECTID)
1719                needed = BTRFS_REF_TYPE_DATA;
1720        else
1721                needed = BTRFS_REF_TYPE_BLOCK;
1722
1723        err = -ENOENT;
1724        while (1) {
1725                if (ptr >= end) {
1726                        WARN_ON(ptr > end);
1727                        break;
1728                }
1729                iref = (struct btrfs_extent_inline_ref *)ptr;
1730                type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
1731                if (type == BTRFS_REF_TYPE_INVALID) {
1732                        err = -EINVAL;
1733                        goto out;
1734                }
1735
1736                if (want < type)
1737                        break;
1738                if (want > type) {
1739                        ptr += btrfs_extent_inline_ref_size(type);
1740                        continue;
1741                }
1742
1743                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1744                        struct btrfs_extent_data_ref *dref;
1745                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1746                        if (match_extent_data_ref(leaf, dref, root_objectid,
1747                                                  owner, offset)) {
1748                                err = 0;
1749                                break;
1750                        }
1751                        if (hash_extent_data_ref_item(leaf, dref) <
1752                            hash_extent_data_ref(root_objectid, owner, offset))
1753                                break;
1754                } else {
1755                        u64 ref_offset;
1756                        ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1757                        if (parent > 0) {
1758                                if (parent == ref_offset) {
1759                                        err = 0;
1760                                        break;
1761                                }
1762                                if (ref_offset < parent)
1763                                        break;
1764                        } else {
1765                                if (root_objectid == ref_offset) {
1766                                        err = 0;
1767                                        break;
1768                                }
1769                                if (ref_offset < root_objectid)
1770                                        break;
1771                        }
1772                }
1773                ptr += btrfs_extent_inline_ref_size(type);
1774        }
1775        if (err == -ENOENT && insert) {
1776                if (item_size + extra_size >=
1777                    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1778                        err = -EAGAIN;
1779                        goto out;
1780                }
1781                /*
1782                 * To add a new inline back ref, we have to make sure
1783                 * there is no corresponding back ref item.
1784                 * For simplicity, we just do not add a new inline back
1785                 * ref if there is any kind of item for this block.
1786                 */
1787                if (find_next_key(path, 0, &key) == 0 &&
1788                    key.objectid == bytenr &&
1789                    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1790                        err = -EAGAIN;
1791                        goto out;
1792                }
1793        }
1794        *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1795out:
1796        if (insert) {
1797                path->keep_locks = 0;
1798                btrfs_unlock_up_safe(path, 1);
1799        }
1800        return err;
1801}
1802
1803/*
1804 * Helper to add a new inline back ref.
1805 */
1806static noinline_for_stack
1807void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
1808                                 struct btrfs_path *path,
1809                                 struct btrfs_extent_inline_ref *iref,
1810                                 u64 parent, u64 root_objectid,
1811                                 u64 owner, u64 offset, int refs_to_add,
1812                                 struct btrfs_delayed_extent_op *extent_op)
1813{
1814        struct extent_buffer *leaf;
1815        struct btrfs_extent_item *ei;
1816        unsigned long ptr;
1817        unsigned long end;
1818        unsigned long item_offset;
1819        u64 refs;
1820        int size;
1821        int type;
1822
1823        leaf = path->nodes[0];
1824        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1825        item_offset = (unsigned long)iref - (unsigned long)ei;
1826
1827        type = extent_ref_type(parent, owner);
1828        size = btrfs_extent_inline_ref_size(type);
1829
1830        btrfs_extend_item(fs_info, path, size);
1831
1832        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1833        refs = btrfs_extent_refs(leaf, ei);
1834        refs += refs_to_add;
1835        btrfs_set_extent_refs(leaf, ei, refs);
1836        if (extent_op)
1837                __run_delayed_extent_op(extent_op, leaf, ei);
1838
1839        ptr = (unsigned long)ei + item_offset;
1840        end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1841        if (ptr < end - size)
1842                memmove_extent_buffer(leaf, ptr + size, ptr,
1843                                      end - size - ptr);
1844
1845        iref = (struct btrfs_extent_inline_ref *)ptr;
1846        btrfs_set_extent_inline_ref_type(leaf, iref, type);
1847        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1848                struct btrfs_extent_data_ref *dref;
1849                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1850                btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1851                btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1852                btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1853                btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1854        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1855                struct btrfs_shared_data_ref *sref;
1856                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1857                btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1858                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1859        } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1860                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1861        } else {
1862                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1863        }
1864        btrfs_mark_buffer_dirty(leaf);
1865}
1866
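    /*
     * Look up any back ref for the given extent: the inline refs in the
     * extent item are tried first and, if none matches, a keyed back ref
     * item (tree block or data ref, depending on @owner) is looked up.
     * *ref_ret is only set when an inline ref is found.
     */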
1867static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1868                                 struct btrfs_fs_info *fs_info,
1869                                 struct btrfs_path *path,
1870                                 struct btrfs_extent_inline_ref **ref_ret,
1871                                 u64 bytenr, u64 num_bytes, u64 parent,
1872                                 u64 root_objectid, u64 owner, u64 offset)
1873{
1874        int ret;
1875
1876        ret = lookup_inline_extent_backref(trans, fs_info, path, ref_ret,
1877                                           bytenr, num_bytes, parent,
1878                                           root_objectid, owner, offset, 0);
1879        if (ret != -ENOENT)
1880                return ret;
1881
1882        btrfs_release_path(path);
1883        *ref_ret = NULL;
1884
1885        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1886                ret = lookup_tree_block_ref(trans, fs_info, path, bytenr,
1887                                            parent, root_objectid);
1888        } else {
1889                ret = lookup_extent_data_ref(trans, fs_info, path, bytenr,
1890                                             parent, root_objectid, owner,
1891                                             offset);
1892        }
1893        return ret;
1894}
1895
1896/*
1897 * Helper to update or remove an inline back ref.
1898 */
1899static noinline_for_stack
1900void update_inline_extent_backref(struct btrfs_fs_info *fs_info,
1901                                  struct btrfs_path *path,
1902                                  struct btrfs_extent_inline_ref *iref,
1903                                  int refs_to_mod,
1904                                  struct btrfs_delayed_extent_op *extent_op,
1905                                  int *last_ref)
1906{
1907        struct extent_buffer *leaf;
1908        struct btrfs_extent_item *ei;
1909        struct btrfs_extent_data_ref *dref = NULL;
1910        struct btrfs_shared_data_ref *sref = NULL;
1911        unsigned long ptr;
1912        unsigned long end;
1913        u32 item_size;
1914        int size;
1915        int type;
1916        u64 refs;
1917
1918        leaf = path->nodes[0];
1919        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1920        refs = btrfs_extent_refs(leaf, ei);
1921        WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1922        refs += refs_to_mod;
1923        btrfs_set_extent_refs(leaf, ei, refs);
1924        if (extent_op)
1925                __run_delayed_extent_op(extent_op, leaf, ei);
1926
1927        /*
1928         * If type is invalid, we should have bailed out after
1929         * lookup_inline_extent_backref().
1930         */
1931        type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1932        ASSERT(type != BTRFS_REF_TYPE_INVALID);
1933
1934        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1935                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1936                refs = btrfs_extent_data_ref_count(leaf, dref);
1937        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1938                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1939                refs = btrfs_shared_data_ref_count(leaf, sref);
1940        } else {
1941                refs = 1;
1942                BUG_ON(refs_to_mod != -1);
1943        }
1944
1945        BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1946        refs += refs_to_mod;
1947
1948        if (refs > 0) {
1949                if (type == BTRFS_EXTENT_DATA_REF_KEY)
1950                        btrfs_set_extent_data_ref_count(leaf, dref, refs);
1951                else
1952                        btrfs_set_shared_data_ref_count(leaf, sref, refs);
1953        } else {
1954                *last_ref = 1;
1955                size = btrfs_extent_inline_ref_size(type);
1956                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1957                ptr = (unsigned long)iref;
1958                end = (unsigned long)ei + item_size;
1959                if (ptr + size < end)
1960                        memmove_extent_buffer(leaf, ptr, ptr + size,
1961                                              end - ptr - size);
1962                item_size -= size;
1963                btrfs_truncate_item(fs_info, path, item_size, 1);
1964        }
1965        btrfs_mark_buffer_dirty(leaf);
1966}
1967
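    /*
     * Add @refs_to_add references as an inline back ref.  An existing
     * matching inline ref gets its count updated, otherwise a new one is
     * inserted if the extent item has room; -EAGAIN from the lookup (no room
     * for another inline ref) is passed back so the caller can fall back to
     * a keyed back ref item.
     */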
1968static noinline_for_stack
1969int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1970                                 struct btrfs_fs_info *fs_info,
1971                                 struct btrfs_path *path,
1972                                 u64 bytenr, u64 num_bytes, u64 parent,
1973                                 u64 root_objectid, u64 owner,
1974                                 u64 offset, int refs_to_add,
1975                                 struct btrfs_delayed_extent_op *extent_op)
1976{
1977        struct btrfs_extent_inline_ref *iref;
1978        int ret;
1979
1980        ret = lookup_inline_extent_backref(trans, fs_info, path, &iref,
1981                                           bytenr, num_bytes, parent,
1982                                           root_objectid, owner, offset, 1);
1983        if (ret == 0) {
1984                BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1985                update_inline_extent_backref(fs_info, path, iref,
1986                                             refs_to_add, extent_op, NULL);
1987        } else if (ret == -ENOENT) {
1988                setup_inline_extent_backref(fs_info, path, iref, parent,
1989                                            root_objectid, owner, offset,
1990                                            refs_to_add, extent_op);
1991                ret = 0;
1992        }
1993        return ret;
1994}
1995
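    /*
     * Insert a keyed (non-inline) back ref item, dispatching on @owner to the
     * tree block or data ref variant.
     */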
1996static int insert_extent_backref(struct btrfs_trans_handle *trans,
1997                                 struct btrfs_fs_info *fs_info,
1998                                 struct btrfs_path *path,
1999                                 u64 bytenr, u64 parent, u64 root_objectid,
2000                                 u64 owner, u64 offset, int refs_to_add)
2001{
2002        int ret;
2003        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2004                BUG_ON(refs_to_add != 1);
2005                ret = insert_tree_block_ref(trans, fs_info, path, bytenr,
2006                                            parent, root_objectid);
2007        } else {
2008                ret = insert_extent_data_ref(trans, fs_info, path, bytenr,
2009                                             parent, root_objectid,
2010                                             owner, offset, refs_to_add);
2011        }
2012        return ret;
2013}
2014
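    /*
     * Drop @refs_to_drop references from a back ref: update or remove the
     * inline ref when @iref is given, otherwise shrink a keyed data ref item
     * or delete the keyed tree block ref item outright.
     */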
2015static int remove_extent_backref(struct btrfs_trans_handle *trans,
2016                                 struct btrfs_fs_info *fs_info,
2017                                 struct btrfs_path *path,
2018                                 struct btrfs_extent_inline_ref *iref,
2019                                 int refs_to_drop, int is_data, int *last_ref)
2020{
2021        int ret = 0;
2022
2023        BUG_ON(!is_data && refs_to_drop != 1);
2024        if (iref) {
2025                update_inline_extent_backref(fs_info, path, iref,
2026                                             -refs_to_drop, NULL, last_ref);
2027        } else if (is_data) {
2028                ret = remove_extent_data_ref(trans, fs_info, path, refs_to_drop,
2029                                             last_ref);
2030        } else {
2031                *last_ref = 1;
2032                ret = btrfs_del_item(trans, fs_info->extent_root, path);
2033        }
2034        return ret;
2035}
2036
2037#define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
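    /*
     * Discard [start, start + len) on @bdev in 512 byte sectors (hence the
     * ">> 9" shifts below), carving out any range that overlaps one of the
     * superblock mirrors so they are never discarded.  The number of bytes
     * actually discarded is reported in *discarded_bytes.
     */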
2038static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
2039                               u64 *discarded_bytes)
2040{
2041        int j, ret = 0;
2042        u64 bytes_left, end;
2043        u64 aligned_start = ALIGN(start, 1 << 9);
2044
2045        if (WARN_ON(start != aligned_start)) {
2046                len -= aligned_start - start;
2047                len = round_down(len, 1 << 9);
2048                start = aligned_start;
2049        }
2050
2051        *discarded_bytes = 0;
2052
2053        if (!len)
2054                return 0;
2055
2056        end = start + len;
2057        bytes_left = len;
2058
2059        /* Skip any superblocks on this device. */
2060        for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
2061                u64 sb_start = btrfs_sb_offset(j);
2062                u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
2063                u64 size = sb_start - start;
2064
2065                if (!in_range(sb_start, start, bytes_left) &&
2066                    !in_range(sb_end, start, bytes_left) &&
2067                    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
2068                        continue;
2069
2070                /*
2071                 * Superblock spans beginning of range.  Adjust start and
2072                 * try again.
2073                 */
2074                if (sb_start <= start) {
2075                        start += sb_end - start;
2076                        if (start > end) {
2077                                bytes_left = 0;
2078                                break;
2079                        }
2080                        bytes_left = end - start;
2081                        continue;
2082                }
2083
2084                if (size) {
2085                        ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
2086                                                   GFP_NOFS, 0);
2087                        if (!ret)
2088                                *discarded_bytes += size;
2089                        else if (ret != -EOPNOTSUPP)
2090                                return ret;
2091                }
2092
2093                start = sb_end;
2094                if (start > end) {
2095                        bytes_left = 0;
2096                        break;
2097                }
2098                bytes_left = end - start;
2099        }
2100
2101        if (bytes_left) {
2102                ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
2103                                           GFP_NOFS, 0);
2104                if (!ret)
2105                        *discarded_bytes += bytes_left;
2106        }
2107        return ret;
2108}
2109
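    /*
     * Discard the physical stripes backing the logical range
     * [bytenr, bytenr + num_bytes).  Devices that do not support discard
     * are silently skipped; *actual_bytes, if supplied, is set to the
     * number of bytes that were really discarded.
     */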
2110int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
2111                         u64 num_bytes, u64 *actual_bytes)
2112{
2113        int ret;
2114        u64 discarded_bytes = 0;
2115        struct btrfs_bio *bbio = NULL;
2116
2117
2118        /*
2119         * Avoid races with device replace and make sure our bbio has devices
2120         * associated with its stripes that don't go away while we are discarding.
2121         */
2122        btrfs_bio_counter_inc_blocked(fs_info);
2123        /* Tell the block device(s) that the sectors can be discarded */
2124        ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
2125                              &bbio, 0);
2126        /* Error condition is -ENOMEM */
2127        if (!ret) {
2128                struct btrfs_bio_stripe *stripe = bbio->stripes;
2129                int i;
2130
2131
2132                for (i = 0; i < bbio->num_stripes; i++, stripe++) {
2133                        u64 bytes;
2134                        struct request_queue *req_q;
2135
2136                        if (!stripe->dev->bdev) {
2137                                ASSERT(btrfs_test_opt(fs_info, DEGRADED));
2138                                continue;
2139                        }
2140                        req_q = bdev_get_queue(stripe->dev->bdev);
2141                        if (!blk_queue_discard(req_q))
2142                                continue;
2143
2144                        ret = btrfs_issue_discard(stripe->dev->bdev,
2145                                                  stripe->physical,
2146                                                  stripe->length,
2147                                                  &bytes);
2148                        if (!ret)
2149                                discarded_bytes += bytes;
2150                        else if (ret != -EOPNOTSUPP)
2151                                break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
2152
2153                        /*
2154                         * Just in case we get back EOPNOTSUPP for some reason,
2155                         * ignore the return value so we don't screw up
2156                         * people calling discard_extent.
2157                         */
2158                        ret = 0;
2159                }
2160                btrfs_put_bbio(bbio);
2161        }
2162        btrfs_bio_counter_dec(fs_info);
2163
2164        if (actual_bytes)
2165                *actual_bytes = discarded_bytes;
2166
2167
2168        if (ret == -EOPNOTSUPP)
2169                ret = 0;
2170        return ret;
2171}
2172
2173/* Can return -ENOMEM */
2174int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2175                         struct btrfs_root *root,
2176                         u64 bytenr, u64 num_bytes, u64 parent,
2177                         u64 root_objectid, u64 owner, u64 offset)
2178{
2179        struct btrfs_fs_info *fs_info = root->fs_info;
2180        int old_ref_mod, new_ref_mod;
2181        int ret;
2182
2183        BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2184               root_objectid == BTRFS_TREE_LOG_OBJECTID);
2185
2186        btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
2187                           owner, offset, BTRFS_ADD_DELAYED_REF);
2188
2189        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2190                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
2191                                                 num_bytes, parent,
2192                                                 root_objectid, (int)owner,
2193                                                 BTRFS_ADD_DELAYED_REF, NULL,
2194                                                 &old_ref_mod, &new_ref_mod);
2195        } else {
2196                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
2197                                                 num_bytes, parent,
2198                                                 root_objectid, owner, offset,
2199                                                 0, BTRFS_ADD_DELAYED_REF,
2200                                                 &old_ref_mod, &new_ref_mod);
2201        }
2202
2203        if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
2204                add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid);
2205
2206        return ret;
2207}
2208
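    /*
     * Apply a delayed ref that adds @refs_to_add references to an existing
     * extent.  An inline back ref is tried first; on -EAGAIN (no room left in
     * the extent item) the item's ref count is bumped here and a keyed back
     * ref item is inserted instead.
     */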
2209static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2210                                  struct btrfs_fs_info *fs_info,
2211                                  struct btrfs_delayed_ref_node *node,
2212                                  u64 parent, u64 root_objectid,
2213                                  u64 owner, u64 offset, int refs_to_add,
2214                                  struct btrfs_delayed_extent_op *extent_op)
2215{
2216        struct btrfs_path *path;
2217        struct extent_buffer *leaf;
2218        struct btrfs_extent_item *item;
2219        struct btrfs_key key;
2220        u64 bytenr = node->bytenr;
2221        u64 num_bytes = node->num_bytes;
2222        u64 refs;
2223        int ret;
2224
2225        path = btrfs_alloc_path();
2226        if (!path)
2227                return -ENOMEM;
2228
2229        path->reada = READA_FORWARD;
2230        path->leave_spinning = 1;
2231        /* this will setup the path even if it fails to insert the back ref */
2232        ret = insert_inline_extent_backref(trans, fs_info, path, bytenr,
2233                                           num_bytes, parent, root_objectid,
2234                                           owner, offset,
2235                                           refs_to_add, extent_op);
2236        if ((ret < 0 && ret != -EAGAIN) || !ret)
2237                goto out;
2238
2239        /*
2240         * Ok we had -EAGAIN which means we didn't have space to insert an
2241         * inline extent ref, so just update the reference count and add a
2242         * normal backref.
2243         */
2244        leaf = path->nodes[0];
2245        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2246        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2247        refs = btrfs_extent_refs(leaf, item);
2248        btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2249        if (extent_op)
2250                __run_delayed_extent_op(extent_op, leaf, item);
2251
2252        btrfs_mark_buffer_dirty(leaf);
2253        btrfs_release_path(path);
2254
2255        path->reada = READA_FORWARD;
2256        path->leave_spinning = 1;
2257        /* now insert the actual backref */
2258        ret = insert_extent_backref(trans, fs_info, path, bytenr, parent,
2259                                    root_objectid, owner, offset, refs_to_add);
2260        if (ret)
2261                btrfs_abort_transaction(trans, ret);
2262out:
2263        btrfs_free_path(path);
2264        return ret;
2265}
2266
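    /*
     * Process a single delayed data ref: allocate the extent item when this
     * is the first insertion of a reserved extent, otherwise add or drop
     * references on the existing extent according to node->action.
     */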
2267static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2268                                struct btrfs_fs_info *fs_info,
2269                                struct btrfs_delayed_ref_node *node,
2270                                struct btrfs_delayed_extent_op *extent_op,
2271                                int insert_reserved)
2272{
2273        int ret = 0;
2274        struct btrfs_delayed_data_ref *ref;
2275        struct btrfs_key ins;
2276        u64 parent = 0;
2277        u64 ref_root = 0;
2278        u64 flags = 0;
2279
2280        ins.objectid = node->bytenr;
2281        ins.offset = node->num_bytes;
2282        ins.type = BTRFS_EXTENT_ITEM_KEY;
2283
2284        ref = btrfs_delayed_node_to_data_ref(node);
2285        trace_run_delayed_data_ref(fs_info, node, ref, node->action);
2286
2287        if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2288                parent = ref->parent;
2289        ref_root = ref->root;
2290
2291        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2292                if (extent_op)
2293                        flags |= extent_op->flags_to_set;
2294                ret = alloc_reserved_file_extent(trans, fs_info,
2295                                                 parent, ref_root, flags,
2296                                                 ref->objectid, ref->offset,
2297                                                 &ins, node->ref_mod);
2298        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2299                ret = __btrfs_inc_extent_ref(trans, fs_info, node, parent,
2300                                             ref_root, ref->objectid,
2301                                             ref->offset, node->ref_mod,
2302                                             extent_op);
2303        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2304                ret = __btrfs_free_extent(trans, fs_info, node, parent,
2305                                          ref_root, ref->objectid,
2306                                          ref->offset, node->ref_mod,
2307                                          extent_op);
2308        } else {
2309                BUG();
2310        }
2311        return ret;
2312}
2313
2314static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2315                                    struct extent_buffer *leaf,
2316                                    struct btrfs_extent_item *ei)
2317{
2318        u64 flags = btrfs_extent_flags(leaf, ei);
2319        if (extent_op->update_flags) {
2320                flags |= extent_op->flags_to_set;
2321                btrfs_set_extent_flags(leaf, ei, flags);
2322        }
2323
2324        if (extent_op->update_key) {
2325                struct btrfs_tree_block_info *bi;
2326                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2327                bi = (struct btrfs_tree_block_info *)(ei + 1);
2328                btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2329        }
2330}
2331
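    /*
     * Apply a pending extent op (flags and/or key update) to the on-disk
     * extent item for @head, looked up by METADATA_ITEM key when skinny
     * metadata is in use and by EXTENT_ITEM key otherwise.
     */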
2332static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2333                                 struct btrfs_fs_info *fs_info,
2334                                 struct btrfs_delayed_ref_head *head,
2335                                 struct btrfs_delayed_extent_op *extent_op)
2336{
2337        struct btrfs_key key;
2338        struct btrfs_path *path;
2339        struct btrfs_extent_item *ei;
2340        struct extent_buffer *leaf;
2341        u32 item_size;
2342        int ret;
2343        int err = 0;
2344        int metadata = !extent_op->is_data;
2345
2346        if (trans->aborted)
2347                return 0;
2348
2349        if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2350                metadata = 0;
2351
2352        path = btrfs_alloc_path();
2353        if (!path)
2354                return -ENOMEM;
2355
2356        key.objectid = head->bytenr;
2357
2358        if (metadata) {
2359                key.type = BTRFS_METADATA_ITEM_KEY;
2360                key.offset = extent_op->level;
2361        } else {
2362                key.type = BTRFS_EXTENT_ITEM_KEY;
2363                key.offset = head->num_bytes;
2364        }
2365
2366again:
2367        path->reada = READA_FORWARD;
2368        path->leave_spinning = 1;
2369        ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
2370        if (ret < 0) {
2371                err = ret;
2372                goto out;
2373        }
2374        if (ret > 0) {
2375                if (metadata) {
2376                        if (path->slots[0] > 0) {
2377                                path->slots[0]--;
2378                                btrfs_item_key_to_cpu(path->nodes[0], &key,
2379                                                      path->slots[0]);
2380                                if (key.objectid == head->bytenr &&
2381                                    key.type == BTRFS_EXTENT_ITEM_KEY &&
2382                                    key.offset == head->num_bytes)
2383                                        ret = 0;
2384                        }
2385                        if (ret > 0) {
2386                                btrfs_release_path(path);
2387                                metadata = 0;
2388
2389                                key.objectid = head->bytenr;
2390                                key.offset = head->num_bytes;
2391                                key.type = BTRFS_EXTENT_ITEM_KEY;
2392                                goto again;
2393                        }
2394                } else {
2395                        err = -EIO;
2396                        goto out;
2397                }
2398        }
2399
2400        leaf = path->nodes[0];
2401        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2402#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2403        if (item_size < sizeof(*ei)) {
2404                ret = convert_extent_item_v0(trans, fs_info, path, (u64)-1, 0);
2405                if (ret < 0) {
2406                        err = ret;
2407                        goto out;
2408                }
2409                leaf = path->nodes[0];
2410                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2411        }
2412#endif
2413        BUG_ON(item_size < sizeof(*ei));
2414        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2415        __run_delayed_extent_op(extent_op, leaf, ei);
2416
2417        btrfs_mark_buffer_dirty(leaf);
2418out:
2419        btrfs_free_path(path);
2420        return err;
2421}
2422
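    /*
     * Process a single delayed tree block ref.  A btree block carries exactly
     * one reference per back ref, so a ref_mod other than 1 is reported as
     * corruption and fails with -EIO.
     */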
2423static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2424                                struct btrfs_fs_info *fs_info,
2425                                struct btrfs_delayed_ref_node *node,
2426                                struct btrfs_delayed_extent_op *extent_op,
2427                                int insert_reserved)
2428{
2429        int ret = 0;
2430        struct btrfs_delayed_tree_ref *ref;
2431        struct btrfs_key ins;
2432        u64 parent = 0;
2433        u64 ref_root = 0;
2434        bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
2435
2436        ref = btrfs_delayed_node_to_tree_ref(node);
2437        trace_run_delayed_tree_ref(fs_info, node, ref, node->action);
2438
2439        if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2440                parent = ref->parent;
2441        ref_root = ref->root;
2442
2443        ins.objectid = node->bytenr;
2444        if (skinny_metadata) {
2445                ins.offset = ref->level;
2446                ins.type = BTRFS_METADATA_ITEM_KEY;
2447        } else {
2448                ins.offset = node->num_bytes;
2449                ins.type = BTRFS_EXTENT_ITEM_KEY;
2450        }
2451
2452        if (node->ref_mod != 1) {
2453                btrfs_err(fs_info,
2454        "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
2455                          node->bytenr, node->ref_mod, node->action, ref_root,
2456                          parent);
2457                return -EIO;
2458        }
2459        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2460                BUG_ON(!extent_op || !extent_op->update_flags);
2461                ret = alloc_reserved_tree_block(trans, fs_info,
2462                                                parent, ref_root,
2463                                                extent_op->flags_to_set,
2464                                                &extent_op->key,
2465                                                ref->level, &ins);
2466        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2467                ret = __btrfs_inc_extent_ref(trans, fs_info, node,
2468                                             parent, ref_root,
2469                                             ref->level, 0, 1,
2470                                             extent_op);
2471        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2472                ret = __btrfs_free_extent(trans, fs_info, node,
2473                                          parent, ref_root,
2474                                          ref->level, 0, 1, extent_op);
2475        } else {
2476                BUG();
2477        }
2478        return ret;
2479}
2480
2481/* helper function to actually process a single delayed ref entry */
2482static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2483                               struct btrfs_fs_info *fs_info,
2484                               struct btrfs_delayed_ref_node *node,
2485                               struct btrfs_delayed_extent_op *extent_op,
2486                               int insert_reserved)
2487{
2488        int ret = 0;
2489
2490        if (trans->aborted) {
2491                if (insert_reserved)
2492                        btrfs_pin_extent(fs_info, node->bytenr,
2493                                         node->num_bytes, 1);
2494                return 0;
2495        }
2496
2497        if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2498            node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2499                ret = run_delayed_tree_ref(trans, fs_info, node, extent_op,
2500                                           insert_reserved);
2501        else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2502                 node->type == BTRFS_SHARED_DATA_REF_KEY)
2503                ret = run_delayed_data_ref(trans, fs_info, node, extent_op,
2504                                           insert_reserved);
2505        else
2506                BUG();
2507        return ret;
2508}
2509
2510static inline struct btrfs_delayed_ref_node *
2511select_delayed_ref(struct btrfs_delayed_ref_head *head)
2512{
2513        struct btrfs_delayed_ref_node *ref;
2514
2515        if (RB_EMPTY_ROOT(&head->ref_tree))
2516                return NULL;
2517
2518        /*
2519         * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2520         * This is to prevent a ref count from going down to zero, which deletes
2521         * the extent item from the extent tree, when there are still references
2522         * to add, which would fail because they would not find the extent item.
2523         */
2524        if (!list_empty(&head->ref_add_list))
2525                return list_first_entry(&head->ref_add_list,
2526                                struct btrfs_delayed_ref_node, add_list);
2527
2528        ref = rb_entry(rb_first(&head->ref_tree),
2529                       struct btrfs_delayed_ref_node, ref_node);
2530        ASSERT(list_empty(&ref->add_list));
2531        return ref;
2532}
2533
2534static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
2535                                      struct btrfs_delayed_ref_head *head)
2536{
2537        spin_lock(&delayed_refs->lock);
2538        head->processing = 0;
2539        delayed_refs->num_heads_ready++;
2540        spin_unlock(&delayed_refs->lock);
2541        btrfs_delayed_ref_unlock(head);
2542}
2543
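    /*
     * Run and free any extent op attached to @head.  Returns 0 when there is
     * nothing to run (no op, or the op can simply be dropped because the
     * reserved extent has yet to be inserted), a negative error, or 1 when
     * the op was run and head->lock was dropped to do so.
     */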
2544static int cleanup_extent_op(struct btrfs_trans_handle *trans,
2545                             struct btrfs_fs_info *fs_info,
2546                             struct btrfs_delayed_ref_head *head)
2547{
2548        struct btrfs_delayed_extent_op *extent_op = head->extent_op;
2549        int ret;
2550
2551        if (!extent_op)
2552                return 0;
2553        head->extent_op = NULL;
2554        if (head->must_insert_reserved) {
2555                btrfs_free_delayed_extent_op(extent_op);
2556                return 0;
2557        }
2558        spin_unlock(&head->lock);
2559        ret = run_delayed_extent_op(trans, fs_info, head, extent_op);
2560        btrfs_free_delayed_extent_op(extent_op);
2561        return ret ? ret : 1;
2562}
2563
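    /*
     * Final processing of a ref head whose individual refs have all been run:
     * flush any remaining extent op, re-check under the delayed ref lock that
     * no new refs were added (return 1 so the caller retries if they were),
     * unlink the head from the rbtree, update pinned/csum accounting, and
     * release the head's qgroup reservation.
     */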
2564static int cleanup_ref_head(struct btrfs_trans_handle *trans,
2565                            struct btrfs_fs_info *fs_info,
2566                            struct btrfs_delayed_ref_head *head)
2567{
2568        struct btrfs_delayed_ref_root *delayed_refs;
2569        int ret;
2570
2571        delayed_refs = &trans->transaction->delayed_refs;
2572
2573        ret = cleanup_extent_op(trans, fs_info, head);
2574        if (ret < 0) {
2575                unselect_delayed_ref_head(delayed_refs, head);
2576                btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2577                return ret;
2578        } else if (ret) {
2579                return ret;
2580        }
2581
2582        /*
2583         * Need to drop our head ref lock and re-acquire the delayed ref lock
2584         * and then re-check to make sure nobody got added.
2585         */
2586        spin_unlock(&head->lock);
2587        spin_lock(&delayed_refs->lock);
2588        spin_lock(&head->lock);
2589        if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) {
2590                spin_unlock(&head->lock);
2591                spin_unlock(&delayed_refs->lock);
2592                return 1;
2593        }
2594        delayed_refs->num_heads--;
2595        rb_erase(&head->href_node, &delayed_refs->href_root);
2596        RB_CLEAR_NODE(&head->href_node);
2597        spin_unlock(&delayed_refs->lock);
2598        spin_unlock(&head->lock);
2599        atomic_dec(&delayed_refs->num_entries);
2600
2601        trace_run_delayed_ref_head(fs_info, head, 0);
2602
2603        if (head->total_ref_mod < 0) {
2604                struct btrfs_space_info *space_info;
2605                u64 flags;
2606
2607                if (head->is_data)
2608                        flags = BTRFS_BLOCK_GROUP_DATA;
2609                else if (head->is_system)
2610                        flags = BTRFS_BLOCK_GROUP_SYSTEM;
2611                else
2612                        flags = BTRFS_BLOCK_GROUP_METADATA;
2613                space_info = __find_space_info(fs_info, flags);
2614                ASSERT(space_info);
2615                percpu_counter_add(&space_info->total_bytes_pinned,
2616                                   -head->num_bytes);
2617
2618                if (head->is_data) {
2619                        spin_lock(&delayed_refs->lock);
2620                        delayed_refs->pending_csums -= head->num_bytes;
2621                        spin_unlock(&delayed_refs->lock);
2622                }
2623        }
2624
2625        if (head->must_insert_reserved) {
2626                btrfs_pin_extent(fs_info, head->bytenr,
2627                                 head->num_bytes, 1);
2628                if (head->is_data) {
2629                        ret = btrfs_del_csums(trans, fs_info, head->bytenr,
2630                                              head->num_bytes);
2631                }
2632        }
2633
2634        /* Also free its reserved qgroup space */
2635        btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
2636                                      head->qgroup_reserved);
2637        btrfs_delayed_ref_unlock(head);
2638        btrfs_put_delayed_ref_head(head);
2639        return 0;
2640}
2641
2642/*
2643 * Returns 0 on success or if called with an already aborted transaction.
2644 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2645 */
2646static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2647                                             unsigned long nr)
2648{
2649        struct btrfs_fs_info *fs_info = trans->fs_info;
2650        struct btrfs_delayed_ref_root *delayed_refs;
2651        struct btrfs_delayed_ref_node *ref;
2652        struct btrfs_delayed_ref_head *locked_ref = NULL;
2653        struct btrfs_delayed_extent_op *extent_op;
2654        ktime_t start = ktime_get();
2655        int ret;
2656        unsigned long count = 0;
2657        unsigned long actual_count = 0;
2658        int must_insert_reserved = 0;
2659
2660        delayed_refs = &trans->transaction->delayed_refs;
2661        while (1) {
2662                if (!locked_ref) {
2663                        if (count >= nr)
2664                                break;
2665
2666                        spin_lock(&delayed_refs->lock);
2667                        locked_ref = btrfs_select_ref_head(trans);
2668                        if (!locked_ref) {
2669                                spin_unlock(&delayed_refs->lock);
2670                                break;
2671                        }
2672
2673                        /* grab the lock that says we are going to process
2674                         * all the refs for this head */
2675                        ret = btrfs_delayed_ref_lock(trans, locked_ref);
2676                        spin_unlock(&delayed_refs->lock);
2677                        /*
2678                         * we may have dropped the spin lock to get the head
2679                         * mutex lock, and that might have given someone else
2680                         * time to free the head.  If that's true, it has been
2681                         * removed from our list and we can move on.
2682                         */
2683                        if (ret == -EAGAIN) {
2684                                locked_ref = NULL;
2685                                count++;
2686                                continue;
2687                        }
2688                }
2689
2690                /*
2691                 * We need to try and merge add/drops of the same ref since we
2692                 * can run into issues with relocate dropping the implicit ref
2693                 * and then it being added back again before the drop can
2694                 * finish.  If we merged anything we need to re-loop so we can
2695                 * get a good ref.
2696                 * Or we can get node references of the same type that weren't
2697                 * merged when created due to bumps in the tree mod seq, and
2698                 * we need to merge them to prevent adding an inline extent
2699                 * backref before dropping it (triggering a BUG_ON at
2700                 * insert_inline_extent_backref()).
2701                 */
2702                spin_lock(&locked_ref->lock);
2703                btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2704                                         locked_ref);
2705
2706                /*
2707                 * locked_ref is the head node, so we have to go one
2708                 * node back for any delayed ref updates
2709                 */
2710                ref = select_delayed_ref(locked_ref);
2711
2712                if (ref && ref->seq &&
2713                    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2714                        spin_unlock(&locked_ref->lock);
2715                        unselect_delayed_ref_head(delayed_refs, locked_ref);
2716                        locked_ref = NULL;
2717                        cond_resched();
2718                        count++;
2719                        continue;
2720                }
2721
2722                /*
2723                 * We're done processing refs in this ref_head, clean everything
2724                 * up and move on to the next ref_head.
2725                 */
2726                if (!ref) {
2727                        ret = cleanup_ref_head(trans, fs_info, locked_ref);
2728                        if (ret > 0) {
2729                                /* We dropped our lock, we need to loop. */
2730                                ret = 0;
2731                                continue;
2732                        } else if (ret) {
2733                                return ret;
2734                        }
2735                        locked_ref = NULL;
2736                        count++;
2737                        continue;
2738                }
2739
2740                actual_count++;
2741                ref->in_tree = 0;
2742                rb_erase(&ref->ref_node, &locked_ref->ref_tree);
2743                RB_CLEAR_NODE(&ref->ref_node);
2744                if (!list_empty(&ref->add_list))
2745                        list_del(&ref->add_list);
2746                /*
2747                 * When we play the delayed ref, also correct the ref_mod on
2748                 * head
2749                 */
2750                switch (ref->action) {
2751                case BTRFS_ADD_DELAYED_REF:
2752                case BTRFS_ADD_DELAYED_EXTENT:
2753                        locked_ref->ref_mod -= ref->ref_mod;
2754                        break;
2755                case BTRFS_DROP_DELAYED_REF:
2756                        locked_ref->ref_mod += ref->ref_mod;
2757                        break;
2758                default:
2759                        WARN_ON(1);
2760                }
2761                atomic_dec(&delayed_refs->num_entries);
2762
2763                /*
2764                 * Record the must_insert_reserved flag before we drop the spin
2765                 * lock.
2766                 */
2767                must_insert_reserved = locked_ref->must_insert_reserved;
2768                locked_ref->must_insert_reserved = 0;
2769
2770                extent_op = locked_ref->extent_op;
2771                locked_ref->extent_op = NULL;
2772                spin_unlock(&locked_ref->lock);
2773
2774                ret = run_one_delayed_ref(trans, fs_info, ref, extent_op,
2775                                          must_insert_reserved);
2776
2777                btrfs_free_delayed_extent_op(extent_op);
2778                if (ret) {
2779                        unselect_delayed_ref_head(delayed_refs, locked_ref);
2780                        btrfs_put_delayed_ref(ref);
2781                        btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2782                                    ret);
2783                        return ret;
2784                }
2785
2786                btrfs_put_delayed_ref(ref);
2787                count++;
2788                cond_resched();
2789        }
2790
2791        /*
2792         * We don't want to include ref heads since we can have empty ref heads
2793         * and those would drastically skew our runtime down, as for them we
2794         * only do accounting and no actual extent tree updates.
2795         */
2796        if (actual_count > 0) {
2797                u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2798                u64 avg;
2799
2800                /*
2801                 * We weigh the current average higher than our current runtime
2802                 * to avoid large swings in the average.
2803                 */
2804                spin_lock(&delayed_refs->lock);
2805                avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2806                fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2807                spin_unlock(&delayed_refs->lock);
2808        }
2809        return 0;
2810}
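
    /*
     * Editor's note: the tail of __btrfs_run_delayed_refs() above maintains a
     * 3:1 weighted (exponential moving) average of how long a run takes.  A
     * standalone sketch of that arithmetic, using a hypothetical helper name
     * and made-up nanosecond values:
     */
    static u64 example_update_avg_ref_runtime(u64 old_avg_ns, u64 sample_ns)
    {
            /* illustrative only: weigh the old average 3x against the sample */
            return (old_avg_ns * 3 + sample_ns) >> 2;       /* divide by 4 */
    }
    /*
     * e.g. old_avg_ns = 100000 and sample_ns = 500000 yields 200000, so one
     * unusually slow run only moves the average a quarter of the way.
     */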
2811
2812#ifdef SCRAMBLE_DELAYED_REFS
2813/*
2814 * Normally delayed refs get processed in ascending bytenr order. This
2815 * correlates in most cases to the order added. To expose dependencies on this
2816 * order, we start to process the tree in the middle instead of the beginning
2817 */
2818static u64 find_middle(struct rb_root *root)
2819{
2820        struct rb_node *n = root->rb_node;
2821        struct btrfs_delayed_ref_node *entry;
2822        int alt = 1;
2823        u64 middle;
2824        u64 first = 0, last = 0;
2825
2826        n = rb_first(root);
2827        if (n) {
2828                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2829                first = entry->bytenr;
2830        }
2831        n = rb_last(root);
2832        if (n) {
2833                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2834                last = entry->bytenr;
2835        }
2836        n = root->rb_node;
2837
2838        while (n) {
2839                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2840                WARN_ON(!entry->in_tree);
2841
2842                middle = entry->bytenr;
2843
2844                if (alt)
2845                        n = n->rb_left;
2846                else
2847                        n = n->rb_right;
2848
2849                alt = 1 - alt;
2850        }
2851        return middle;
2852}
2853#endif
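
    /*
     * Editor's note (illustrative): find_middle() remembers the bytenr of the
     * last node visited while alternating left/right on the way down from the
     * rb root, which lands roughly in the middle of [first, last].  With
     * SCRAMBLE_DELAYED_REFS defined, btrfs_run_delayed_refs() below seeds
     * run_delayed_start with that value so processing starts mid-tree and any
     * hidden dependency on ascending-bytenr ordering gets exposed.
     */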
2854
2855static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
2856{
2857        u64 num_bytes;
2858
2859        num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2860                             sizeof(struct btrfs_extent_inline_ref));
2861        if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2862                num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2863
2864        /*
2865         * We don't ever fill up leaves all the way, so the caller doubles
2866         * the resulting byte estimate to get closer to real usage.
2867         */
2868        return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
2869}
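
    /*
     * Editor's note (rough numbers, not from the source): with skinny metadata
     * a head costs sizeof(btrfs_extent_item) + sizeof(btrfs_extent_inline_ref),
     * roughly 24 + 9 = 33 bytes, so with a ~16KiB leaf data area about 1000
     * heads come out as heads_to_leaves() == 2.
     */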
2870
2871/*
2872 * Takes the number of bytes to be checksummed and figures out how many leaves it
2873 * would require to store the csums for that many bytes.
2874 */
2875u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
2876{
2877        u64 csum_size;
2878        u64 num_csums_per_leaf;
2879        u64 num_csums;
2880
2881        csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
2882        num_csums_per_leaf = div64_u64(csum_size,
2883                        (u64)btrfs_super_csum_size(fs_info->super_copy));
2884        num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
2885        num_csums += num_csums_per_leaf - 1;
2886        num_csums = div64_u64(num_csums, num_csums_per_leaf);
2887        return num_csums;
2888}
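
    /*
     * Editor's illustrative sketch (not part of the original source): the
     * helper above is a round-up division.  Assuming 4-byte crc32c csums, a
     * ~16KiB max item size and a 4KiB sectorsize, one leaf holds ~4000 csums
     * and so covers ~16MiB of data, meaning 1GiB of csum_bytes needs about 65
     * leaves.  The same shape in plain C, with a hypothetical name:
     */
    static u64 example_round_up_div(u64 items, u64 items_per_leaf)
    {
            /* illustrative only */
            return (items + items_per_leaf - 1) / items_per_leaf;
    }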
2889
2890int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2891                                       struct btrfs_fs_info *fs_info)
2892{
2893        struct btrfs_block_rsv *global_rsv;
2894        u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2895        u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2896        unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs;
2897        u64 num_bytes, num_dirty_bgs_bytes;
2898        int ret = 0;
2899
2900        num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
2901        num_heads = heads_to_leaves(fs_info, num_heads);
2902        if (num_heads > 1)
2903                num_bytes += (num_heads - 1) * fs_info->nodesize;
2904        num_bytes <<= 1;
2905        num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
2906                                                        fs_info->nodesize;
2907        num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
2908                                                             num_dirty_bgs);
2909        global_rsv = &fs_info->global_block_rsv;
2910
2911        /*
2912         * If we can't allocate any more chunks, let's make sure we have _lots_ of
2913         * wiggle room since running delayed refs can create more delayed refs.
2914         */
2915        if (global_rsv->space_info->full) {
2916                num_dirty_bgs_bytes <<= 1;
2917                num_bytes <<= 1;
2918        }
2919
2920        spin_lock(&global_rsv->lock);
2921        if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2922                ret = 1;
2923        spin_unlock(&global_rsv->lock);
2924        return ret;
2925}
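
    /*
     * Editor's note (illustrative): the check above boils down to
     *
     *    need  = 2 * (one trans-metadata unit + (leaves(num_heads) - 1) * nodesize)
     *            + csum_leaves(pending_csums) * nodesize
     *    dirty = trans-metadata units for num_dirty_bgs block groups
     *
     * with both terms doubled again when the metadata space_info is full, and
     * it returns 1 once the global reserve no longer covers need + dirty.
     */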
2926
2927int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2928                                       struct btrfs_fs_info *fs_info)
2929{
2930        u64 num_entries =
2931                atomic_read(&trans->transaction->delayed_refs.num_entries);
2932        u64 avg_runtime;
2933        u64 val;
2934
2935        smp_mb();
2936        avg_runtime = fs_info->avg_delayed_ref_runtime;
2937        val = num_entries * avg_runtime;
2938        if (val >= NSEC_PER_SEC)
2939                return 1;
2940        if (val >= NSEC_PER_SEC / 2)
2941                return 2;
2942
2943        return btrfs_check_space_for_delayed_refs(trans, fs_info);
2944}
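
    /*
     * Editor's note (made-up numbers): the estimate above is simply
     * num_entries * avg_delayed_ref_runtime.  For example, 50,000 queued
     * entries at an average of 20us each estimate to a full second and return
     * 1; at 10us each the half-second estimate returns 2; below that we fall
     * through to the free-space check.
     */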
2945
2946struct async_delayed_refs {
2947        struct btrfs_root *root;
2948        u64 transid;
2949        int count;
2950        int error;
2951        int sync;
2952        struct completion wait;
2953        struct btrfs_work work;
2954};
2955
2956static inline struct async_delayed_refs *
2957to_async_delayed_refs(struct btrfs_work *work)
2958{
2959        return container_of(work, struct async_delayed_refs, work);
2960}
2961
2962static void delayed_ref_async_start(struct btrfs_work *work)
2963{
2964        struct async_delayed_refs *async = to_async_delayed_refs(work);
2965        struct btrfs_trans_handle *trans;
2966        struct btrfs_fs_info *fs_info = async->root->fs_info;
2967        int ret;
2968
2969        /* if the commit is already started, we don't need to wait here */
2970        if (btrfs_transaction_blocked(fs_info))
2971                goto done;
2972
2973        trans = btrfs_join_transaction(async->root);
2974        if (IS_ERR(trans)) {
2975                async->error = PTR_ERR(trans);
2976                goto done;
2977        }
2978
2979        /*
2980         * trans->sync means that when we call end_transaction, we won't
2981         * wait on delayed refs
2982         */
2983        trans->sync = true;
2984
2985        /* Don't bother flushing if we got into a different transaction */
2986        if (trans->transid > async->transid)
2987                goto end;
2988
2989        ret = btrfs_run_delayed_refs(trans, async->count);
2990        if (ret)
2991                async->error = ret;
2992end:
2993        ret = btrfs_end_transaction(trans);
2994        if (ret && !async->error)
2995                async->error = ret;
2996done:
2997        if (async->sync)
2998                complete(&async->wait);
2999        else
3000                kfree(async);
3001}
3002
3003int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
3004                                 unsigned long count, u64 transid, int wait)
3005{
3006        struct async_delayed_refs *async;
3007        int ret;
3008
3009        async = kmalloc(sizeof(*async), GFP_NOFS);
3010        if (!async)
3011                return -ENOMEM;
3012
3013        async->root = fs_info->tree_root;
3014        async->count = count;
3015        async->error = 0;
3016        async->transid = transid;
3017        if (wait)
3018                async->sync = 1;
3019        else
3020                async->sync = 0;
3021        init_completion(&async->wait);
3022
3023        btrfs_init_work(&async->work, btrfs_extent_refs_helper,
3024                        delayed_ref_async_start, NULL, NULL);
3025
3026        btrfs_queue_work(fs_info->extent_workers, &async->work);
3027
3028        if (wait) {
3029                wait_for_completion(&async->wait);
3030                ret = async->error;
3031                kfree(async);
3032                return ret;
3033        }
3034        return 0;
3035}
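
    /*
     * Editor's note: the async struct above follows a simple ownership rule -
     * in the wait case the caller reaps async->error and frees it after the
     * completion fires, otherwise delayed_ref_async_start() frees it itself
     * once the work item is done.
     */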
3036
3037/*
3038 * this starts processing the delayed reference count updates and
3039 * extent insertions we have queued up so far.  count can be
3040 * 0, which means to process everything in the tree at the start
3041 * of the run (but not newly added entries), or it can be some target
3042 * number you'd like to process.
3043 *
3044 * Returns 0 on success or if called with an aborted transaction
3045 * Returns <0 on error and aborts the transaction
3046 */
3047int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
3048                           unsigned long count)
3049{
3050        struct btrfs_fs_info *fs_info = trans->fs_info;
3051        struct rb_node *node;
3052        struct btrfs_delayed_ref_root *delayed_refs;
3053        struct btrfs_delayed_ref_head *head;
3054        int ret;
3055        int run_all = count == (unsigned long)-1;
3056        bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
3057
3058        /* We'll clean this up in btrfs_cleanup_transaction */
3059        if (trans->aborted)
3060                return 0;
3061
3062        if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
3063                return 0;
3064
3065        delayed_refs = &trans->transaction->delayed_refs;
3066        if (count == 0)
3067                count = atomic_read(&delayed_refs->num_entries) * 2;
3068
3069again:
3070#ifdef SCRAMBLE_DELAYED_REFS
3071        delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
3072#endif
3073        trans->can_flush_pending_bgs = false;
3074        ret = __btrfs_run_delayed_refs(trans, count);
3075        if (ret < 0) {
3076                btrfs_abort_transaction(trans, ret);
3077                return ret;
3078        }
3079
3080        if (run_all) {
3081                if (!list_empty(&trans->new_bgs))
3082                        btrfs_create_pending_block_groups(trans);
3083
3084                spin_lock(&delayed_refs->lock);
3085                node = rb_first(&delayed_refs->href_root);
3086                if (!node) {
3087                        spin_unlock(&delayed_refs->lock);
3088                        goto out;
3089                }
3090                head = rb_entry(node, struct btrfs_delayed_ref_head,
3091                                href_node);
3092                refcount_inc(&head->refs);
3093                spin_unlock(&delayed_refs->lock);
3094
3095                /* Mutex was contended, block until it's released and retry. */
3096                mutex_lock(&head->mutex);
3097                mutex_unlock(&head->mutex);
3098
3099                btrfs_put_delayed_ref_head(head);
3100                cond_resched();
3101                goto again;
3102        }
3103out:
3104        trans->can_flush_pending_bgs = can_flush_pending_bgs;
3105        return 0;
3106}
3107
3108int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
3109                                struct btrfs_fs_info *fs_info,
3110                                u64 bytenr, u64 num_bytes, u64 flags,
3111                                int level, int is_data)
3112{
3113        struct btrfs_delayed_extent_op *extent_op;
3114        int ret;
3115
3116        extent_op = btrfs_alloc_delayed_extent_op();
3117        if (!extent_op)
3118                return -ENOMEM;
3119
3120        extent_op->flags_to_set = flags;
3121        extent_op->update_flags = true;
3122        extent_op->update_key = false;
3123        extent_op->is_data = is_data ? true : false;
3124        extent_op->level = level;
3125
3126        ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
3127                                          num_bytes, extent_op);
3128        if (ret)
3129                btrfs_free_delayed_extent_op(extent_op);
3130        return ret;
3131}
3132
3133static noinline int check_delayed_ref(struct btrfs_root *root,
3134                                      struct btrfs_path *path,
3135                                      u64 objectid, u64 offset, u64 bytenr)
3136{
3137        struct btrfs_delayed_ref_head *head;
3138        struct btrfs_delayed_ref_node *ref;
3139        struct btrfs_delayed_data_ref *data_ref;
3140        struct btrfs_delayed_ref_root *delayed_refs;
3141        struct btrfs_transaction *cur_trans;
3142        struct rb_node *node;
3143        int ret = 0;
3144
3145        spin_lock(&root->fs_info->trans_lock);
3146        cur_trans = root->fs_info->running_transaction;
3147        if (cur_trans)
3148                refcount_inc(&cur_trans->use_count);
3149        spin_unlock(&root->fs_info->trans_lock);
3150        if (!cur_trans)
3151                return 0;
3152
3153        delayed_refs = &cur_trans->delayed_refs;
3154        spin_lock(&delayed_refs->lock);
3155        head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3156        if (!head) {
3157                spin_unlock(&delayed_refs->lock);
3158                btrfs_put_transaction(cur_trans);
3159                return 0;
3160        }
3161
3162        if (!mutex_trylock(&head->mutex)) {
3163                refcount_inc(&head->refs);
3164                spin_unlock(&delayed_refs->lock);
3165
3166                btrfs_release_path(path);
3167
3168                /*
3169                 * Mutex was contended, block until it's released and let
3170                 * caller try again
3171                 */
3172                mutex_lock(&head->mutex);
3173                mutex_unlock(&head->mutex);
3174                btrfs_put_delayed_ref_head(head);
3175                btrfs_put_transaction(cur_trans);
3176                return -EAGAIN;
3177        }
3178        spin_unlock(&delayed_refs->lock);
3179
3180        spin_lock(&head->lock);
3181        /*
3182         * XXX: We should replace this with a proper search function in the
3183         * future.
3184         */
3185        for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
3186                ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
3187                /* If it's a shared ref we know a cross reference exists */
3188                if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
3189                        ret = 1;
3190                        break;
3191                }
3192
3193                data_ref = btrfs_delayed_node_to_data_ref(ref);
3194
3195                /*
3196                 * If our ref doesn't match the one we're currently looking at
3197                 * then we have a cross reference.
3198                 */
3199                if (data_ref->root != root->root_key.objectid ||
3200                    data_ref->objectid != objectid ||
3201                    data_ref->offset != offset) {
3202                        ret = 1;
3203                        break;
3204                }
3205        }
3206        spin_unlock(&head->lock);
3207        mutex_unlock(&head->mutex);
3208        btrfs_put_transaction(cur_trans);
3209        return ret;
3210}
3211
3212static noinline int check_committed_ref(struct btrfs_root *root,
3213                                        struct btrfs_path *path,
3214                                        u64 objectid, u64 offset, u64 bytenr)
3215{
3216        struct btrfs_fs_info *fs_info = root->fs_info;
3217        struct btrfs_root *extent_root = fs_info->extent_root;
3218        struct extent_buffer *leaf;
3219        struct btrfs_extent_data_ref *ref;
3220        struct btrfs_extent_inline_ref *iref;
3221        struct btrfs_extent_item *ei;
3222        struct btrfs_key key;
3223        u32 item_size;
3224        int type;
3225        int ret;
3226
3227        key.objectid = bytenr;
3228        key.offset = (u64)-1;
3229        key.type = BTRFS_EXTENT_ITEM_KEY;
3230
3231        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3232        if (ret < 0)
3233                goto out;
3234        BUG_ON(ret == 0); /* Corruption */
3235
3236        ret = -ENOENT;
3237        if (path->slots[0] == 0)
3238                goto out;
3239
3240        path->slots[0]--;
3241        leaf = path->nodes[0];
3242        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3243
3244        if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3245                goto out;
3246
3247        ret = 1;
3248        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3249#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
3250        if (item_size < sizeof(*ei)) {
3251                WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3252                goto out;
3253        }
3254#endif
3255        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
3256
3257        if (item_size != sizeof(*ei) +
3258            btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3259                goto out;
3260
3261        if (btrfs_extent_generation(leaf, ei) <=
3262            btrfs_root_last_snapshot(&root->root_item))
3263                goto out;
3264
3265        iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3266
3267        type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
3268        if (type != BTRFS_EXTENT_DATA_REF_KEY)
3269                goto out;
3270
3271        ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3272        if (btrfs_extent_refs(leaf, ei) !=
3273            btrfs_extent_data_ref_count(leaf, ref) ||
3274            btrfs_extent_data_ref_root(leaf, ref) !=
3275            root->root_key.objectid ||
3276            btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3277            btrfs_extent_data_ref_offset(leaf, ref) != offset)
3278                goto out;
3279
3280        ret = 0;
3281out:
3282        return ret;
3283}
3284
3285int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
3286                          u64 bytenr)
3287{
3288        struct btrfs_path *path;
3289        int ret;
3290        int ret2;
3291
3292        path = btrfs_alloc_path();
3293        if (!path)
3294                return -ENOENT;
3295
3296        do {
3297                ret = check_committed_ref(root, path, objectid,
3298                                          offset, bytenr);
3299                if (ret && ret != -ENOENT)
3300                        goto out;
3301
3302                ret2 = check_delayed_ref(root, path, objectid,
3303                                         offset, bytenr);
3304        } while (ret2 == -EAGAIN);
3305
3306        if (ret2 && ret2 != -ENOENT) {
3307                ret = ret2;
3308                goto out;
3309        }
3310
3311        if (ret != -ENOENT || ret2 != -ENOENT)
3312                ret = 0;
3313out:
3314        btrfs_free_path(path);
3315        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3316                WARN_ON(ret > 0);
3317        return ret;
3318}
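
    /*
     * Editor's note: for the NOCOW callers of btrfs_cross_ref_exist() a
     * non-zero return (including errors) means "a cross reference may exist,
     * don't write in place", while 0 means the extent is referenced only by
     * the given root/objectid/offset in both the committed extent tree and
     * the current delayed refs.
     */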
3319
3320static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3321                           struct btrfs_root *root,
3322                           struct extent_buffer *buf,
3323                           int full_backref, int inc)
3324{
3325        struct btrfs_fs_info *fs_info = root->fs_info;
3326        u64 bytenr;
3327        u64 num_bytes;
3328        u64 parent;
3329        u64 ref_root;
3330        u32 nritems;
3331        struct btrfs_key key;
3332        struct btrfs_file_extent_item *fi;
3333        int i;
3334        int level;
3335        int ret = 0;
3336        int (*process_func)(struct btrfs_trans_handle *,
3337                            struct btrfs_root *,
3338                            u64, u64, u64, u64, u64, u64);
3339
3341        if (btrfs_is_testing(fs_info))
3342                return 0;
3343
3344        ref_root = btrfs_header_owner(buf);
3345        nritems = btrfs_header_nritems(buf);
3346        level = btrfs_header_level(buf);
3347
3348        if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3349                return 0;
3350
3351        if (inc)
3352                process_func = btrfs_inc_extent_ref;
3353        else
3354                process_func = btrfs_free_extent;
3355
3356        if (full_backref)
3357                parent = buf->start;
3358        else
3359                parent = 0;
3360
3361        for (i = 0; i < nritems; i++) {
3362                if (level == 0) {
3363                        btrfs_item_key_to_cpu(buf, &key, i);
3364                        if (key.type != BTRFS_EXTENT_DATA_KEY)
3365                                continue;
3366                        fi = btrfs_item_ptr(buf, i,
3367                                            struct btrfs_file_extent_item);
3368                        if (btrfs_file_extent_type(buf, fi) ==
3369                            BTRFS_FILE_EXTENT_INLINE)
3370                                continue;
3371                        bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3372                        if (bytenr == 0)
3373                                continue;
3374
3375                        num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3376                        key.offset -= btrfs_file_extent_offset(buf, fi);
3377                        ret = process_func(trans, root, bytenr, num_bytes,
3378                                           parent, ref_root, key.objectid,
3379                                           key.offset);
3380                        if (ret)
3381                                goto fail;
3382                } else {
3383                        bytenr = btrfs_node_blockptr(buf, i);
3384                        num_bytes = fs_info->nodesize;
3385                        ret = process_func(trans, root, bytenr, num_bytes,
3386                                           parent, ref_root, level - 1, 0);
3387                        if (ret)
3388                                goto fail;
3389                }
3390        }
3391        return 0;
3392fail:
3393        return ret;
3394}
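
    /*
     * Editor's note: __btrfs_mod_ref() walks every slot of the buffer - file
     * extent items in leaves, block pointers in nodes - and adds (inc) or
     * drops (!inc) one reference per referenced extent.  With full_backref the
     * references are parented to the buffer itself (shared backrefs), which is
     * what the two wrappers below expose as btrfs_inc_ref()/btrfs_dec_ref().
     */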
3395
3396int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3397                  struct extent_buffer *buf, int full_backref)
3398{
3399        return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
3400}
3401
3402int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3403                  struct extent_buffer *buf, int full_backref)
3404{
3405        return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
3406}
3407
3408static int write_one_cache_group(struct btrfs_trans_handle *trans,
3409                                 struct btrfs_fs_info *fs_info,
3410                                 struct btrfs_path *path,
3411                                 struct btrfs_block_group_cache *cache)
3412{
3413        int ret;
3414        struct btrfs_root *extent_root = fs_info->extent_root;
3415        unsigned long bi;
3416        struct extent_buffer *leaf;
3417
3418        ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3419        if (ret) {
3420                if (ret > 0)
3421                        ret = -ENOENT;
3422                goto fail;
3423        }
3424
3425        leaf = path->nodes[0];
3426        bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3427        write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3428        btrfs_mark_buffer_dirty(leaf);
3429fail:
3430        btrfs_release_path(path);
3431        return ret;
3433}
3434
3435static struct btrfs_block_group_cache *
3436next_block_group(struct btrfs_fs_info *fs_info,
3437                 struct btrfs_block_group_cache *cache)
3438{
3439        struct rb_node *node;
3440
3441        spin_lock(&fs_info->block_group_cache_lock);
3442
3443        /* If our block group was removed, we need a full search. */
3444        if (RB_EMPTY_NODE(&cache->cache_node)) {
3445                const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3446
3447                spin_unlock(&fs_info->block_group_cache_lock);
3448                btrfs_put_block_group(cache);
3449                cache = btrfs_lookup_first_block_group(fs_info, next_bytenr);
                    return cache;
3450        }
3451        node = rb_next(&cache->cache_node);
3452        btrfs_put_block_group(cache);
3453        if (node) {
3454                cache = rb_entry(node, struct btrfs_block_group_cache,
3455                                 cache_node);
3456                btrfs_get_block_group(cache);
3457        } else
3458                cache = NULL;
3459        spin_unlock(&fs_info->block_group_cache_lock);
3460        return cache;
3461}
3462
3463static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3464                            struct btrfs_trans_handle *trans,
3465                            struct btrfs_path *path)
3466{
3467        struct btrfs_fs_info *fs_info = block_group->fs_info;
3468        struct btrfs_root *root = fs_info->tree_root;
3469        struct inode *inode = NULL;
3470        struct extent_changeset *data_reserved = NULL;
3471        u64 alloc_hint = 0;
3472        int dcs = BTRFS_DC_ERROR;
3473        u64 num_pages = 0;
3474        int retries = 0;
3475        int ret = 0;
3476
3477        /*
3478         * If this block group is smaller than 100 megs, don't bother caching
3479         * the block group.
3480         */
3481        if (block_group->key.offset < (100 * SZ_1M)) {
3482                spin_lock(&block_group->lock);
3483                block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3484                spin_unlock(&block_group->lock);
3485                return 0;
3486        }
3487
3488        if (trans->aborted)
3489                return 0;
3490again:
3491        inode = lookup_free_space_inode(fs_info, block_group, path);
3492        if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3493                ret = PTR_ERR(inode);
3494                btrfs_release_path(path);
3495                goto out;
3496        }
3497
3498        if (IS_ERR(inode)) {
3499                BUG_ON(retries);
3500                retries++;
3501
3502                if (block_group->ro)
3503                        goto out_free;
3504
3505                ret = create_free_space_inode(fs_info, trans, block_group,
3506                                              path);
3507                if (ret)
3508                        goto out_free;
3509                goto again;
3510        }
3511
3512        /*
3513         * We want to set the generation to 0, that way if anything goes wrong
3514         * from here on out we know not to trust this cache when we load up next
3515         * time.
3516         */
3517        BTRFS_I(inode)->generation = 0;
3518        ret = btrfs_update_inode(trans, root, inode);
3519        if (ret) {
3520                /*
3521                 * So theoretically we could recover from this, simply set the
3522                 * super cache generation to 0 so we know to invalidate the
3523                 * cache, but then we'd have to keep track of the block groups
3524                 * that fail this way so we know we _have_ to reset this cache
3525                 * before the next commit or risk reading stale cache.  So to
3526                 * limit our exposure to horrible edge cases, let's just abort the
3527                 * transaction; this only happens in really bad situations
3528                 * anyway.
3529                 */
3530                btrfs_abort_transaction(trans, ret);
3531                goto out_put;
3532        }
3533        WARN_ON(ret);
3534
3535        /* We've already setup this transaction, go ahead and exit */
3536        if (block_group->cache_generation == trans->transid &&
3537            i_size_read(inode)) {
3538                dcs = BTRFS_DC_SETUP;
3539                goto out_put;
3540        }
3541
3542        if (i_size_read(inode) > 0) {
3543                ret = btrfs_check_trunc_cache_free_space(fs_info,
3544                                        &fs_info->global_block_rsv);
3545                if (ret)
3546                        goto out_put;
3547
3548                ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3549                if (ret)
3550                        goto out_put;
3551        }
3552
3553        spin_lock(&block_group->lock);
3554        if (block_group->cached != BTRFS_CACHE_FINISHED ||
3555            !btrfs_test_opt(fs_info, SPACE_CACHE)) {
3556                /*
3557                 * don't bother trying to write stuff out _if_
3558                 * a) we're not cached,
3559                 * b) we're mounted with the nospace_cache option,
3560                 * c) we're using the v2 space cache (FREE_SPACE_TREE).
3561                 */
3562                dcs = BTRFS_DC_WRITTEN;
3563                spin_unlock(&block_group->lock);
3564                goto out_put;
3565        }
3566        spin_unlock(&block_group->lock);
3567
3568        /*
3569         * We hit an ENOSPC when setting up the cache in this transaction, just
3570         * skip doing the setup, we've already cleared the cache so we're safe.
3571         */
3572        if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3573                ret = -ENOSPC;
3574                goto out_put;
3575        }
3576
3577        /*
3578         * Try to preallocate enough space based on how big the block group is.
3579         * Keep in mind this has to include any pinned space which could end up
3580         * taking up quite a bit since it's not folded into the other space
3581         * cache.
3582         */
3583        num_pages = div_u64(block_group->key.offset, SZ_256M);
3584        if (!num_pages)
3585                num_pages = 1;
3586
3587        num_pages *= 16;
3588        num_pages *= PAGE_SIZE;
3589
3590        ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
3591        if (ret)
3592                goto out_put;
3593
3594        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3595                                              num_pages, num_pages,
3596                                              &alloc_hint);
3597        /*
3598         * Our cache requires contiguous chunks so that we don't modify a bunch
3599         * of metadata or split extents when writing the cache out, which means
3600         * we can hit ENOSPC if we are heavily fragmented in addition to just normal
3601         * out of space conditions.  So if we hit this just skip setting up any
3602         * other block groups for this transaction, maybe we'll unpin enough
3603         * space the next time around.
3604         */
3605        if (!ret)
3606                dcs = BTRFS_DC_SETUP;
3607        else if (ret == -ENOSPC)
3608                set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3609
3610out_put:
3611        iput(inode);
3612out_free:
3613        btrfs_release_path(path);
3614out:
3615        spin_lock(&block_group->lock);
3616        if (!ret && dcs == BTRFS_DC_SETUP)
3617                block_group->cache_generation = trans->transid;
3618        block_group->disk_cache_state = dcs;
3619        spin_unlock(&block_group->lock);
3620
3621        extent_changeset_free(data_reserved);
3622        return ret;
3623}
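
    /*
     * Editor's illustrative sketch (not part of the original source): the
     * preallocation sizing in cache_save_setup() is 16 pages per 256MiB of
     * block group, with a minimum of 16 pages.  Standalone, with a
     * hypothetical name and assuming a 4KiB page size:
     */
    static u64 example_cache_prealloc_bytes(u64 block_group_size)
    {
            u64 num_pages = block_group_size / (256ULL * 1024 * 1024);

            if (!num_pages)
                    num_pages = 1;
            /* a 1GiB block group gets 4 * 16 * 4096 = 256KiB of cache space */
            return num_pages * 16 * 4096;
    }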
3624
3625int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3626                            struct btrfs_fs_info *fs_info)
3627{
3628        struct btrfs_block_group_cache *cache, *tmp;
3629        struct btrfs_transaction *cur_trans = trans->transaction;
3630        struct btrfs_path *path;
3631
3632        if (list_empty(&cur_trans->dirty_bgs) ||
3633            !btrfs_test_opt(fs_info, SPACE_CACHE))
3634                return 0;
3635
3636        path = btrfs_alloc_path();
3637        if (!path)
3638                return -ENOMEM;
3639
3640        /* Could add new block groups, use _safe just in case */
3641        list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3642                                 dirty_list) {
3643                if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3644                        cache_save_setup(cache, trans, path);
3645        }
3646
3647        btrfs_free_path(path);
3648        return 0;
3649}
3650
3651/*
3652 * transaction commit does final block group cache writeback during a
3653 * critical section where nothing is allowed to change the FS.  This is
3654 * required in order for the cache to actually match the block group,
3655 * but can introduce a lot of latency into the commit.
3656 *
3657 * So, btrfs_start_dirty_block_groups is here to kick off block group
3658 * cache IO.  There's a chance we'll have to redo some of it if the
3659 * block group changes again during the commit, but it greatly reduces
3660 * the commit latency by getting rid of the easy block groups while
3661 * we're still allowing others to join the commit.
3662 */
3663int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
3664{
3665        struct btrfs_fs_info *fs_info = trans->fs_info;
3666        struct btrfs_block_group_cache *cache;
3667        struct btrfs_transaction *cur_trans = trans->transaction;
3668        int ret = 0;
3669        int should_put;
3670        struct btrfs_path *path = NULL;
3671        LIST_HEAD(dirty);
3672        struct list_head *io = &cur_trans->io_bgs;
3673        int num_started = 0;
3674        int loops = 0;
3675
3676        spin_lock(&cur_trans->dirty_bgs_lock);
3677        if (list_empty(&cur_trans->dirty_bgs)) {
3678                spin_unlock(&cur_trans->dirty_bgs_lock);
3679                return 0;
3680        }
3681        list_splice_init(&cur_trans->dirty_bgs, &dirty);
3682        spin_unlock(&cur_trans->dirty_bgs_lock);
3683
3684again:
3685        /*
3686         * make sure all the block groups on our dirty list actually
3687         * exist
3688         */
3689        btrfs_create_pending_block_groups(trans);
3690
3691        if (!path) {
3692                path = btrfs_alloc_path();
3693                if (!path)
3694                        return -ENOMEM;
3695        }
3696
3697        /*
3698         * cache_write_mutex is here only to protect us against balance or the
3699         * automatic removal of empty block groups deleting this block group
3700         * while we are writing out the cache.
3701         */
3702        mutex_lock(&trans->transaction->cache_write_mutex);
3703        while (!list_empty(&dirty)) {
3704                cache = list_first_entry(&dirty,
3705                                         struct btrfs_block_group_cache,
3706                                         dirty_list);
3707                /*
3708                 * this can happen if something re-dirties a block
3709                 * group that is already under IO.  Just wait for it to
3710                 * finish and then do it all again
3711                 */
3712                if (!list_empty(&cache->io_list)) {
3713                        list_del_init(&cache->io_list);
3714                        btrfs_wait_cache_io(trans, cache, path);
3715                        btrfs_put_block_group(cache);
3716                }
3717
3719                /*
3720                 * btrfs_wait_cache_io uses the cache->dirty_list to decide
3721                 * if it should update the cache_state.  Don't delete
3722                 * until after we wait.
3723                 *
3724                 * Since we're not running in the commit critical section
3725                 * we need the dirty_bgs_lock to protect from update_block_group
3726                 */
3727                spin_lock(&cur_trans->dirty_bgs_lock);
3728                list_del_init(&cache->dirty_list);
3729                spin_unlock(&cur_trans->dirty_bgs_lock);
3730
3731                should_put = 1;
3732
3733                cache_save_setup(cache, trans, path);
3734
3735                if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3736                        cache->io_ctl.inode = NULL;
3737                        ret = btrfs_write_out_cache(fs_info, trans,
3738                                                    cache, path);
3739                        if (ret == 0 && cache->io_ctl.inode) {
3740                                num_started++;
3741                                should_put = 0;
3742
3743                                /*
3744                                 * The cache_write_mutex protects the
3745                                 * io_list; also see the definition of
3746                                 * btrfs_transaction::io_bgs for more details.
3747                                 */
3748                                list_add_tail(&cache->io_list, io);
3749                        } else {
3750                                /*
3751                                 * if we failed to write the cache, the
3752                                 * generation will be bad and life goes on
3753                                 */
3754                                ret = 0;
3755                        }
3756                }
3757                if (!ret) {
3758                        ret = write_one_cache_group(trans, fs_info,
3759                                                    path, cache);
3760                        /*
3761                         * Our block group might still be attached to the list
3762                         * of new block groups in the transaction handle of some
3763                         * other task (struct btrfs_trans_handle->new_bgs). This
3764                         * means its block group item isn't yet in the extent
3765                         * tree. If this happens ignore the error, as we will
3766                         * try again later in the critical section of the
3767                         * transaction commit.
3768                         */
3769                        if (ret == -ENOENT) {
3770                                ret = 0;
3771                                spin_lock(&cur_trans->dirty_bgs_lock);
3772                                if (list_empty(&cache->dirty_list)) {
3773                                        list_add_tail(&cache->dirty_list,
3774                                                      &cur_trans->dirty_bgs);
3775                                        btrfs_get_block_group(cache);
3776                                }
3777                                spin_unlock(&cur_trans->dirty_bgs_lock);
3778                        } else if (ret) {
3779                                btrfs_abort_transaction(trans, ret);
3780                        }
3781                }
3782
3783                /* if it's not on the io list, we need to put the block group */
3784                if (should_put)
3785                        btrfs_put_block_group(cache);
3786
3787                if (ret)
3788                        break;
3789
3790                /*
3791                 * Avoid blocking other tasks for too long. It might even save
3792                 * us from writing caches for block groups that are going to be
3793                 * removed.
3794                 */
3795                mutex_unlock(&trans->transaction->cache_write_mutex);
3796                mutex_lock(&trans->transaction->cache_write_mutex);
3797        }
3798        mutex_unlock(&trans->transaction->cache_write_mutex);
3799
3800        /*
3801         * go through delayed refs for all the stuff we've just kicked off
3802         * and then loop back (just once)
3803         */
3804        ret = btrfs_run_delayed_refs(trans, 0);
3805        if (!ret && loops == 0) {
3806                loops++;
3807                spin_lock(&cur_trans->dirty_bgs_lock);
3808                list_splice_init(&cur_trans->dirty_bgs, &dirty);
3809                /*
3810                 * dirty_bgs_lock protects us from concurrent block group
3811                 * deletes too (not just cache_write_mutex).
3812                 */
3813                if (!list_empty(&dirty)) {
3814                        spin_unlock(&cur_trans->dirty_bgs_lock);
3815                        goto again;
3816                }
3817                spin_unlock(&cur_trans->dirty_bgs_lock);
3818        } else if (ret < 0) {
3819                btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3820        }
3821
3822        btrfs_free_path(path);
3823        return ret;
3824}
3825
3826int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3827                                   struct btrfs_fs_info *fs_info)
3828{
3829        struct btrfs_block_group_cache *cache;
3830        struct btrfs_transaction *cur_trans = trans->transaction;
3831        int ret = 0;
3832        int should_put;
3833        struct btrfs_path *path;
3834        struct list_head *io = &cur_trans->io_bgs;
3835        int num_started = 0;
3836
3837        path = btrfs_alloc_path();
3838        if (!path)
3839                return -ENOMEM;
3840
3841        /*
3842         * Even though we are in the critical section of the transaction commit,
3843         * we can still have concurrent tasks adding elements to this
3844         * transaction's list of dirty block groups. These tasks correspond to
3845         * endio free space workers started when writeback finishes for a
3846         * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3847         * allocate new block groups as a result of COWing nodes of the root
3848         * tree when updating the free space inode. The writeback for the space
3849         * caches is triggered by an earlier call to
3850         * btrfs_start_dirty_block_groups() and iterations of the following
3851         * loop.
3852         * Also we want to do the cache_save_setup first and then run the
3853         * delayed refs to make sure we have the best chance at doing this all
3854         * in one shot.
3855         */
3856        spin_lock(&cur_trans->dirty_bgs_lock);
3857        while (!list_empty(&cur_trans->dirty_bgs)) {
3858                cache = list_first_entry(&cur_trans->dirty_bgs,
3859                                         struct btrfs_block_group_cache,
3860                                         dirty_list);
3861
3862                /*
3863                 * this can happen if cache_save_setup re-dirties a block
3864                 * group that is already under IO.  Just wait for it to
3865                 * finish and then do it all again
3866                 */
3867                if (!list_empty(&cache->io_list)) {
3868                        spin_unlock(&cur_trans->dirty_bgs_lock);
3869                        list_del_init(&cache->io_list);
3870                        btrfs_wait_cache_io(trans, cache, path);
3871                        btrfs_put_block_group(cache);
3872                        spin_lock(&cur_trans->dirty_bgs_lock);
3873                }
3874
3875                /*
3876                 * don't remove from the dirty list until after we've waited
3877                 * on any pending IO
3878                 */
3879                list_del_init(&cache->dirty_list);
3880                spin_unlock(&cur_trans->dirty_bgs_lock);
3881                should_put = 1;
3882
3883                cache_save_setup(cache, trans, path);
3884
3885                if (!ret)
3886                        ret = btrfs_run_delayed_refs(trans,
3887                                                     (unsigned long) -1);
3888
3889                if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3890                        cache->io_ctl.inode = NULL;
3891                        ret = btrfs_write_out_cache(fs_info, trans,
3892                                                    cache, path);
3893                        if (ret == 0 && cache->io_ctl.inode) {
3894                                num_started++;
3895                                should_put = 0;
3896                                list_add_tail(&cache->io_list, io);
3897                        } else {
3898                                /*
3899                                 * if we failed to write the cache, the
3900                                 * generation will be bad and life goes on
3901                                 */
3902                                ret = 0;
3903                        }
3904                }
3905                if (!ret) {
3906                        ret = write_one_cache_group(trans, fs_info,
3907                                                    path, cache);
3908                        /*
3909                         * One of the free space endio workers might have
3910                         * created a new block group while updating a free space
3911                         * cache's inode (at inode.c:btrfs_finish_ordered_io())
3912                         * and hasn't released its transaction handle yet, in
3913                         * which case the new block group is still attached to
3914                         * its transaction handle and its creation has not
3915                         * finished yet (no block group item in the extent tree
3916                         * yet, etc). If this is the case, wait for all free
3917                         * space endio workers to finish and retry. This is a
3918                         * very rare case so no need for a more efficient and
3919                         * complex approach.
3920                         */
3921                        if (ret == -ENOENT) {
3922                                wait_event(cur_trans->writer_wait,
3923                                   atomic_read(&cur_trans->num_writers) == 1);
3924                                ret = write_one_cache_group(trans, fs_info,
3925                                                            path, cache);
3926                        }
3927                        if (ret)
3928                                btrfs_abort_transaction(trans, ret);
3929                }
3930
3931                /* if it's not on the io list, we need to put the block group */
3932                if (should_put)
3933                        btrfs_put_block_group(cache);
3934                spin_lock(&cur_trans->dirty_bgs_lock);
3935        }
3936        spin_unlock(&cur_trans->dirty_bgs_lock);
3937
3938        /*
3939         * Refer to the definition of io_bgs member for details why it's safe
3940         * to use it without any locking
3941         */
3942        while (!list_empty(io)) {
3943                cache = list_first_entry(io, struct btrfs_block_group_cache,
3944                                         io_list);
3945                list_del_init(&cache->io_list);
3946                btrfs_wait_cache_io(trans, cache, path);
3947                btrfs_put_block_group(cache);
3948        }
3949
3950        btrfs_free_path(path);
3951        return ret;
3952}
3953
3954int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
3955{
3956        struct btrfs_block_group_cache *block_group;
3957        int readonly = 0;
3958
3959        block_group = btrfs_lookup_block_group(fs_info, bytenr);
3960        if (!block_group || block_group->ro)
3961                readonly = 1;
3962        if (block_group)
3963                btrfs_put_block_group(block_group);
3964        return readonly;
3965}
3966
3967bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3968{
3969        struct btrfs_block_group_cache *bg;
3970        bool ret = true;
3971
3972        bg = btrfs_lookup_block_group(fs_info, bytenr);
3973        if (!bg)
3974                return false;
3975
3976        spin_lock(&bg->lock);
3977        if (bg->ro)
3978                ret = false;
3979        else
3980                atomic_inc(&bg->nocow_writers);
3981        spin_unlock(&bg->lock);
3982
3983        /* no put on block group, done by btrfs_dec_nocow_writers */
3984        if (!ret)
3985                btrfs_put_block_group(bg);
3986
3987        return ret;
3989}
3990
3991void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3992{
3993        struct btrfs_block_group_cache *bg;
3994
3995        bg = btrfs_lookup_block_group(fs_info, bytenr);
3996        ASSERT(bg);
3997        if (atomic_dec_and_test(&bg->nocow_writers))
3998                wake_up_var(&bg->nocow_writers);
3999        /*
4000         * Once for our lookup and once for the lookup done by a previous call
4001         * to btrfs_inc_nocow_writers()
4002         */
4003        btrfs_put_block_group(bg);
4004        btrfs_put_block_group(bg);
4005}
4006
4007void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
4008{
4009        wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
4010}
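
    /*
     * Editor's note: btrfs_inc_nocow_writers() / btrfs_dec_nocow_writers()
     * bracket in-place (NOCOW) writes into a block group, and the inc fails
     * once the group has been marked read-only.  btrfs_wait_nocow_writers()
     * lets code that is about to depend on the group being read-only (e.g.
     * relocation) wait for writers already in flight to finish.
     */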
4011
4012static const char *alloc_name(u64 flags)
4013{
4014        switch (flags) {
4015        case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
4016                return "mixed";
4017        case BTRFS_BLOCK_GROUP_METADATA:
4018                return "metadata";
4019        case BTRFS_BLOCK_GROUP_DATA:
4020                return "data";
4021        case BTRFS_BLOCK_GROUP_SYSTEM:
4022                return "system";
4023        default:
4024                WARN_ON(1);
4025                return "invalid-combination";
4026        }
4027}
4028
4029static int create_space_info(struct btrfs_fs_info *info, u64 flags,
4030                             struct btrfs_space_info **new)
4031{
4033        struct btrfs_space_info *space_info;
4034        int i;
4035        int ret;
4036
4037        space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
4038        if (!space_info)
4039                return -ENOMEM;
4040
4041        ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
4042                                 GFP_KERNEL);
4043        if (ret) {
4044                kfree(space_info);
4045                return ret;
4046        }
4047
4048        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
4049                INIT_LIST_HEAD(&space_info->block_groups[i]);
4050        init_rwsem(&space_info->groups_sem);
4051        spin_lock_init(&space_info->lock);
4052        space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
4053        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4054        init_waitqueue_head(&space_info->wait);
4055        INIT_LIST_HEAD(&space_info->ro_bgs);
4056        INIT_LIST_HEAD(&space_info->tickets);
4057        INIT_LIST_HEAD(&space_info->priority_tickets);
4058
4059        ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
4060                                    info->space_info_kobj, "%s",
4061                                    alloc_name(space_info->flags));
4062        if (ret) {
4063                percpu_counter_destroy(&space_info->total_bytes_pinned);
4064                kfree(space_info);
4065                return ret;
4066        }
4067
4068        *new = space_info;
4069        list_add_rcu(&space_info->list, &info->space_info);
4070        if (flags & BTRFS_BLOCK_GROUP_DATA)
4071                info->data_sinfo = space_info;
4072
4073        return ret;
4074}
4075
4076static void update_space_info(struct btrfs_fs_info *info, u64 flags,
4077                             u64 total_bytes, u64 bytes_used,
4078                             u64 bytes_readonly,
4079                             struct btrfs_space_info **space_info)
4080{
4081        struct btrfs_space_info *found;
4082        int factor;
4083
4084        if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
4085                     BTRFS_BLOCK_GROUP_RAID10))
4086                factor = 2;
4087        else
4088                factor = 1;
4089
4090        found = __find_space_info(info, flags);
4091        ASSERT(found);
4092        spin_lock(&found->lock);
4093        found->total_bytes += total_bytes;
4094        found->disk_total += total_bytes * factor;
4095        found->bytes_used += bytes_used;
4096        found->disk_used += bytes_used * factor;
4097        found->bytes_readonly += bytes_readonly;
4098        if (total_bytes > 0)
4099                found->full = 0;
4100        space_info_add_new_bytes(info, found, total_bytes -
4101                                 bytes_used - bytes_readonly);
4102        spin_unlock(&found->lock);
4103        *space_info = found;
4104}
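/*
 * Worked example (illustrative only): adding a 1GiB RAID1 block group with
 * 256MiB already used gives factor == 2, so the space_info above ends up with
 * total_bytes += 1GiB, disk_total += 2GiB, bytes_used += 256MiB and
 * disk_used += 512MiB; the logical and on-disk numbers differ only by the
 * profile factor.
 */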
4105
4106static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
4107{
4108        u64 extra_flags = chunk_to_extended(flags) &
4109                                BTRFS_EXTENDED_PROFILE_MASK;
4110
4111        write_seqlock(&fs_info->profiles_lock);
4112        if (flags & BTRFS_BLOCK_GROUP_DATA)
4113                fs_info->avail_data_alloc_bits |= extra_flags;
4114        if (flags & BTRFS_BLOCK_GROUP_METADATA)
4115                fs_info->avail_metadata_alloc_bits |= extra_flags;
4116        if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4117                fs_info->avail_system_alloc_bits |= extra_flags;
4118        write_sequnlock(&fs_info->profiles_lock);
4119}
4120
4121/*
4122 * returns target flags in extended format or 0 if restripe for this
4123 * chunk_type is not in progress
4124 *
4125 * should be called with either volume_mutex or balance_lock held
4126 */
4127static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
4128{
4129        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4130        u64 target = 0;
4131
4132        if (!bctl)
4133                return 0;
4134
4135        if (flags & BTRFS_BLOCK_GROUP_DATA &&
4136            bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4137                target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
4138        } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
4139                   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4140                target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
4141        } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
4142                   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4143                target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
4144        }
4145
4146        return target;
4147}
4148
4149/*
4150 * @flags: available profiles in extended format (see ctree.h)
4151 *
4152 * Returns reduced profile in chunk format.  If profile changing is in
4153 * progress (either running or paused) picks the target profile (if it's
4154 * already available), otherwise falls back to plain reducing.
4155 */
4156static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
4157{
4158        u64 num_devices = fs_info->fs_devices->rw_devices;
4159        u64 target;
4160        u64 raid_type;
4161        u64 allowed = 0;
4162
4163        /*
4164         * see if restripe for this chunk_type is in progress, if so
4165         * try to reduce to the target profile
4166         */
4167        spin_lock(&fs_info->balance_lock);
4168        target = get_restripe_target(fs_info, flags);
4169        if (target) {
4170                /* pick target profile only if it's already available */
4171                if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
4172                        spin_unlock(&fs_info->balance_lock);
4173                        return extended_to_chunk(target);
4174                }
4175        }
4176        spin_unlock(&fs_info->balance_lock);
4177
4178        /* First, mask out the RAID levels which aren't possible */
4179        for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4180                if (num_devices >= btrfs_raid_array[raid_type].devs_min)
4181                        allowed |= btrfs_raid_group[raid_type];
4182        }
4183        allowed &= flags;
4184
4185        if (allowed & BTRFS_BLOCK_GROUP_RAID6)
4186                allowed = BTRFS_BLOCK_GROUP_RAID6;
4187        else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
4188                allowed = BTRFS_BLOCK_GROUP_RAID5;
4189        else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
4190                allowed = BTRFS_BLOCK_GROUP_RAID10;
4191        else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
4192                allowed = BTRFS_BLOCK_GROUP_RAID1;
4193        else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
4194                allowed = BTRFS_BLOCK_GROUP_RAID0;
4195
4196        flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
4197
4198        return extended_to_chunk(flags | allowed);
4199}
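/*
 * Worked example (illustrative only, assuming the usual devs_min values of 2
 * for RAID0/RAID1 and 4 for RAID10): with 2 rw devices and extended flags
 * DATA | RAID0 | RAID1, the masking step above leaves RAID0 | RAID1 allowed,
 * the if/else ladder prefers RAID1, and the function returns
 * extended_to_chunk(DATA | RAID1).
 */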
4200
4201static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
4202{
4203        unsigned seq;
4204        u64 flags;
4205
4206        do {
4207                flags = orig_flags;
4208                seq = read_seqbegin(&fs_info->profiles_lock);
4209
4210                if (flags & BTRFS_BLOCK_GROUP_DATA)
4211                        flags |= fs_info->avail_data_alloc_bits;
4212                else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4213                        flags |= fs_info->avail_system_alloc_bits;
4214                else if (flags & BTRFS_BLOCK_GROUP_METADATA)
4215                        flags |= fs_info->avail_metadata_alloc_bits;
4216        } while (read_seqretry(&fs_info->profiles_lock, seq));
4217
4218        return btrfs_reduce_alloc_profile(fs_info, flags);
4219}
4220
4221static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
4222{
4223        struct btrfs_fs_info *fs_info = root->fs_info;
4224        u64 flags;
4225        u64 ret;
4226
4227        if (data)
4228                flags = BTRFS_BLOCK_GROUP_DATA;
4229        else if (root == fs_info->chunk_root)
4230                flags = BTRFS_BLOCK_GROUP_SYSTEM;
4231        else
4232                flags = BTRFS_BLOCK_GROUP_METADATA;
4233
4234        ret = get_alloc_profile(fs_info, flags);
4235        return ret;
4236}
4237
4238u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
4239{
4240        return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
4241}
4242
4243u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
4244{
4245        return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4246}
4247
4248u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
4249{
4250        return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4251}
4252
4253static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
4254                                 bool may_use_included)
4255{
4256        ASSERT(s_info);
4257        return s_info->bytes_used + s_info->bytes_reserved +
4258                s_info->bytes_pinned + s_info->bytes_readonly +
4259                (may_use_included ? s_info->bytes_may_use : 0);
4260}
4261
4262int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
4263{
4264        struct btrfs_root *root = inode->root;
4265        struct btrfs_fs_info *fs_info = root->fs_info;
4266        struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
4267        u64 used;
4268        int ret = 0;
4269        int need_commit = 2;
4270        int have_pinned_space;
4271
4272        /* make sure bytes are sectorsize aligned */
4273        bytes = ALIGN(bytes, fs_info->sectorsize);
4274
4275        if (btrfs_is_free_space_inode(inode)) {
4276                need_commit = 0;
4277                ASSERT(current->journal_info);
4278        }
4279
4280again:
4281        /* make sure we have enough space to handle the data first */
4282        spin_lock(&data_sinfo->lock);
4283        used = btrfs_space_info_used(data_sinfo, true);
4284
4285        if (used + bytes > data_sinfo->total_bytes) {
4286                struct btrfs_trans_handle *trans;
4287
4288                /*
4289                 * if we don't have enough free bytes in this space then we need
4290                 * to alloc a new chunk.
4291                 */
4292                if (!data_sinfo->full) {
4293                        u64 alloc_target;
4294
4295                        data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
4296                        spin_unlock(&data_sinfo->lock);
4297
4298                        alloc_target = btrfs_data_alloc_profile(fs_info);
4299                        /*
4300                         * It is ugly that we don't call nolock join
4301                         * transaction for the free space inode case here.
4302                         * But it is safe because we only do the data space
4303                         * reservation for the free space cache in the
4304                         * transaction context, the common join transaction
4305                         * transaction context; the common join transaction
4306                         * just increases the use count of the current
4307                         * transaction handle and doesn't try to acquire the
4308                         * trans_lock of the fs.
4309                        trans = btrfs_join_transaction(root);
4310                        if (IS_ERR(trans))
4311                                return PTR_ERR(trans);
4312
4313                        ret = do_chunk_alloc(trans, fs_info, alloc_target,
4314                                             CHUNK_ALLOC_NO_FORCE);
4315                        btrfs_end_transaction(trans);
4316                        if (ret < 0) {
4317                                if (ret != -ENOSPC)
4318                                        return ret;
4319                                else {
4320                                        have_pinned_space = 1;
4321                                        goto commit_trans;
4322                                }
4323                        }
4324
4325                        goto again;
4326                }
4327
4328                /*
4329                 * If we don't have enough pinned space to deal with this
4330                 * allocation, and no chunk was removed in the current transaction,
4331                 * don't bother committing the transaction.
4332                 */
4333                have_pinned_space = percpu_counter_compare(
4334                        &data_sinfo->total_bytes_pinned,
4335                        used + bytes - data_sinfo->total_bytes);
4336                spin_unlock(&data_sinfo->lock);
4337
4338                /* commit the current transaction and try again */
4339commit_trans:
4340                if (need_commit) {
4341                        need_commit--;
4342
4343                        if (need_commit > 0) {
4344                                btrfs_start_delalloc_roots(fs_info, 0, -1);
4345                                btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
4346                                                         (u64)-1);
4347                        }
4348
4349                        trans = btrfs_join_transaction(root);
4350                        if (IS_ERR(trans))
4351                                return PTR_ERR(trans);
4352                        if (have_pinned_space >= 0 ||
4353                            test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4354                                     &trans->transaction->flags) ||
4355                            need_commit > 0) {
4356                                ret = btrfs_commit_transaction(trans);
4357                                if (ret)
4358                                        return ret;
4359                                /*
4360                                 * The cleaner kthread might still be doing iput
4361                                 * operations. Wait for it to finish so that
4362                                 * more space is released.
4363                                 */
4364                                mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
4365                                mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
4366                                goto again;
4367                        } else {
4368                                btrfs_end_transaction(trans);
4369                        }
4370                }
4371
4372                trace_btrfs_space_reservation(fs_info,
4373                                              "space_info:enospc",
4374                                              data_sinfo->flags, bytes, 1);
4375                return -ENOSPC;
4376        }
4377        data_sinfo->bytes_may_use += bytes;
4378        trace_btrfs_space_reservation(fs_info, "space_info",
4379                                      data_sinfo->flags, bytes, 1);
4380        spin_unlock(&data_sinfo->lock);
4381
4382        return ret;
4383}
4384
4385int btrfs_check_data_free_space(struct inode *inode,
4386                        struct extent_changeset **reserved, u64 start, u64 len)
4387{
4388        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4389        int ret;
4390
4391        /* align the range */
4392        len = round_up(start + len, fs_info->sectorsize) -
4393              round_down(start, fs_info->sectorsize);
4394        start = round_down(start, fs_info->sectorsize);
4395
4396        ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
4397        if (ret < 0)
4398                return ret;
4399
4400        /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
4401        ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
4402        if (ret < 0)
4403                btrfs_free_reserved_data_space_noquota(inode, start, len);
4404        else
4405                ret = 0;
4406        return ret;
4407}
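/*
 * Usage sketch (illustrative only) of the reserve/free pairing: a write path
 * reserves data space (and qgroup space) up front and gives it back on
 * failure.  Error handling is trimmed, and the changeset is normally released
 * with extent_changeset_free() afterwards.
 *
 *	struct extent_changeset *reserved = NULL;
 *	int ret;
 *
 *	ret = btrfs_check_data_free_space(inode, &reserved, start, len);
 *	if (ret < 0)
 *		return ret;
 *	...do the write; if it fails, give the reservation back...
 *	btrfs_free_reserved_data_space(inode, reserved, start, len);
 */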
4408
4409/*
4410 * Called if we need to clear a data reservation for this inode
4411 * Normally in an error case.
4412 *
4413 * This one will *NOT* use the accurate qgroup reserved space API, just for the
4414 * case where we can't sleep and are sure it won't affect qgroup reserved space.
4415 * Like clear_bit_hook().
4416 */
4417void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4418                                            u64 len)
4419{
4420        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4421        struct btrfs_space_info *data_sinfo;
4422
4423        /* Make sure the range is aligned to sectorsize */
4424        len = round_up(start + len, fs_info->sectorsize) -
4425              round_down(start, fs_info->sectorsize);
4426        start = round_down(start, fs_info->sectorsize);
4427
4428        data_sinfo = fs_info->data_sinfo;
4429        spin_lock(&data_sinfo->lock);
4430        if (WARN_ON(data_sinfo->bytes_may_use < len))
4431                data_sinfo->bytes_may_use = 0;
4432        else
4433                data_sinfo->bytes_may_use -= len;
4434        trace_btrfs_space_reservation(fs_info, "space_info",
4435                                      data_sinfo->flags, len, 0);
4436        spin_unlock(&data_sinfo->lock);
4437}
4438
4439/*
4440 * Called if we need to clear a data reservation for this inode
4441 * Normally in an error case.
4442 *
4443 * This one will handle the per-inode data rsv map for the accurate reserved
4444 * space framework.
4445 */
4446void btrfs_free_reserved_data_space(struct inode *inode,
4447                        struct extent_changeset *reserved, u64 start, u64 len)
4448{
4449        struct btrfs_root *root = BTRFS_I(inode)->root;
4450
4451        /* Make sure the range is aligned to sectorsize */
4452        len = round_up(start + len, root->fs_info->sectorsize) -
4453              round_down(start, root->fs_info->sectorsize);
4454        start = round_down(start, root->fs_info->sectorsize);
4455
4456        btrfs_free_reserved_data_space_noquota(inode, start, len);
4457        btrfs_qgroup_free_data(inode, reserved, start, len);
4458}
4459
4460static void force_metadata_allocation(struct btrfs_fs_info *info)
4461{
4462        struct list_head *head = &info->space_info;
4463        struct btrfs_space_info *found;
4464
4465        rcu_read_lock();
4466        list_for_each_entry_rcu(found, head, list) {
4467                if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4468                        found->force_alloc = CHUNK_ALLOC_FORCE;
4469        }
4470        rcu_read_unlock();
4471}
4472
4473static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4474{
4475        return (global->size << 1);
4476}
4477
4478static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
4479                              struct btrfs_space_info *sinfo, int force)
4480{
4481        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4482        u64 bytes_used = btrfs_space_info_used(sinfo, false);
4483        u64 thresh;
4484
4485        if (force == CHUNK_ALLOC_FORCE)
4486                return 1;
4487
4488        /*
4489         * We need to take into account the global rsv because for all intents
4490         * and purposes it's used space.  Don't worry about locking the
4491         * global_rsv, it doesn't change except when the transaction commits.
4492         */
4493        if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
4494                bytes_used += calc_global_rsv_need_space(global_rsv);
4495
4496        /*
4497         * in limited mode, we want to have some free space up to
4498         * about 1% of the FS size.
4499         */
4500        if (force == CHUNK_ALLOC_LIMITED) {
4501                thresh = btrfs_super_total_bytes(fs_info->super_copy);
4502                thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
4503
4504                if (sinfo->total_bytes - bytes_used < thresh)
4505                        return 1;
4506        }
4507
4508        if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
4509                return 0;
4510        return 1;
4511}
4512
4513static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
4514{
4515        u64 num_dev;
4516
4517        if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4518                    BTRFS_BLOCK_GROUP_RAID0 |
4519                    BTRFS_BLOCK_GROUP_RAID5 |
4520                    BTRFS_BLOCK_GROUP_RAID6))
4521                num_dev = fs_info->fs_devices->rw_devices;
4522        else if (type & BTRFS_BLOCK_GROUP_RAID1)
4523                num_dev = 2;
4524        else
4525                num_dev = 1;    /* DUP or single */
4526
4527        return num_dev;
4528}
4529
4530/*
4531 * Reserve space in the system space info as necessary for updating the device
4532 * items and adding or removing a chunk item, both when allocating and when
4533 * removing a chunk.
4534 */
4535void check_system_chunk(struct btrfs_trans_handle *trans,
4536                        struct btrfs_fs_info *fs_info, u64 type)
4537{
4538        struct btrfs_space_info *info;
4539        u64 left;
4540        u64 thresh;
4541        int ret = 0;
4542        u64 num_devs;
4543
4544        /*
4545         * Needed because we can end up allocating a system chunk and need an
4546         * atomic, race free space reservation in the chunk block reserve.
4547         */
4548        lockdep_assert_held(&fs_info->chunk_mutex);
4549
4550        info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4551        spin_lock(&info->lock);
4552        left = info->total_bytes - btrfs_space_info_used(info, true);
4553        spin_unlock(&info->lock);
4554
4555        num_devs = get_profile_num_devs(fs_info, type);
4556
4557        /* num_devs device items to update and 1 chunk item to add or remove */
4558        thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
4559                btrfs_calc_trans_metadata_size(fs_info, 1);
4560
4561        if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4562                btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4563                           left, thresh, type);
4564                dump_space_info(fs_info, info, 0, 0);
4565        }
4566
4567        if (left < thresh) {
4568                u64 flags = btrfs_system_alloc_profile(fs_info);
4569
4570                /*
4571                 * Ignore failure to create system chunk. We might end up not
4572                 * needing it, as we might not need to COW all nodes/leafs from
4573                 * the paths we visit in the chunk tree (they were already COWed
4574                 * or created in the current transaction for example).
4575                 */
4576                ret = btrfs_alloc_chunk(trans, fs_info, flags);
4577        }
4578
4579        if (!ret) {
4580                ret = btrfs_block_rsv_add(fs_info->chunk_root,
4581                                          &fs_info->chunk_block_rsv,
4582                                          thresh, BTRFS_RESERVE_NO_FLUSH);
4583                if (!ret)
4584                        trans->chunk_bytes_reserved += thresh;
4585        }
4586}
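/*
 * Worked example (illustrative only): allocating a RAID10 chunk on a
 * filesystem with 4 rw devices gives num_devs == 4, so thresh above is the
 * metadata size needed to update 4 device items plus insert or delete 1 chunk
 * item.  If the SYSTEM space_info cannot cover that, a new system chunk is
 * allocated before the space is carved out of the chunk block reserve.
 */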
4587
4588/*
4589 * If force is CHUNK_ALLOC_FORCE:
4590 *    - return 1 if it successfully allocates a chunk,
4591 *    - return errors including -ENOSPC otherwise.
4592 * If force is NOT CHUNK_ALLOC_FORCE:
4593 *    - return 0 if it doesn't need to allocate a new chunk,
4594 *    - return 1 if it successfully allocates a chunk,
4595 *    - return errors including -ENOSPC otherwise.
4596 */
4597static int do_chunk_alloc(struct btrfs_trans_handle *trans,
4598                          struct btrfs_fs_info *fs_info, u64 flags, int force)
4599{
4600        struct btrfs_space_info *space_info;
4601        int wait_for_alloc = 0;
4602        int ret = 0;
4603
4604        /* Don't re-enter if we're already allocating a chunk */
4605        if (trans->allocating_chunk)
4606                return -ENOSPC;
4607
4608        space_info = __find_space_info(fs_info, flags);
4609        ASSERT(space_info);
4610
4611again:
4612        spin_lock(&space_info->lock);
4613        if (force < space_info->force_alloc)
4614                force = space_info->force_alloc;
4615        if (space_info->full) {
4616                if (should_alloc_chunk(fs_info, space_info, force))
4617                        ret = -ENOSPC;
4618                else
4619                        ret = 0;
4620                spin_unlock(&space_info->lock);
4621                return ret;
4622        }
4623
4624        if (!should_alloc_chunk(fs_info, space_info, force)) {
4625                spin_unlock(&space_info->lock);
4626                return 0;
4627        } else if (space_info->chunk_alloc) {
4628                wait_for_alloc = 1;
4629        } else {
4630                space_info->chunk_alloc = 1;
4631        }
4632
4633        spin_unlock(&space_info->lock);
4634
4635        mutex_lock(&fs_info->chunk_mutex);
4636
4637        /*
4638         * The chunk_mutex is held throughout the entirety of a chunk
4639         * allocation, so once we've acquired the chunk_mutex we know that the
4640         * other guy is done and we need to recheck and see if we should
4641         * allocate.
4642         */
4643        if (wait_for_alloc) {
4644                mutex_unlock(&fs_info->chunk_mutex);
4645                wait_for_alloc = 0;
4646                cond_resched();
4647                goto again;
4648        }
4649
4650        trans->allocating_chunk = true;
4651
4652        /*
4653         * If we have mixed data/metadata chunks we want to make sure we keep
4654         * allocating mixed chunks instead of individual chunks.
4655         */
4656        if (btrfs_mixed_space_info(space_info))
4657                flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4658
4659        /*
4660         * if we're doing a data chunk, go ahead and make sure that
4661         * we keep a reasonable number of metadata chunks allocated in the
4662         * FS as well.
4663         */
4664        if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4665                fs_info->data_chunk_allocations++;
4666                if (!(fs_info->data_chunk_allocations %
4667                      fs_info->metadata_ratio))
4668                        force_metadata_allocation(fs_info);
4669        }
4670
4671        /*
4672         * Check if we have enough space in SYSTEM chunk because we may need
4673         * to update devices.
4674         */
4675        check_system_chunk(trans, fs_info, flags);
4676
4677        ret = btrfs_alloc_chunk(trans, fs_info, flags);
4678        trans->allocating_chunk = false;
4679
4680        spin_lock(&space_info->lock);
4681        if (ret < 0 && ret != -ENOSPC)
4682                goto out;
4683        if (ret)
4684                space_info->full = 1;
4685        else
4686                ret = 1;
4687
4688        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4689out:
4690        space_info->chunk_alloc = 0;
4691        spin_unlock(&space_info->lock);
4692        mutex_unlock(&fs_info->chunk_mutex);
4693        /*
4694         * When we allocate a new chunk we reserve space in the chunk block
4695         * reserve to make sure we can COW nodes/leafs in the chunk tree or
4696         * add new nodes/leafs to it if we end up needing to do it when
4697         * inserting the chunk item and updating device items as part of the
4698         * second phase of chunk allocation, performed by
4699         * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4700         * large number of new block groups to create in our transaction
4701         * handle's new_bgs list to avoid exhausting the chunk block reserve
4702         * in extreme cases - like having a single transaction create many new
4703         * block groups when starting to write out the free space caches of all
4704         * the block groups that were made dirty during the lifetime of the
4705         * transaction.
4706         */
4707        if (trans->can_flush_pending_bgs &&
4708            trans->chunk_bytes_reserved >= (u64)SZ_2M) {
4709                btrfs_create_pending_block_groups(trans);
4710                btrfs_trans_release_chunk_metadata(trans);
4711        }
4712        return ret;
4713}
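/*
 * Usage sketch (illustrative only), mirroring the callers in this file: a
 * chunk allocation is always done inside a joined transaction, with the
 * return convention documented above (1 == allocated, 0 == not needed,
 * negative errno otherwise).
 *
 *	trans = btrfs_join_transaction(root);
 *	if (IS_ERR(trans))
 *		return PTR_ERR(trans);
 *	ret = do_chunk_alloc(trans, fs_info,
 *			     btrfs_metadata_alloc_profile(fs_info),
 *			     CHUNK_ALLOC_NO_FORCE);
 *	btrfs_end_transaction(trans);
 */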
4714
4715static int can_overcommit(struct btrfs_fs_info *fs_info,
4716                          struct btrfs_space_info *space_info, u64 bytes,
4717                          enum btrfs_reserve_flush_enum flush,
4718                          bool system_chunk)
4719{
4720        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4721        u64 profile;
4722        u64 space_size;
4723        u64 avail;
4724        u64 used;
4725
4726        /* Don't overcommit when in mixed mode. */
4727        if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
4728                return 0;
4729
4730        if (system_chunk)
4731                profile = btrfs_system_alloc_profile(fs_info);
4732        else
4733                profile = btrfs_metadata_alloc_profile(fs_info);
4734
4735        used = btrfs_space_info_used(space_info, false);
4736
4737        /*
4738         * We only want to allow overcommitting if we have lots of actual space
4739         * free, but if we don't have enough space to handle the global reserve
4740         * space then we could end up having a real enospc problem when trying
4741         * to allocate a chunk or some other such important allocation.
4742         */
4743        spin_lock(&global_rsv->lock);
4744        space_size = calc_global_rsv_need_space(global_rsv);
4745        spin_unlock(&global_rsv->lock);
4746        if (used + space_size >= space_info->total_bytes)
4747                return 0;
4748
4749        used += space_info->bytes_may_use;
4750
4751        avail = atomic64_read(&fs_info->free_chunk_space);
4752
4753        /*
4754         * If we have dup, raid1 or raid10 then only half of the free
4755         * space is actually useable.  For raid56, the space info used
4756         * doesn't include the parity drive, so we don't have to
4757         * change the math
4758         */
4759        if (profile & (BTRFS_BLOCK_GROUP_DUP |
4760                       BTRFS_BLOCK_GROUP_RAID1 |
4761                       BTRFS_BLOCK_GROUP_RAID10))
4762                avail >>= 1;
4763
4764        /*
4765         * If we aren't flushing all things, let us overcommit up to half of
4766         * the space.  If we can flush everything, don't let us overcommit too
4767         * much, only up to 1/8 of the space.
4768         */
4769        if (flush == BTRFS_RESERVE_FLUSH_ALL)
4770                avail >>= 3;
4771        else
4772                avail >>= 1;
4773
4774        if (used + bytes < space_info->total_bytes + avail)
4775                return 1;
4776        return 0;
4777}
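/*
 * Worked example (illustrative only): take a RAID1 metadata space_info with
 * total_bytes = 10GiB, used (without may_use) = 4GiB, bytes_may_use = 3GiB,
 * a global reserve needing 1GiB and free_chunk_space = 8GiB.  The first gate
 * passes (4GiB + 1GiB < 10GiB); avail becomes 8GiB / 2 (RAID1) and then / 8
 * for BTRFS_RESERVE_FLUSH_ALL, i.e. 512MiB; so a request may overcommit only
 * while 7GiB + bytes < 10GiB + 512MiB.
 */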
4778
4779static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
4780                                         unsigned long nr_pages, int nr_items)
4781{
4782        struct super_block *sb = fs_info->sb;
4783
4784        if (down_read_trylock(&sb->s_umount)) {
4785                writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4786                up_read(&sb->s_umount);
4787        } else {
4788                /*
4789                 * We needn't worry about the filesystem going from r/w to r/o
4790                 * even though we don't acquire the ->s_umount semaphore, because
4791                 * the filesystem should guarantee that the delalloc inodes list
4792                 * is empty after the filesystem becomes read-only (all dirty
4793                 * pages are written to disk).
4794                 */
4795                btrfs_start_delalloc_roots(fs_info, 0, nr_items);
4796                if (!current->journal_info)
4797                        btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
4798        }
4799}
4800
4801static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4802                                        u64 to_reclaim)
4803{
4804        u64 bytes;
4805        u64 nr;
4806
4807        bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
4808        nr = div64_u64(to_reclaim, bytes);
4809        if (!nr)
4810                nr = 1;
4811        return nr;
4812}
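/*
 * Worked example (illustrative only): with 16KiB nodes,
 * btrfs_calc_trans_metadata_size(fs_info, 1) comes to roughly 256KiB, so
 * asking to reclaim 1MiB yields nr == 4 items; shrink_delalloc() below then
 * scales that back into bytes with EXTENT_SIZE_PER_ITEM.
 */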
4813
4814#define EXTENT_SIZE_PER_ITEM    SZ_256K
4815
4816/*
4817 * shrink metadata reservation for delalloc
4818 */
4819static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4820                            u64 orig, bool wait_ordered)
4821{
4822        struct btrfs_space_info *space_info;
4823        struct btrfs_trans_handle *trans;
4824        u64 delalloc_bytes;
4825        u64 max_reclaim;
4826        u64 items;
4827        long time_left;
4828        unsigned long nr_pages;
4829        int loops;
4830
4831        /* Calc the number of pages we need to flush for this space reservation */
4832        items = calc_reclaim_items_nr(fs_info, to_reclaim);
4833        to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4834
4835        trans = (struct btrfs_trans_handle *)current->journal_info;
4836        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4837
4838        delalloc_bytes = percpu_counter_sum_positive(
4839                                                &fs_info->delalloc_bytes);
4840        if (delalloc_bytes == 0) {
4841                if (trans)
4842                        return;
4843                if (wait_ordered)
4844                        btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4845                return;
4846        }
4847
4848        loops = 0;
4849        while (delalloc_bytes && loops < 3) {
4850                max_reclaim = min(delalloc_bytes, to_reclaim);
4851                nr_pages = max_reclaim >> PAGE_SHIFT;
4852                btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
4853                /*
4854                 * We need to wait for the async pages to actually start before
4855                 * we do anything.
4856                 */
4857                max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
4858                if (!max_reclaim)
4859                        goto skip_async;
4860
4861                if (max_reclaim <= nr_pages)
4862                        max_reclaim = 0;
4863                else
4864                        max_reclaim -= nr_pages;
4865
4866                wait_event(fs_info->async_submit_wait,
4867                           atomic_read(&fs_info->async_delalloc_pages) <=
4868                           (int)max_reclaim);
4869skip_async:
4870                spin_lock(&space_info->lock);
4871                if (list_empty(&space_info->tickets) &&
4872                    list_empty(&space_info->priority_tickets)) {
4873                        spin_unlock(&space_info->lock);
4874                        break;
4875                }
4876                spin_unlock(&space_info->lock);
4877
4878                loops++;
4879                if (wait_ordered && !trans) {
4880                        btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4881                } else {
4882                        time_left = schedule_timeout_killable(1);
4883                        if (time_left)
4884                                break;
4885                }
4886                delalloc_bytes = percpu_counter_sum_positive(
4887                                                &fs_info->delalloc_bytes);
4888        }
4889}
4890
4891struct reserve_ticket {
4892        u64 bytes;
4893        int error;
4894        struct list_head list;
4895        wait_queue_head_t wait;
4896};
4897
4898/**
4899 * may_commit_transaction - possibly commit the transaction if it's ok to
4900 * @fs_info - the fs_info for our filesystem
4901 * @space_info - the space_info we're trying to satisfy a reservation for;
4902 *               its pending tickets determine how many bytes must be freed
4903 *
4904 * This will check to make sure that committing the transaction will actually
4905 * get us somewhere and then commit the transaction if it does.  Otherwise it
4906 * will return -ENOSPC.
4907 */
4908static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4909                                  struct btrfs_space_info *space_info)
4910{
4911        struct reserve_ticket *ticket = NULL;
4912        struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
4913        struct btrfs_trans_handle *trans;
4914        u64 bytes;
4915
4916        trans = (struct btrfs_trans_handle *)current->journal_info;
4917        if (trans)
4918                return -EAGAIN;
4919
4920        spin_lock(&space_info->lock);
4921        if (!list_empty(&space_info->priority_tickets))
4922                ticket = list_first_entry(&space_info->priority_tickets,
4923                                          struct reserve_ticket, list);
4924        else if (!list_empty(&space_info->tickets))
4925                ticket = list_first_entry(&space_info->tickets,
4926                                          struct reserve_ticket, list);
4927        bytes = (ticket) ? ticket->bytes : 0;
4928        spin_unlock(&space_info->lock);
4929
4930        if (!bytes)
4931                return 0;
4932
4933        /* See if there is enough pinned space to make this reservation */
4934        if (percpu_counter_compare(&space_info->total_bytes_pinned,
4935                                   bytes) >= 0)
4936                goto commit;
4937
4938        /*
4939         * See if there is some space in the delayed insertion reservation for
4940         * this reservation.
4941         */
4942        if (space_info != delayed_rsv->space_info)
4943                return -ENOSPC;
4944
4945        spin_lock(&delayed_rsv->lock);
4946        if (delayed_rsv->size > bytes)
4947                bytes = 0;
4948        else
4949                bytes -= delayed_rsv->size;
4950        spin_unlock(&delayed_rsv->lock);
4951
4952        if (percpu_counter_compare(&space_info->total_bytes_pinned,
4953                                   bytes) < 0) {
4954                return -ENOSPC;
4955        }
4956
4957commit:
4958        trans = btrfs_join_transaction(fs_info->extent_root);
4959        if (IS_ERR(trans))
4960                return -ENOSPC;
4961
4962        return btrfs_commit_transaction(trans);
4963}
4964
4965/*
4966 * Try to flush some data based on policy set by @state. This is only advisory
4967 * and may fail for various reasons. The caller is supposed to examine the
4968 * state of @space_info to detect the outcome.
4969 */
4970static void flush_space(struct btrfs_fs_info *fs_info,
4971                       struct btrfs_space_info *space_info, u64 num_bytes,
4972                       int state)
4973{
4974        struct btrfs_root *root = fs_info->extent_root;
4975        struct btrfs_trans_handle *trans;
4976        int nr;
4977        int ret = 0;
4978
4979        switch (state) {
4980        case FLUSH_DELAYED_ITEMS_NR:
4981        case FLUSH_DELAYED_ITEMS:
4982                if (state == FLUSH_DELAYED_ITEMS_NR)
4983                        nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
4984                else
4985                        nr = -1;
4986
4987                trans = btrfs_join_transaction(root);
4988                if (IS_ERR(trans)) {
4989                        ret = PTR_ERR(trans);
4990                        break;
4991                }
4992                ret = btrfs_run_delayed_items_nr(trans, nr);
4993                btrfs_end_transaction(trans);
4994                break;
4995        case FLUSH_DELALLOC:
4996        case FLUSH_DELALLOC_WAIT:
4997                shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
4998                                state == FLUSH_DELALLOC_WAIT);
4999                break;
5000        case ALLOC_CHUNK:
5001                trans = btrfs_join_transaction(root);
5002                if (IS_ERR(trans)) {
5003                        ret = PTR_ERR(trans);
5004                        break;
5005                }
5006                ret = do_chunk_alloc(trans, fs_info,
5007                                     btrfs_metadata_alloc_profile(fs_info),
5008                                     CHUNK_ALLOC_NO_FORCE);
5009                btrfs_end_transaction(trans);
5010                if (ret > 0 || ret == -ENOSPC)
5011                        ret = 0;
5012                break;
5013        case COMMIT_TRANS:
5014                ret = may_commit_transaction(fs_info, space_info);
5015                break;
5016        default:
5017                ret = -ENOSPC;
5018                break;
5019        }
5020
5021        trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
5022                                ret);
5023        return;
5024}
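/*
 * The flush states are tried in order of increasing cost by the reclaim paths
 * below, in the order they appear in the switch above:
 * FLUSH_DELAYED_ITEMS_NR -> FLUSH_DELAYED_ITEMS -> FLUSH_DELALLOC ->
 * FLUSH_DELALLOC_WAIT -> ALLOC_CHUNK -> COMMIT_TRANS.  A minimal escalation
 * loop (sketch only) looks like:
 *
 *	for (state = FLUSH_DELAYED_ITEMS_NR; state <= COMMIT_TRANS; state++)
 *		flush_space(fs_info, space_info, to_reclaim, state);
 *
 * The async worker restarts from the cheapest state whenever tickets make
 * progress, and priority flushers skip the delalloc states to avoid
 * deadlocks.
 */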
5025
5026static inline u64
5027btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
5028                                 struct btrfs_space_info *space_info,
5029                                 bool system_chunk)
5030{
5031        struct reserve_ticket *ticket;
5032        u64 used;
5033        u64 expected;
5034        u64 to_reclaim = 0;
5035
5036        list_for_each_entry(ticket, &space_info->tickets, list)
5037                to_reclaim += ticket->bytes;
5038        list_for_each_entry(ticket, &space_info->priority_tickets, list)
5039                to_reclaim += ticket->bytes;
5040        if (to_reclaim)
5041                return to_reclaim;
5042
5043        to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
5044        if (can_overcommit(fs_info, space_info, to_reclaim,
5045                           BTRFS_RESERVE_FLUSH_ALL, system_chunk))
5046                return 0;
5047
5048        used = btrfs_space_info_used(space_info, true);
5049
5050        if (can_overcommit(fs_info, space_info, SZ_1M,
5051                           BTRFS_RESERVE_FLUSH_ALL, system_chunk))
5052                expected = div_factor_fine(space_info->total_bytes, 95);
5053        else
5054                expected = div_factor_fine(space_info->total_bytes, 90);
5055
5056        if (used > expected)
5057                to_reclaim = used - expected;
5058        else
5059                to_reclaim = 0;
5060        to_reclaim = min(to_reclaim, space_info->bytes_may_use +
5061                                     space_info->bytes_reserved);
5062        return to_reclaim;
5063}
5064
5065static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
5066                                        struct btrfs_space_info *space_info,
5067                                        u64 used, bool system_chunk)
5068{
5069        u64 thresh = div_factor_fine(space_info->total_bytes, 98);
5070
5071        /* If we're just plain full then async reclaim just slows us down. */
5072        if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
5073                return 0;
5074
5075        if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5076                                              system_chunk))
5077                return 0;
5078
5079        return (used >= thresh && !btrfs_fs_closing(fs_info) &&
5080                !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
5081}
5082
5083static void wake_all_tickets(struct list_head *head)
5084{
5085        struct reserve_ticket *ticket;
5086
5087        while (!list_empty(head)) {
5088                ticket = list_first_entry(head, struct reserve_ticket, list);
5089                list_del_init(&ticket->list);
5090                ticket->error = -ENOSPC;
5091                wake_up(&ticket->wait);
5092        }
5093}
5094
5095/*
5096 * This is for normal flushers, we can wait all goddamned day if we want to.  We
5097 * will loop and continuously try to flush as long as we are making progress.
5098 * We count progress as clearing off tickets each time we have to loop.
5099 */
5100static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
5101{
5102        struct btrfs_fs_info *fs_info;
5103        struct btrfs_space_info *space_info;
5104        u64 to_reclaim;
5105        int flush_state;
5106        int commit_cycles = 0;
5107        u64 last_tickets_id;
5108
5109        fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
5110        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5111
5112        spin_lock(&space_info->lock);
5113        to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5114                                                      false);
5115        if (!to_reclaim) {
5116                space_info->flush = 0;
5117                spin_unlock(&space_info->lock);
5118                return;
5119        }
5120        last_tickets_id = space_info->tickets_id;
5121        spin_unlock(&space_info->lock);
5122
5123        flush_state = FLUSH_DELAYED_ITEMS_NR;
5124        do {
5125                flush_space(fs_info, space_info, to_reclaim, flush_state);
5126                spin_lock(&space_info->lock);
5127                if (list_empty(&space_info->tickets)) {
5128                        space_info->flush = 0;
5129                        spin_unlock(&space_info->lock);
5130                        return;
5131                }
5132                to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
5133                                                              space_info,
5134                                                              false);
5135                if (last_tickets_id == space_info->tickets_id) {
5136                        flush_state++;
5137                } else {
5138                        last_tickets_id = space_info->tickets_id;
5139                        flush_state = FLUSH_DELAYED_ITEMS_NR;
5140                        if (commit_cycles)
5141                                commit_cycles--;
5142                }
5143
5144                if (flush_state > COMMIT_TRANS) {
5145                        commit_cycles++;
5146                        if (commit_cycles > 2) {
5147                                wake_all_tickets(&space_info->tickets);
5148                                space_info->flush = 0;
5149                        } else {
5150                                flush_state = FLUSH_DELAYED_ITEMS_NR;
5151                        }
5152                }
5153                spin_unlock(&space_info->lock);
5154        } while (flush_state <= COMMIT_TRANS);
5155}
5156
5157void btrfs_init_async_reclaim_work(struct work_struct *work)
5158{
5159        INIT_WORK(work, btrfs_async_reclaim_metadata_space);
5160}
5161
5162static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
5163                                            struct btrfs_space_info *space_info,
5164                                            struct reserve_ticket *ticket)
5165{
5166        u64 to_reclaim;
5167        int flush_state = FLUSH_DELAYED_ITEMS_NR;
5168
5169        spin_lock(&space_info->lock);
5170        to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5171                                                      false);
5172        if (!to_reclaim) {
5173                spin_unlock(&space_info->lock);
5174                return;
5175        }
5176        spin_unlock(&space_info->lock);
5177
5178        do {
5179                flush_space(fs_info, space_info, to_reclaim, flush_state);
5180                flush_state++;
5181                spin_lock(&space_info->lock);
5182                if (ticket->bytes == 0) {
5183                        spin_unlock(&space_info->lock);
5184                        return;
5185                }
5186                spin_unlock(&space_info->lock);
5187
5188                /*
5189                 * Priority flushers can't wait on delalloc without
5190                 * deadlocking.
5191                 */
5192                if (flush_state == FLUSH_DELALLOC ||
5193                    flush_state == FLUSH_DELALLOC_WAIT)
5194                        flush_state = ALLOC_CHUNK;
5195        } while (flush_state < COMMIT_TRANS);
5196}
5197
5198static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
5199                               struct btrfs_space_info *space_info,
5200                               struct reserve_ticket *ticket, u64 orig_bytes)
5201
5202{
5203        DEFINE_WAIT(wait);
5204        int ret = 0;
5205
5206        spin_lock(&space_info->lock);
5207        while (ticket->bytes > 0 && ticket->error == 0) {
5208                ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
5209                if (ret) {
5210                        ret = -EINTR;
5211                        break;
5212                }
5213                spin_unlock(&space_info->lock);
5214
5215                schedule();
5216
5217                finish_wait(&ticket->wait, &wait);
5218                spin_lock(&space_info->lock);
5219        }
5220        if (!ret)
5221                ret = ticket->error;
5222        if (!list_empty(&ticket->list))
5223                list_del_init(&ticket->list);
5224        if (ticket->bytes && ticket->bytes < orig_bytes) {
5225                u64 num_bytes = orig_bytes - ticket->bytes;
5226                space_info->bytes_may_use -= num_bytes;
5227                trace_btrfs_space_reservation(fs_info, "space_info",
5228                                              space_info->flags, num_bytes, 0);
5229        }
5230        spin_unlock(&space_info->lock);
5231
5232        return ret;
5233}
5234
5235/**
5236 * __reserve_metadata_bytes - try to reserve bytes from a space_info's space
5237 * @fs_info - the filesystem we're allocating for
5238 * @space_info - the space info we want to allocate from
5239 * @orig_bytes - the number of bytes we want
5240 * @flush - whether or not we can flush to make our reservation
5241 *
5242 * This will reserve orig_bytes number of bytes from the given space_info.
5243 * If there is not enough space it will make an attempt to
5244 * flush out space to make room.  It will do this by flushing delalloc if
5245 * possible or committing the transaction.  If flush is 0 then no attempts to
5246 * regain reservations will be made and this will fail if there is not enough
5247 * space already.
5248 */
5249static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
5250                                    struct btrfs_space_info *space_info,
5251                                    u64 orig_bytes,
5252                                    enum btrfs_reserve_flush_enum flush,
5253                                    bool system_chunk)
5254{
5255        struct reserve_ticket ticket;
5256        u64 used;
5257        int ret = 0;
5258
5259        ASSERT(orig_bytes);
5260        ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
5261
5262        spin_lock(&space_info->lock);
5263        ret = -ENOSPC;
5264        used = btrfs_space_info_used(space_info, true);
5265
5266        /*
5267         * If we have enough space then hooray, make our reservation and carry
5268         * on.  If not see if we can overcommit, and if we can, hooray carry on.
5269         * If not things get more complicated.
5270         */
5271        if (used + orig_bytes <= space_info->total_bytes) {
5272                space_info->bytes_may_use += orig_bytes;
5273                trace_btrfs_space_reservation(fs_info, "space_info",
5274                                              space_info->flags, orig_bytes, 1);
5275                ret = 0;
5276        } else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
5277                                  system_chunk)) {
5278                space_info->bytes_may_use += orig_bytes;
5279                trace_btrfs_space_reservation(fs_info, "space_info",
5280                                              space_info->flags, orig_bytes, 1);
5281                ret = 0;
5282        }
5283
5284        /*
5285         * If we couldn't make a reservation then setup our reservation ticket
5286         * and kick the async worker if it's not already running.
5287         *
5288         * If we are a priority flusher then we just need to add our ticket to
5289         * the list and we will do our own flushing further down.
5290         */
5291        if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
5292                ticket.bytes = orig_bytes;
5293                ticket.error = 0;
5294                init_waitqueue_head(&ticket.wait);
5295                if (flush == BTRFS_RESERVE_FLUSH_ALL) {
5296                        list_add_tail(&ticket.list, &space_info->tickets);
5297                        if (!space_info->flush) {
5298                                space_info->flush = 1;
5299                                trace_btrfs_trigger_flush(fs_info,
5300                                                          space_info->flags,
5301                                                          orig_bytes, flush,
5302                                                          "enospc");
5303                                queue_work(system_unbound_wq,
5304                                           &fs_info->async_reclaim_work);
5305                        }
5306                } else {
5307                        list_add_tail(&ticket.list,
5308                                      &space_info->priority_tickets);
5309                }
5310        } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
5311                used += orig_bytes;
5312                /*
5313                 * We will do the space reservation dance during log replay,
5314                 * which means we won't have fs_info->fs_root set, so don't do
5315                 * the async reclaim as we will panic.
5316                 */
5317                if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
5318                    need_do_async_reclaim(fs_info, space_info,
5319                                          used, system_chunk) &&
5320                    !work_busy(&fs_info->async_reclaim_work)) {
5321                        trace_btrfs_trigger_flush(fs_info, space_info->flags,
5322                                                  orig_bytes, flush, "preempt");
5323                        queue_work(system_unbound_wq,
5324                                   &fs_info->async_reclaim_work);
5325                }
5326        }
5327        spin_unlock(&space_info->lock);
5328        if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
5329                return ret;
5330
5331        if (flush == BTRFS_RESERVE_FLUSH_ALL)
5332                return wait_reserve_ticket(fs_info, space_info, &ticket,
5333                                           orig_bytes);
5334
5335        ret = 0;
5336        priority_reclaim_metadata_space(fs_info, space_info, &ticket);
5337        spin_lock(&space_info->lock);
5338        if (ticket.bytes) {
5339                if (ticket.bytes < orig_bytes) {
5340                        u64 num_bytes = orig_bytes - ticket.bytes;
5341                        space_info->bytes_may_use -= num_bytes;
5342                        trace_btrfs_space_reservation(fs_info, "space_info",
5343                                                      space_info->flags,
5344                                                      num_bytes, 0);
5345
5346                }
5347                list_del_init(&ticket.list);
5348                ret = -ENOSPC;
5349        }
5350        spin_unlock(&space_info->lock);
5351        ASSERT(list_empty(&ticket.list));
5352        return ret;
5353}
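/*
 * Reservation flow sketch (illustrative only): a BTRFS_RESERVE_FLUSH_ALL
 * caller that cannot reserve immediately queues a ticket and sleeps in
 * wait_reserve_ticket() while btrfs_async_reclaim_metadata_space() walks the
 * flush states on its behalf; a priority caller instead flushes inline via
 * priority_reclaim_metadata_space() and never waits on the async worker.
 *
 *	ret = __reserve_metadata_bytes(fs_info, space_info, num_bytes,
 *				       BTRFS_RESERVE_FLUSH_ALL, false);
 */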
5354
5355/**
5356 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5357 * @root - the root we're allocating for
5358 * @block_rsv - the block_rsv we're allocating for
5359 * @orig_bytes - the number of bytes we want
5360 * @flush - whether or not we can flush to make our reservation
5361 *
5362 * This will reserve orig_bytes number of bytes from the space info associated
5363 * with the block_rsv.  If there is not enough space it will make an attempt to
5364 * flush out space to make room.  It will do this by flushing delalloc if
5365 * possible or committing the transaction.  If flush is 0 then no attempts to
5366 * regain reservations will be made and this will fail if there is not enough
5367 * space already.
5368 */
5369static int reserve_metadata_bytes(struct btrfs_root *root,
5370                                  struct btrfs_block_rsv *block_rsv,
5371                                  u64 orig_bytes,
5372                                  enum btrfs_reserve_flush_enum flush)
5373{
5374        struct btrfs_fs_info *fs_info = root->fs_info;
5375        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5376        int ret;
5377        bool system_chunk = (root == fs_info->chunk_root);
5378
5379        ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
5380                                       orig_bytes, flush, system_chunk);
5381        if (ret == -ENOSPC &&
5382            unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5383                if (block_rsv != global_rsv &&
5384                    !block_rsv_use_bytes(global_rsv, orig_bytes))
5385                        ret = 0;
5386        }
5387        if (ret == -ENOSPC) {
5388                trace_btrfs_space_reservation(fs_info, "space_info:enospc",
5389                                              block_rsv->space_info->flags,
5390                                              orig_bytes, 1);
5391
5392                if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
5393                        dump_space_info(fs_info, block_rsv->space_info,
5394                                        orig_bytes, 0);
5395        }
5396        return ret;
5397}
5398
5399static struct btrfs_block_rsv *get_block_rsv(
5400                                        const struct btrfs_trans_handle *trans,
5401                                        const struct btrfs_root *root)
5402{
5403        struct btrfs_fs_info *fs_info = root->fs_info;
5404        struct btrfs_block_rsv *block_rsv = NULL;
5405
5406        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5407            (root == fs_info->csum_root && trans->adding_csums) ||
5408            (root == fs_info->uuid_root))
5409                block_rsv = trans->block_rsv;
5410
5411        if (!block_rsv)
5412                block_rsv = root->block_rsv;
5413
5414        if (!block_rsv)
5415                block_rsv = &fs_info->empty_block_rsv;
5416
5417        return block_rsv;
5418}
5419
5420static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5421                               u64 num_bytes)
5422{
5423        int ret = -ENOSPC;
5424        spin_lock(&block_rsv->lock);
5425        if (block_rsv->reserved >= num_bytes) {
5426                block_rsv->reserved -= num_bytes;
5427                if (block_rsv->reserved < block_rsv->size)
5428                        block_rsv->full = 0;
5429                ret = 0;
5430        }
5431        spin_unlock(&block_rsv->lock);
5432        return ret;
5433}
5434
5435static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5436                                u64 num_bytes, int update_size)
5437{
5438        spin_lock(&block_rsv->lock);
5439        block_rsv->reserved += num_bytes;
5440        if (update_size)
5441                block_rsv->size += num_bytes;
5442        else if (block_rsv->reserved >= block_rsv->size)
5443                block_rsv->full = 1;
5444        spin_unlock(&block_rsv->lock);
5445}
5446
5447int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5448                             struct btrfs_block_rsv *dest, u64 num_bytes,
5449                             int min_factor)
5450{
5451        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5452        u64 min_bytes;
5453
5454        if (global_rsv->space_info != dest->space_info)
5455                return -ENOSPC;
5456
5457        spin_lock(&global_rsv->lock);
5458        min_bytes = div_factor(global_rsv->size, min_factor);
5459        if (global_rsv->reserved < min_bytes + num_bytes) {
5460                spin_unlock(&global_rsv->lock);
5461                return -ENOSPC;
5462        }
5463        global_rsv->reserved -= num_bytes;
5464        if (global_rsv->reserved < global_rsv->size)
5465                global_rsv->full = 0;
5466        spin_unlock(&global_rsv->lock);
5467
5468        block_rsv_add_bytes(dest, num_bytes, 1);
5469        return 0;
5470}
5471
5472/*
5473 * This is for space we already have accounted in space_info->bytes_may_use, so
5474 * basically when we're returning space from block_rsv's.
5475 */
5476static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
5477                                     struct btrfs_space_info *space_info,
5478                                     u64 num_bytes)
5479{
5480        struct reserve_ticket *ticket;
5481        struct list_head *head;
5482        u64 used;
5483        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
5484        bool check_overcommit = false;
5485
5486        spin_lock(&space_info->lock);
5487        head = &space_info->priority_tickets;
5488
5489        /*
5490         * If we are over our limit then we need to check and see if we can
5491         * overcommit, and if we can't then we just need to free up our space
5492         * and not satisfy any requests.
5493         */
5494        used = btrfs_space_info_used(space_info, true);
5495        if (used - num_bytes >= space_info->total_bytes)
5496                check_overcommit = true;
5497again:
5498        while (!list_empty(head) && num_bytes) {
5499                ticket = list_first_entry(head, struct reserve_ticket,
5500                                          list);
5501                /*
5502                 * We use 0 bytes because this space is already reserved, so
5503                 * adding the ticket space would be a double count.
5504                 */
5505                if (check_overcommit &&
5506                    !can_overcommit(fs_info, space_info, 0, flush, false))
5507                        break;
5508                if (num_bytes >= ticket->bytes) {
5509                        list_del_init(&ticket->list);
5510                        num_bytes -= ticket->bytes;
5511                        ticket->bytes = 0;
5512                        space_info->tickets_id++;
5513                        wake_up(&ticket->wait);
5514                } else {
5515                        ticket->bytes -= num_bytes;
5516                        num_bytes = 0;
5517                }
5518        }
5519
5520        if (num_bytes && head == &space_info->priority_tickets) {
5521                head = &space_info->tickets;
5522                flush = BTRFS_RESERVE_FLUSH_ALL;
5523                goto again;
5524        }
5525        space_info->bytes_may_use -= num_bytes;
5526        trace_btrfs_space_reservation(fs_info, "space_info",
5527                                      space_info->flags, num_bytes, 0);
5528        spin_unlock(&space_info->lock);
5529}
5530
5531/*
5532 * This is for newly allocated space that isn't accounted in
5533 * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
5534 * we use this helper.
5535 */
5536static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
5537                                     struct btrfs_space_info *space_info,
5538                                     u64 num_bytes)
5539{
5540        struct reserve_ticket *ticket;
5541        struct list_head *head = &space_info->priority_tickets;
5542
5543again:
5544        while (!list_empty(head) && num_bytes) {
5545                ticket = list_first_entry(head, struct reserve_ticket,
5546                                          list);
5547                if (num_bytes >= ticket->bytes) {
5548                        trace_btrfs_space_reservation(fs_info, "space_info",
5549                                                      space_info->flags,
5550                                                      ticket->bytes, 1);
5551                        list_del_init(&ticket->list);
5552                        num_bytes -= ticket->bytes;
5553                        space_info->bytes_may_use += ticket->bytes;
5554                        ticket->bytes = 0;
5555                        space_info->tickets_id++;
5556                        wake_up(&ticket->wait);
5557                } else {
5558                        trace_btrfs_space_reservation(fs_info, "space_info",
5559                                                      space_info->flags,
5560                                                      num_bytes, 1);
5561                        space_info->bytes_may_use += num_bytes;
5562                        ticket->bytes -= num_bytes;
5563                        num_bytes = 0;
5564                }
5565        }
5566
5567        if (num_bytes && head == &space_info->priority_tickets) {
5568                head = &space_info->tickets;
5569                goto again;
5570        }
5571}
5572
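/*
 * Shrink block_rsv->size by num_bytes ((u64)-1 means the whole rsv) and give
 * any reserved bytes above the new size to the destination rsv, returning
 * what the destination cannot hold to the space_info.  The return value is
 * the number of excess bytes released; *qgroup_to_release_ret, if provided,
 * tells the caller how much qgroup reservation to free or convert.
 */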
5573static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5574                                    struct btrfs_block_rsv *block_rsv,
5575                                    struct btrfs_block_rsv *dest, u64 num_bytes,
5576                                    u64 *qgroup_to_release_ret)
5577{
5578        struct btrfs_space_info *space_info = block_rsv->space_info;
5579        u64 qgroup_to_release = 0;
5580        u64 ret;
5581
5582        spin_lock(&block_rsv->lock);
5583        if (num_bytes == (u64)-1) {
5584                num_bytes = block_rsv->size;
5585                qgroup_to_release = block_rsv->qgroup_rsv_size;
5586        }
5587        block_rsv->size -= num_bytes;
5588        if (block_rsv->reserved >= block_rsv->size) {
5589                num_bytes = block_rsv->reserved - block_rsv->size;
5590                block_rsv->reserved = block_rsv->size;
5591                block_rsv->full = 1;
5592        } else {
5593                num_bytes = 0;
5594        }
5595        if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
5596                qgroup_to_release = block_rsv->qgroup_rsv_reserved -
5597                                    block_rsv->qgroup_rsv_size;
5598                block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
5599        } else {
5600                qgroup_to_release = 0;
5601        }
5602        spin_unlock(&block_rsv->lock);
5603
5604        ret = num_bytes;
5605        if (num_bytes > 0) {
5606                if (dest) {
5607                        spin_lock(&dest->lock);
5608                        if (!dest->full) {
5609                                u64 bytes_to_add;
5610
5611                                bytes_to_add = dest->size - dest->reserved;
5612                                bytes_to_add = min(num_bytes, bytes_to_add);
5613                                dest->reserved += bytes_to_add;
5614                                if (dest->reserved >= dest->size)
5615                                        dest->full = 1;
5616                                num_bytes -= bytes_to_add;
5617                        }
5618                        spin_unlock(&dest->lock);
5619                }
5620                if (num_bytes)
5621                        space_info_add_old_bytes(fs_info, space_info,
5622                                                 num_bytes);
5623        }
5624        if (qgroup_to_release_ret)
5625                *qgroup_to_release_ret = qgroup_to_release;
5626        return ret;
5627}
5628
5629int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
5630                            struct btrfs_block_rsv *dst, u64 num_bytes,
5631                            int update_size)
5632{
5633        int ret;
5634
5635        ret = block_rsv_use_bytes(src, num_bytes);
5636        if (ret)
5637                return ret;
5638
5639        block_rsv_add_bytes(dst, num_bytes, update_size);
5640        return 0;
5641}
5642
5643void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5644{
5645        memset(rsv, 0, sizeof(*rsv));
5646        spin_lock_init(&rsv->lock);
5647        rsv->type = type;
5648}
5649
5650void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
5651                                   struct btrfs_block_rsv *rsv,
5652                                   unsigned short type)
5653{
5654        btrfs_init_block_rsv(rsv, type);
5655        rsv->space_info = __find_space_info(fs_info,
5656                                            BTRFS_BLOCK_GROUP_METADATA);
5657}
5658
5659struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
5660                                              unsigned short type)
5661{
5662        struct btrfs_block_rsv *block_rsv;
5663
5664        block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5665        if (!block_rsv)
5666                return NULL;
5667
5668        btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
5669        return block_rsv;
5670}
5671
5672void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
5673                          struct btrfs_block_rsv *rsv)
5674{
5675        if (!rsv)
5676                return;
5677        btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5678        kfree(rsv);
5679}
5680
5681void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
5682{
5683        kfree(rsv);
5684}
5685
5686int btrfs_block_rsv_add(struct btrfs_root *root,
5687                        struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5688                        enum btrfs_reserve_flush_enum flush)
5689{
5690        int ret;
5691
5692        if (num_bytes == 0)
5693                return 0;
5694
5695        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5696        if (!ret) {
5697                block_rsv_add_bytes(block_rsv, num_bytes, 1);
5698                return 0;
5699        }
5700
5701        return ret;
5702}
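
/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * file): callers typically pair btrfs_block_rsv_add() with
 * btrfs_block_rsv_release() once the reserved metadata is no longer needed.
 * 'rsv' and 'nr_items' below are hypothetical.
 *
 *	u64 bytes = btrfs_calc_trans_metadata_size(fs_info, nr_items);
 *
 *	ret = btrfs_block_rsv_add(root, rsv, bytes, BTRFS_RESERVE_FLUSH_ALL);
 *	if (ret)
 *		return ret;
 *	... consume the reservation ...
 *	btrfs_block_rsv_release(fs_info, rsv, bytes);
 */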
5703
5704int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
5705{
5706        u64 num_bytes = 0;
5707        int ret = -ENOSPC;
5708
5709        if (!block_rsv)
5710                return 0;
5711
5712        spin_lock(&block_rsv->lock);
5713        num_bytes = div_factor(block_rsv->size, min_factor);
5714        if (block_rsv->reserved >= num_bytes)
5715                ret = 0;
5716        spin_unlock(&block_rsv->lock);
5717
5718        return ret;
5719}
5720
5721int btrfs_block_rsv_refill(struct btrfs_root *root,
5722                           struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5723                           enum btrfs_reserve_flush_enum flush)
5724{
5725        u64 num_bytes = 0;
5726        int ret = -ENOSPC;
5727
5728        if (!block_rsv)
5729                return 0;
5730
5731        spin_lock(&block_rsv->lock);
5732        num_bytes = min_reserved;
5733        if (block_rsv->reserved >= num_bytes)
5734                ret = 0;
5735        else
5736                num_bytes -= block_rsv->reserved;
5737        spin_unlock(&block_rsv->lock);
5738
5739        if (!ret)
5740                return 0;
5741
5742        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5743        if (!ret) {
5744                block_rsv_add_bytes(block_rsv, num_bytes, 0);
5745                return 0;
5746        }
5747
5748        return ret;
5749}
5750
5751/**
5752 * btrfs_inode_rsv_refill - refill the inode block rsv.
5753 * @inode - the inode we are refilling.
5754 * @flush - the flushing restriction.
5755 *
5756 * Essentially the same as btrfs_block_rsv_refill, except it uses the
5757 * block_rsv->size as the minimum size.  We'll either refill the missing amount
5758 * or return if we already have enough space.  This will also handle the reserve
5759 * tracepoint for the reserved amount.
5760 */
5761static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
5762                                  enum btrfs_reserve_flush_enum flush)
5763{
5764        struct btrfs_root *root = inode->root;
5765        struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5766        u64 num_bytes = 0;
5767        u64 qgroup_num_bytes = 0;
5768        int ret = -ENOSPC;
5769
5770        spin_lock(&block_rsv->lock);
5771        if (block_rsv->reserved < block_rsv->size)
5772                num_bytes = block_rsv->size - block_rsv->reserved;
5773        if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
5774                qgroup_num_bytes = block_rsv->qgroup_rsv_size -
5775                                   block_rsv->qgroup_rsv_reserved;
5776        spin_unlock(&block_rsv->lock);
5777
5778        if (num_bytes == 0)
5779                return 0;
5780
5781        ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
5782        if (ret)
5783                return ret;
5784        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5785        if (!ret) {
5786                block_rsv_add_bytes(block_rsv, num_bytes, 0);
5787                trace_btrfs_space_reservation(root->fs_info, "delalloc",
5788                                              btrfs_ino(inode), num_bytes, 1);
5789
5790                /* Don't forget to increase qgroup_rsv_reserved */
5791                spin_lock(&block_rsv->lock);
5792                block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
5793                spin_unlock(&block_rsv->lock);
5794        } else
5795                btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
5796        return ret;
5797}
5798
5799/**
5800 * btrfs_inode_rsv_release - release any excessive reservation.
5801 * @inode - the inode we need to release from.
5802 * @qgroup_free - free or convert qgroup meta.
5803 *   Unlike normal operation, qgroup meta reservation needs to know if we are
5804 *   freeing qgroup reservation or just converting it into per-trans.  Normally
5805 *   @qgroup_free is true for error handling, and false for normal release.
5806 *
5807 * This is the same as btrfs_block_rsv_release, except that it handles the
5808 * tracepoint for the reservation.
5809 */
5810static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
5811{
5812        struct btrfs_fs_info *fs_info = inode->root->fs_info;
5813        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5814        struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5815        u64 released = 0;
5816        u64 qgroup_to_release = 0;
5817
5818        /*
5819         * Since we statically set the block_rsv->size we just want to say we
5820         * are releasing 0 bytes, and then we'll just get the reservation over
5821         * the size free'd.
5822         */
5823        released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
5824                                           &qgroup_to_release);
5825        if (released > 0)
5826                trace_btrfs_space_reservation(fs_info, "delalloc",
5827                                              btrfs_ino(inode), released, 0);
5828        if (qgroup_free)
5829                btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
5830        else
5831                btrfs_qgroup_convert_reserved_meta(inode->root,
5832                                                   qgroup_to_release);
5833}
5834
5835void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5836                             struct btrfs_block_rsv *block_rsv,
5837                             u64 num_bytes)
5838{
5839        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5840
5841        if (global_rsv == block_rsv ||
5842            block_rsv->space_info != global_rsv->space_info)
5843                global_rsv = NULL;
5844        block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL);
5845}
5846
5847static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5848{
5849        struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5850        struct btrfs_space_info *sinfo = block_rsv->space_info;
5851        u64 num_bytes;
5852
5853        /*
5854         * The global block rsv is based on the size of the extent tree, the
5855         * checksum tree and the root tree.  If the fs is empty we want to set
5856         * it to a minimal amount for safety.
5857         */
5858        num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
5859                btrfs_root_used(&fs_info->csum_root->root_item) +
5860                btrfs_root_used(&fs_info->tree_root->root_item);
5861        num_bytes = max_t(u64, num_bytes, SZ_16M);
5862
5863        spin_lock(&sinfo->lock);
5864        spin_lock(&block_rsv->lock);
5865
5866        block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5867
5868        if (block_rsv->reserved < block_rsv->size) {
5869                num_bytes = btrfs_space_info_used(sinfo, true);
5870                if (sinfo->total_bytes > num_bytes) {
5871                        num_bytes = sinfo->total_bytes - num_bytes;
5872                        num_bytes = min(num_bytes,
5873                                        block_rsv->size - block_rsv->reserved);
5874                        block_rsv->reserved += num_bytes;
5875                        sinfo->bytes_may_use += num_bytes;
5876                        trace_btrfs_space_reservation(fs_info, "space_info",
5877                                                      sinfo->flags, num_bytes,
5878                                                      1);
5879                }
5880        } else if (block_rsv->reserved > block_rsv->size) {
5881                num_bytes = block_rsv->reserved - block_rsv->size;
5882                sinfo->bytes_may_use -= num_bytes;
5883                trace_btrfs_space_reservation(fs_info, "space_info",
5884                                      sinfo->flags, num_bytes, 0);
5885                block_rsv->reserved = block_rsv->size;
5886        }
5887
5888        if (block_rsv->reserved == block_rsv->size)
5889                block_rsv->full = 1;
5890        else
5891                block_rsv->full = 0;
5892
5893        spin_unlock(&block_rsv->lock);
5894        spin_unlock(&sinfo->lock);
5895}
5896
5897static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5898{
5899        struct btrfs_space_info *space_info;
5900
5901        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5902        fs_info->chunk_block_rsv.space_info = space_info;
5903
5904        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5905        fs_info->global_block_rsv.space_info = space_info;
5906        fs_info->trans_block_rsv.space_info = space_info;
5907        fs_info->empty_block_rsv.space_info = space_info;
5908        fs_info->delayed_block_rsv.space_info = space_info;
5909
5910        fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
5911        fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
5912        fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5913        fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5914        if (fs_info->quota_root)
5915                fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5916        fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5917
5918        update_global_block_rsv(fs_info);
5919}
5920
5921static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5922{
5923        block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5924                                (u64)-1, NULL);
5925        WARN_ON(fs_info->trans_block_rsv.size > 0);
5926        WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5927        WARN_ON(fs_info->chunk_block_rsv.size > 0);
5928        WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5929        WARN_ON(fs_info->delayed_block_rsv.size > 0);
5930        WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5931}
5932
5933
5934/*
5935 * To be called after all the new block groups attached to the transaction
5936 * handle have been created (btrfs_create_pending_block_groups()).
5937 */
5938void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
5939{
5940        struct btrfs_fs_info *fs_info = trans->fs_info;
5941
5942        if (!trans->chunk_bytes_reserved)
5943                return;
5944
5945        WARN_ON_ONCE(!list_empty(&trans->new_bgs));
5946
5947        block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
5948                                trans->chunk_bytes_reserved, NULL);
5949        trans->chunk_bytes_reserved = 0;
5950}
5951
5952/* Can only return 0 or -ENOSPC */
5953int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
5954                                  struct btrfs_inode *inode)
5955{
5956        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
5957        struct btrfs_root *root = inode->root;
5958        /*
5959         * We always use trans->block_rsv here as we will have reserved space
5960         * for our orphan when starting the transaction, using get_block_rsv()
5961         * here will sometimes make us choose the wrong block rsv as we could be
5962         * doing a reloc inode for a non refcounted root.
5963         */
5964        struct btrfs_block_rsv *src_rsv = trans->block_rsv;
5965        struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
5966
5967        /*
5968         * We need to hold space in order to delete our orphan item once we've
5969         * added it, so this takes the reservation so we can release it later
5970         * when we are truly done with the orphan item.
5971         */
5972        u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
5973
5974        trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
5975                        num_bytes, 1);
5976        return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
5977}
5978
5979void btrfs_orphan_release_metadata(struct btrfs_inode *inode)
5980{
5981        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
5982        struct btrfs_root *root = inode->root;
5983        u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
5984
5985        trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
5986                        num_bytes, 0);
5987        btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, num_bytes);
5988}
5989
5990/*
5991 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
5992 * root: the root of the parent directory
5993 * rsv: block reservation
5994 * items: the number of items that we need to reserve space for
5995 * qgroup_reserved: used to return the reserved size in qgroup
5996 *
5997 * This function is used to reserve the space for snapshot/subvolume
5998 * creation and deletion. Those operations are different from the
5999 * common file/directory operations: they change two fs/file trees
6000 * and the root tree, and the number of items that the qgroup reserves
6001 * is different from the free space reservation. So we can not use
6002 * the space reservation mechanism in start_transaction().
6003 */
6004int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
6005                                     struct btrfs_block_rsv *rsv,
6006                                     int items,
6007                                     u64 *qgroup_reserved,
6008                                     bool use_global_rsv)
6009{
6010        u64 num_bytes;
6011        int ret;
6012        struct btrfs_fs_info *fs_info = root->fs_info;
6013        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6014
6015        if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
6016                /* One for parent inode, two for dir entries */
6017                num_bytes = 3 * fs_info->nodesize;
6018                ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
6019                if (ret)
6020                        return ret;
6021        } else {
6022                num_bytes = 0;
6023        }
6024
6025        *qgroup_reserved = num_bytes;
6026
6027        num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
6028        rsv->space_info = __find_space_info(fs_info,
6029                                            BTRFS_BLOCK_GROUP_METADATA);
6030        ret = btrfs_block_rsv_add(root, rsv, num_bytes,
6031                                  BTRFS_RESERVE_FLUSH_ALL);
6032
6033        if (ret == -ENOSPC && use_global_rsv)
6034                ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
6035
6036        if (ret && *qgroup_reserved)
6037                btrfs_qgroup_free_meta_prealloc(root, *qgroup_reserved);
6038
6039        return ret;
6040}
6041
6042void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
6043                                      struct btrfs_block_rsv *rsv)
6044{
6045        btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
6046}
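
/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * file): snapshot/subvolume creation reserves its items up front and releases
 * whatever is left when the operation finishes.  'rsv', 'items' and
 * 'qgroup_reserved' below are hypothetical locals.
 *
 *	ret = btrfs_subvolume_reserve_metadata(root, &rsv, items,
 *					       &qgroup_reserved, true);
 *	if (ret)
 *		return ret;
 *	... create the snapshot/subvolume ...
 *	btrfs_subvolume_release_metadata(fs_info, &rsv);
 */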
6047
6048static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
6049                                                 struct btrfs_inode *inode)
6050{
6051        struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
6052        u64 reserve_size = 0;
6053        u64 qgroup_rsv_size = 0;
6054        u64 csum_leaves;
6055        unsigned outstanding_extents;
6056
6057        lockdep_assert_held(&inode->lock);
6058        outstanding_extents = inode->outstanding_extents;
6059        if (outstanding_extents)
6060                reserve_size = btrfs_calc_trans_metadata_size(fs_info,
6061                                                outstanding_extents + 1);
6062        csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
6063                                                 inode->csum_bytes);
6064        reserve_size += btrfs_calc_trans_metadata_size(fs_info,
6065                                                       csum_leaves);
6066        /*
6067         * For qgroup rsv, the calculation is very simple:
6068         * account one nodesize for each outstanding extent
6069         *
6070         * This is overestimating in most cases.
6071         */
6072        qgroup_rsv_size = outstanding_extents * fs_info->nodesize;
6073
6074        spin_lock(&block_rsv->lock);
6075        block_rsv->size = reserve_size;
6076        block_rsv->qgroup_rsv_size = qgroup_rsv_size;
6077        spin_unlock(&block_rsv->lock);
6078}
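
/*
 * Editorial worked example (not part of the original file), assuming a 16K
 * nodesize and that btrfs_calc_trans_metadata_size() charges
 * 2 * nodesize * BTRFS_MAX_LEVEL (256K here) per item: an inode with one
 * outstanding extent whose csums still fit in a single leaf gets
 *
 *	reserve_size    = 256K * (1 + 1) + 256K * 1 = 768K
 *	qgroup_rsv_size = 1 * 16K                   = 16K
 */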
6079
6080int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
6081{
6082        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6083        unsigned nr_extents;
6084        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
6085        int ret = 0;
6086        bool delalloc_lock = true;
6087
6088        /* If we are a free space inode we need to not flush since we will be in
6089         * the middle of a transaction commit.  We also don't need the delalloc
6090         * mutex since we won't race with anybody.  We need this mostly to make
6091         * lockdep shut its filthy mouth.
6092         *
6093         * If we have a transaction open (can happen if we call truncate_block
6094         * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
6095         */
6096        if (btrfs_is_free_space_inode(inode)) {
6097                flush = BTRFS_RESERVE_NO_FLUSH;
6098                delalloc_lock = false;
6099        } else {
6100                if (current->journal_info)
6101                        flush = BTRFS_RESERVE_FLUSH_LIMIT;
6102
6103                if (btrfs_transaction_in_commit(fs_info))
6104                        schedule_timeout(1);
6105        }
6106
6107        if (delalloc_lock)
6108                mutex_lock(&inode->delalloc_mutex);
6109
6110        num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6111
6112        /* Add our new extents and calculate the new rsv size. */
6113        spin_lock(&inode->lock);
6114        nr_extents = count_max_extents(num_bytes);
6115        btrfs_mod_outstanding_extents(inode, nr_extents);
6116        inode->csum_bytes += num_bytes;
6117        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6118        spin_unlock(&inode->lock);
6119
6120        ret = btrfs_inode_rsv_refill(inode, flush);
6121        if (unlikely(ret))
6122                goto out_fail;
6123
6124        if (delalloc_lock)
6125                mutex_unlock(&inode->delalloc_mutex);
6126        return 0;
6127
6128out_fail:
6129        spin_lock(&inode->lock);
6130        nr_extents = count_max_extents(num_bytes);
6131        btrfs_mod_outstanding_extents(inode, -nr_extents);
6132        inode->csum_bytes -= num_bytes;
6133        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6134        spin_unlock(&inode->lock);
6135
6136        btrfs_inode_rsv_release(inode, true);
6137        if (delalloc_lock)
6138                mutex_unlock(&inode->delalloc_mutex);
6139        return ret;
6140}
6141
6142/**
6143 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
6144 * @inode: the inode to release the reservation for.
6145 * @num_bytes: the number of bytes we are releasing.
6146 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
6147 *
6148 * This will release the metadata reservation for an inode.  This can be called
6149 * once we complete IO for a given set of bytes to release their metadata
6150 * reservations, or on error for the same reason.
6151 */
6152void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
6153                                     bool qgroup_free)
6154{
6155        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6156
6157        num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6158        spin_lock(&inode->lock);
6159        inode->csum_bytes -= num_bytes;
6160        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6161        spin_unlock(&inode->lock);
6162
6163        if (btrfs_is_testing(fs_info))
6164                return;
6165
6166        btrfs_inode_rsv_release(inode, qgroup_free);
6167}
6168
6169/**
6170 * btrfs_delalloc_release_extents - release our outstanding_extents
6171 * @inode: the inode to balance the reservation for.
6172 * @num_bytes: the number of bytes we originally reserved
6173 * @qgroup_free: do we need to free qgroup meta reservation or convert them.
6174 *
6175 * When we reserve space we increase outstanding_extents for the extents we may
6176 * add.  Once we've set the range as delalloc or created our ordered extents we
6177 * have outstanding_extents to track the real usage, so we use this to free our
6178 * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
6179 * with btrfs_delalloc_reserve_metadata.
6180 */
6181void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
6182                                    bool qgroup_free)
6183{
6184        struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6185        unsigned num_extents;
6186
6187        spin_lock(&inode->lock);
6188        num_extents = count_max_extents(num_bytes);
6189        btrfs_mod_outstanding_extents(inode, -num_extents);
6190        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6191        spin_unlock(&inode->lock);
6192
6193        if (btrfs_is_testing(fs_info))
6194                return;
6195
6196        btrfs_inode_rsv_release(inode, qgroup_free);
6197}
6198
6199/**
6200 * btrfs_delalloc_reserve_space - reserve data and metadata space for
6201 * delalloc
6202 * @inode: inode we're writing to
6203 * @start: start of the range we are writing to
6204 * @len: the length of the range we are writing
6205 * @reserved: mandatory parameter, records the actually reserved qgroup ranges
6206 *            of the current reservation.
6207 *
6208 * This will do the following things
6209 *
6210 * o reserve space in data space info for num bytes
6211 *   and reserve the corresponding qgroup space
6212 *   (Done in check_data_free_space)
6213 *
6214 * o reserve metadata space, based on the number of outstanding
6215 *   extents and how many csums will be needed;
6216 *   also reserve metadata space in a per-root over-reserve manner.
6217 * o add to the inode's delalloc_bytes
6218 * o add it to the fs_info's delalloc inodes list.
6219 *   (Above 3 all done in delalloc_reserve_metadata)
6220 *
6221 * Return 0 for success
6222 * Return <0 for error (-ENOSPC or -EDQUOT)
6223 */
6224int btrfs_delalloc_reserve_space(struct inode *inode,
6225                        struct extent_changeset **reserved, u64 start, u64 len)
6226{
6227        int ret;
6228
6229        ret = btrfs_check_data_free_space(inode, reserved, start, len);
6230        if (ret < 0)
6231                return ret;
6232        ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
6233        if (ret < 0)
6234                btrfs_free_reserved_data_space(inode, *reserved, start, len);
6235        return ret;
6236}
6237
6238/**
6239 * btrfs_delalloc_release_space - release data and metadata space for delalloc
6240 * @inode: inode we're releasing space for
6241 * @start: start position of the space already reserved
6242 * @len: the length of the space already reserved
6243 * @qgroup_free: whether to free the qgroup reservation or convert it to per-trans
6244 *
6245 * This function will release the metadata space that was not used and will
6246 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
6247 * list if there are no delalloc bytes left.
6248 * Also it will handle the qgroup reserved space.
6249 */
6250void btrfs_delalloc_release_space(struct inode *inode,
6251                                  struct extent_changeset *reserved,
6252                                  u64 start, u64 len, bool qgroup_free)
6253{
6254        btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
6255        btrfs_free_reserved_data_space(inode, reserved, start, len);
6256}
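
/*
 * Illustrative usage sketch (editorial addition, not part of the original
 * file): a write path caller reserves data + metadata for a range, drops the
 * temporary outstanding_extents once the delalloc range is set up, and on
 * error returns everything.  Names below are hypothetical; whether
 * qgroup_free is true or false depends on the caller's error handling.
 *
 *	struct extent_changeset *reserved = NULL;
 *
 *	ret = btrfs_delalloc_reserve_space(inode, &reserved, start, len);
 *	if (ret)
 *		return ret;
 *	... set the range delalloc / create the ordered extent ...
 *	btrfs_delalloc_release_extents(BTRFS_I(inode), len, false);
 *	extent_changeset_free(reserved);
 *
 * with btrfs_delalloc_release_space(inode, reserved, start, len, true) used
 * instead on failure.
 */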
6257
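/*
 * Update the used-space accounting for an allocation or free of [bytenr,
 * bytenr + num_bytes): the superblock's bytes_used, the block group item and
 * the space_info counters.  Frees are pinned rather than returned directly,
 * the block group is queued on the transaction's dirty list, and a block
 * group that drops to zero used bytes is queued for possible deletion.
 */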
6258static int update_block_group(struct btrfs_trans_handle *trans,
6259                              struct btrfs_fs_info *info, u64 bytenr,
6260                              u64 num_bytes, int alloc)
6261{
6262        struct btrfs_block_group_cache *cache = NULL;
6263        u64 total = num_bytes;
6264        u64 old_val;
6265        u64 byte_in_group;
6266        int factor;
6267
6268        /* block accounting for super block */
6269        spin_lock(&info->delalloc_root_lock);
6270        old_val = btrfs_super_bytes_used(info->super_copy);
6271        if (alloc)
6272                old_val += num_bytes;
6273        else
6274                old_val -= num_bytes;
6275        btrfs_set_super_bytes_used(info->super_copy, old_val);
6276        spin_unlock(&info->delalloc_root_lock);
6277
6278        while (total) {
6279                cache = btrfs_lookup_block_group(info, bytenr);
6280                if (!cache)
6281                        return -ENOENT;
6282                if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
6283                                    BTRFS_BLOCK_GROUP_RAID1 |
6284                                    BTRFS_BLOCK_GROUP_RAID10))
6285                        factor = 2;
6286                else
6287                        factor = 1;
6288                /*
6289                 * If this block group has free space cache written out, we
6290                 * need to make sure to load it if we are removing space.  This
6291                 * is because we need the unpinning stage to actually add the
6292                 * space back to the block group, otherwise we will leak space.
6293                 */
6294                if (!alloc && cache->cached == BTRFS_CACHE_NO)
6295                        cache_block_group(cache, 1);
6296
6297                byte_in_group = bytenr - cache->key.objectid;
6298                WARN_ON(byte_in_group > cache->key.offset);
6299
6300                spin_lock(&cache->space_info->lock);
6301                spin_lock(&cache->lock);
6302
6303                if (btrfs_test_opt(info, SPACE_CACHE) &&
6304                    cache->disk_cache_state < BTRFS_DC_CLEAR)
6305                        cache->disk_cache_state = BTRFS_DC_CLEAR;
6306
6307                old_val = btrfs_block_group_used(&cache->item);
6308                num_bytes = min(total, cache->key.offset - byte_in_group);
6309                if (alloc) {
6310                        old_val += num_bytes;
6311                        btrfs_set_block_group_used(&cache->item, old_val);
6312                        cache->reserved -= num_bytes;
6313                        cache->space_info->bytes_reserved -= num_bytes;
6314                        cache->space_info->bytes_used += num_bytes;
6315                        cache->space_info->disk_used += num_bytes * factor;
6316                        spin_unlock(&cache->lock);
6317                        spin_unlock(&cache->space_info->lock);
6318                } else {
6319                        old_val -= num_bytes;
6320                        btrfs_set_block_group_used(&cache->item, old_val);
6321                        cache->pinned += num_bytes;
6322                        cache->space_info->bytes_pinned += num_bytes;
6323                        cache->space_info->bytes_used -= num_bytes;
6324                        cache->space_info->disk_used -= num_bytes * factor;
6325                        spin_unlock(&cache->lock);
6326                        spin_unlock(&cache->space_info->lock);
6327
6328                        trace_btrfs_space_reservation(info, "pinned",
6329                                                      cache->space_info->flags,
6330                                                      num_bytes, 1);
6331                        percpu_counter_add(&cache->space_info->total_bytes_pinned,
6332                                           num_bytes);
6333                        set_extent_dirty(info->pinned_extents,
6334                                         bytenr, bytenr + num_bytes - 1,
6335                                         GFP_NOFS | __GFP_NOFAIL);
6336                }
6337
6338                spin_lock(&trans->transaction->dirty_bgs_lock);
6339                if (list_empty(&cache->dirty_list)) {
6340                        list_add_tail(&cache->dirty_list,
6341                                      &trans->transaction->dirty_bgs);
6342                        trans->transaction->num_dirty_bgs++;
6343                        btrfs_get_block_group(cache);
6344                }
6345                spin_unlock(&trans->transaction->dirty_bgs_lock);
6346
6347                /*
6348                 * No longer have used bytes in this block group, queue it for
6349                 * deletion. We do this after adding the block group to the
6350                 * dirty list to avoid races between cleaner kthread and space
6351                 * cache writeout.
6352                 */
6353                if (!alloc && old_val == 0) {
6354                        spin_lock(&info->unused_bgs_lock);
6355                        if (list_empty(&cache->bg_list)) {
6356                                btrfs_get_block_group(cache);
6357                                list_add_tail(&cache->bg_list,
6358                                              &info->unused_bgs);
6359                        }
6360                        spin_unlock(&info->unused_bgs_lock);
6361                }
6362
6363                btrfs_put_block_group(cache);
6364                total -= num_bytes;
6365                bytenr += num_bytes;
6366        }
6367        return 0;
6368}
6369
6370static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
6371{
6372        struct btrfs_block_group_cache *cache;
6373        u64 bytenr;
6374
6375        spin_lock(&fs_info->block_group_cache_lock);
6376        bytenr = fs_info->first_logical_byte;
6377        spin_unlock(&fs_info->block_group_cache_lock);
6378
6379        if (bytenr < (u64)-1)
6380                return bytenr;
6381
6382        cache = btrfs_lookup_first_block_group(fs_info, search_start);
6383        if (!cache)
6384                return 0;
6385
6386        bytenr = cache->key.objectid;
6387        btrfs_put_block_group(cache);
6388
6389        return bytenr;
6390}
6391
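/*
 * Account num_bytes of a block group as pinned (moving it out of the reserved
 * counters if @reserved) and mark the range dirty in the current
 * pinned_extents tree.
 */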
6392static int pin_down_extent(struct btrfs_fs_info *fs_info,
6393                           struct btrfs_block_group_cache *cache,
6394                           u64 bytenr, u64 num_bytes, int reserved)
6395{
6396        spin_lock(&cache->space_info->lock);
6397        spin_lock(&cache->lock);
6398        cache->pinned += num_bytes;
6399        cache->space_info->bytes_pinned += num_bytes;
6400        if (reserved) {
6401                cache->reserved -= num_bytes;
6402                cache->space_info->bytes_reserved -= num_bytes;
6403        }
6404        spin_unlock(&cache->lock);
6405        spin_unlock(&cache->space_info->lock);
6406
6407        trace_btrfs_space_reservation(fs_info, "pinned",
6408                                      cache->space_info->flags, num_bytes, 1);
6409        percpu_counter_add(&cache->space_info->total_bytes_pinned, num_bytes);
6410        set_extent_dirty(fs_info->pinned_extents, bytenr,
6411                         bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
6412        return 0;
6413}
6414
6415/*
6416 * this function must be called within transaction
6417 */
6418int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
6419                     u64 bytenr, u64 num_bytes, int reserved)
6420{
6421        struct btrfs_block_group_cache *cache;
6422
6423        cache = btrfs_lookup_block_group(fs_info, bytenr);
6424        BUG_ON(!cache); /* Logic error */
6425
6426        pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved);
6427
6428        btrfs_put_block_group(cache);
6429        return 0;
6430}
6431
6432/*
6433 * this function must be called within transaction
6434 */
6435int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
6436                                    u64 bytenr, u64 num_bytes)
6437{
6438        struct btrfs_block_group_cache *cache;
6439        int ret;
6440
6441        cache = btrfs_lookup_block_group(fs_info, bytenr);
6442        if (!cache)
6443                return -EINVAL;
6444
6445        /*
6446         * pull in the free space cache (if any) so that our pin
6447         * removes the free space from the cache.  We have load_only set
6448         * to one because the slow code to read in the free extents does check
6449         * the pinned extents.
6450         */
6451        cache_block_group(cache, 1);
6452
6453        pin_down_extent(fs_info, cache, bytenr, num_bytes, 0);
6454
6455        /* remove us from the free space cache (if we're there at all) */
6456        ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
6457        btrfs_put_block_group(cache);
6458        return ret;
6459}
6460
6461static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
6462                                   u64 start, u64 num_bytes)
6463{
6464        int ret;
6465        struct btrfs_block_group_cache *block_group;
6466        struct btrfs_caching_control *caching_ctl;
6467
6468        block_group = btrfs_lookup_block_group(fs_info, start);
6469        if (!block_group)
6470                return -EINVAL;
6471
6472        cache_block_group(block_group, 0);
6473        caching_ctl = get_caching_control(block_group);
6474
6475        if (!caching_ctl) {
6476                /* Logic error */
6477                BUG_ON(!block_group_cache_done(block_group));
6478                ret = btrfs_remove_free_space(block_group, start, num_bytes);
6479        } else {
6480                mutex_lock(&caching_ctl->mutex);
6481
6482                if (start >= caching_ctl->progress) {
6483                        ret = add_excluded_extent(fs_info, start, num_bytes);
6484                } else if (start + num_bytes <= caching_ctl->progress) {
6485                        ret = btrfs_remove_free_space(block_group,
6486                                                      start, num_bytes);
6487                } else {
6488                        num_bytes = caching_ctl->progress - start;
6489                        ret = btrfs_remove_free_space(block_group,
6490                                                      start, num_bytes);
6491                        if (ret)
6492                                goto out_lock;
6493
6494                        num_bytes = (start + num_bytes) -
6495                                caching_ctl->progress;
6496                        start = caching_ctl->progress;
6497                        ret = add_excluded_extent(fs_info, start, num_bytes);
6498                }
6499out_lock:
6500                mutex_unlock(&caching_ctl->mutex);
6501                put_caching_control(caching_ctl);
6502        }
6503        btrfs_put_block_group(block_group);
6504        return ret;
6505}
6506
6507int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
6508                                 struct extent_buffer *eb)
6509{
6510        struct btrfs_file_extent_item *item;
6511        struct btrfs_key key;
6512        int found_type;
6513        int i;
6514
6515        if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
6516                return 0;
6517
6518        for (i = 0; i < btrfs_header_nritems(eb); i++) {
6519                btrfs_item_key_to_cpu(eb, &key, i);
6520                if (key.type != BTRFS_EXTENT_DATA_KEY)
6521                        continue;
6522                item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
6523                found_type = btrfs_file_extent_type(eb, item);
6524                if (found_type == BTRFS_FILE_EXTENT_INLINE)
6525                        continue;
6526                if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6527                        continue;
6528                key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
6529                key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
6530                __exclude_logged_extent(fs_info, key.objectid, key.offset);
6531        }
6532
6533        return 0;
6534}
6535
6536static void
6537btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
6538{
6539        atomic_inc(&bg->reservations);
6540}
6541
6542void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6543                                        const u64 start)
6544{
6545        struct btrfs_block_group_cache *bg;
6546
6547        bg = btrfs_lookup_block_group(fs_info, start);
6548        ASSERT(bg);
6549        if (atomic_dec_and_test(&bg->reservations))
6550                wake_up_var(&bg->reservations);
6551        btrfs_put_block_group(bg);
6552}
6553
6554void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6555{
6556        struct btrfs_space_info *space_info = bg->space_info;
6557
6558        ASSERT(bg->ro);
6559
6560        if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6561                return;
6562
6563        /*
6564         * Our block group is read only but before we set it to read only,
6565         * some task might have allocated an extent from it already, but it
6566         * has not yet created a respective ordered extent (and added it to a
6567         * root's list of ordered extents).
6568         * Therefore wait for any task currently allocating extents, since the
6569         * block group's reservations counter is incremented while a read lock
6570         * on the groups' semaphore is held and decremented after releasing
6571         * the read access on that semaphore and creating the ordered extent.
6572         */
6573        down_write(&space_info->groups_sem);
6574        up_write(&space_info->groups_sem);
6575
6576        wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
6577}
6578
6579/**
6580 * btrfs_add_reserved_bytes - update the block_group and space info counters
6581 * @cache:      The cache we are manipulating
6582 * @ram_bytes:  The number of bytes of file content; this will be the same
6583 *              as @num_bytes except for the compress path.
6584 * @num_bytes:  The number of bytes in question
6585 * @delalloc:   The blocks are allocated for the delalloc write
6586 *
6587 * This is called by the allocator when it reserves space. If this is a
6588 * reservation and the block group has become read only we cannot make the
6589 * reservation and return -EAGAIN, otherwise this function always succeeds.
6590 */
6591static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6592                                    u64 ram_bytes, u64 num_bytes, int delalloc)
6593{
6594        struct btrfs_space_info *space_info = cache->space_info;
6595        int ret = 0;
6596
6597        spin_lock(&space_info->lock);
6598        spin_lock(&cache->lock);
6599        if (cache->ro) {
6600                ret = -EAGAIN;
6601        } else {
6602                cache->reserved += num_bytes;
6603                space_info->bytes_reserved += num_bytes;
6604
6605                trace_btrfs_space_reservation(cache->fs_info,
6606                                "space_info", space_info->flags,
6607                                ram_bytes, 0);
6608                space_info->bytes_may_use -= ram_bytes;
6609                if (delalloc)
6610                        cache->delalloc_bytes += num_bytes;
6611        }
6612        spin_unlock(&cache->lock);
6613        spin_unlock(&space_info->lock);
6614        return ret;
6615}
6616
6617/**
6618 * btrfs_free_reserved_bytes - update the block_group and space info counters
6619 * @cache:      The cache we are manipulating
6620 * @num_bytes:  The number of bytes in question
6621 * @delalloc:   The blocks are allocated for the delalloc write
6622 *
6623 * This is called by somebody who is freeing space that was never actually used
6624 * on disk.  For example if you reserve some space for a new leaf in transaction
6625 * A and before transaction A commits you free that leaf, you call this in
6626 * order to clear the reservation.
6627 */
6628
6629static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
6630                                     u64 num_bytes, int delalloc)
6631{
6632        struct btrfs_space_info *space_info = cache->space_info;
6633        int ret = 0;
6634
6635        spin_lock(&space_info->lock);
6636        spin_lock(&cache->lock);
6637        if (cache->ro)
6638                space_info->bytes_readonly += num_bytes;
6639        cache->reserved -= num_bytes;
6640        space_info->bytes_reserved -= num_bytes;
6641
6642        if (delalloc)
6643                cache->delalloc_bytes -= num_bytes;
6644        spin_unlock(&cache->lock);
6645        spin_unlock(&space_info->lock);
6646        return ret;
6647}
6648void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
6649{
6650        struct btrfs_caching_control *next;
6651        struct btrfs_caching_control *caching_ctl;
6652        struct btrfs_block_group_cache *cache;
6653
6654        down_write(&fs_info->commit_root_sem);
6655
6656        list_for_each_entry_safe(caching_ctl, next,
6657                                 &fs_info->caching_block_groups, list) {
6658                cache = caching_ctl->block_group;
6659                if (block_group_cache_done(cache)) {
6660                        cache->last_byte_to_unpin = (u64)-1;
6661                        list_del_init(&caching_ctl->list);
6662                        put_caching_control(caching_ctl);
6663                } else {
6664                        cache->last_byte_to_unpin = caching_ctl->progress;
6665                }
6666        }
6667
6668        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6669                fs_info->pinned_extents = &fs_info->freed_extents[1];
6670        else
6671                fs_info->pinned_extents = &fs_info->freed_extents[0];
6672
6673        up_write(&fs_info->commit_root_sem);
6674
6675        update_global_block_rsv(fs_info);
6676}
6677
6678/*
6679 * Returns the free cluster for the given space info and sets empty_cluster to
6680 * what it should be based on the mount options.
6681 */
6682static struct btrfs_free_cluster *
6683fetch_cluster_info(struct btrfs_fs_info *fs_info,
6684                   struct btrfs_space_info *space_info, u64 *empty_cluster)
6685{
6686        struct btrfs_free_cluster *ret = NULL;
6687
6688        *empty_cluster = 0;
6689        if (btrfs_mixed_space_info(space_info))
6690                return ret;
6691
6692        if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
6693                ret = &fs_info->meta_alloc_cluster;
6694                if (btrfs_test_opt(fs_info, SSD))
6695                        *empty_cluster = SZ_2M;
6696                else
6697                        *empty_cluster = SZ_64K;
6698        } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
6699                   btrfs_test_opt(fs_info, SSD_SPREAD)) {
6700                *empty_cluster = SZ_2M;
6701                ret = &fs_info->data_alloc_cluster;
6702        }
6703
6704        return ret;
6705}
6706
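/*
 * Walk [start, end] block group by block group, dropping the pinned
 * accounting for each range and, if return_free_space is set, handing the
 * space back to the free space cache, the global block rsv and any waiting
 * reservation tickets.
 */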
6707static int unpin_extent_range(struct btrfs_fs_info *fs_info,
6708                              u64 start, u64 end,
6709                              const bool return_free_space)
6710{
6711        struct btrfs_block_group_cache *cache = NULL;
6712        struct btrfs_space_info *space_info;
6713        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6714        struct btrfs_free_cluster *cluster = NULL;
6715        u64 len;
6716        u64 total_unpinned = 0;
6717        u64 empty_cluster = 0;
6718        bool readonly;
6719
6720        while (start <= end) {
6721                readonly = false;
6722                if (!cache ||
6723                    start >= cache->key.objectid + cache->key.offset) {
6724                        if (cache)
6725                                btrfs_put_block_group(cache);
6726                        total_unpinned = 0;
6727                        cache = btrfs_lookup_block_group(fs_info, start);
6728                        BUG_ON(!cache); /* Logic error */
6729
6730                        cluster = fetch_cluster_info(fs_info,
6731                                                     cache->space_info,
6732                                                     &empty_cluster);
6733                        empty_cluster <<= 1;
6734                }
6735
6736                len = cache->key.objectid + cache->key.offset - start;
6737                len = min(len, end + 1 - start);
6738
6739                if (start < cache->last_byte_to_unpin) {
6740                        len = min(len, cache->last_byte_to_unpin - start);
6741                        if (return_free_space)
6742                                btrfs_add_free_space(cache, start, len);
6743                }
6744
6745                start += len;
6746                total_unpinned += len;
6747                space_info = cache->space_info;
6748
6749                /*
6750                 * If this space cluster has been marked as fragmented and we've
6751                 * unpinned enough in this block group to potentially allow a
6752                 * cluster to be created inside of it go ahead and clear the
6753                 * fragmented check.
6754                 */
6755                if (cluster && cluster->fragmented &&
6756                    total_unpinned > empty_cluster) {
6757                        spin_lock(&cluster->lock);
6758                        cluster->fragmented = 0;
6759                        spin_unlock(&cluster->lock);
6760                }
6761
6762                spin_lock(&space_info->lock);
6763                spin_lock(&cache->lock);
6764                cache->pinned -= len;
6765                space_info->bytes_pinned -= len;
6766
6767                trace_btrfs_space_reservation(fs_info, "pinned",
6768                                              space_info->flags, len, 0);
6769                space_info->max_extent_size = 0;
6770                percpu_counter_add(&space_info->total_bytes_pinned, -len);
6771                if (cache->ro) {
6772                        space_info->bytes_readonly += len;
6773                        readonly = true;
6774                }
6775                spin_unlock(&cache->lock);
6776                if (!readonly && return_free_space &&
6777                    global_rsv->space_info == space_info) {
6778                        u64 to_add = len;
6779
6780                        spin_lock(&global_rsv->lock);
6781                        if (!global_rsv->full) {
6782                                to_add = min(len, global_rsv->size -
6783                                             global_rsv->reserved);
6784                                global_rsv->reserved += to_add;
6785                                space_info->bytes_may_use += to_add;
6786                                if (global_rsv->reserved >= global_rsv->size)
6787                                        global_rsv->full = 1;
6788                                trace_btrfs_space_reservation(fs_info,
6789                                                              "space_info",
6790                                                              space_info->flags,
6791                                                              to_add, 1);
6792                                len -= to_add;
6793                        }
6794                        spin_unlock(&global_rsv->lock);
6795                        /* Add to any tickets we may have */
6796                        if (len)
6797                                space_info_add_new_bytes(fs_info, space_info,
6798                                                         len);
6799                }
6800                spin_unlock(&space_info->lock);
6801        }
6802
6803        if (cache)
6804                btrfs_put_block_group(cache);
6805        return 0;
6806}
6807
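    /*
     * Called during transaction commit to unpin all extents that were freed
     * while the transaction ran: walk the relevant freed_extents tree,
     * optionally discard the ranges (mount -o discard) and return the space,
     * then finish discarding any block groups deleted in this transaction.
     */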
6808int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
6809{
6810        struct btrfs_fs_info *fs_info = trans->fs_info;
6811        struct btrfs_block_group_cache *block_group, *tmp;
6812        struct list_head *deleted_bgs;
6813        struct extent_io_tree *unpin;
6814        u64 start;
6815        u64 end;
6816        int ret;
6817
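            /*
             * Pick the freed_extents tree that is not currently used to
             * track newly pinned extents; it holds the extents pinned by
             * the transaction that is being committed.
             */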
6818        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6819                unpin = &fs_info->freed_extents[1];
6820        else
6821                unpin = &fs_info->freed_extents[0];
6822
6823        while (!trans->aborted) {
6824                mutex_lock(&fs_info->unused_bg_unpin_mutex);
6825                ret = find_first_extent_bit(unpin, 0, &start, &end,
6826                                            EXTENT_DIRTY, NULL);
6827                if (ret) {
6828                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6829                        break;
6830                }
6831
6832                if (btrfs_test_opt(fs_info, DISCARD))
6833                        ret = btrfs_discard_extent(fs_info, start,
6834                                                   end + 1 - start, NULL);
6835
6836                clear_extent_dirty(unpin, start, end);
6837                unpin_extent_range(fs_info, start, end, true);
6838                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6839                cond_resched();
6840        }
6841
6842        /*
6843         * Transaction is finished.  We don't need the lock anymore.  We
6844         * do need to clean up the block groups in case of a transaction
6845         * abort.
6846         */
6847        deleted_bgs = &trans->transaction->deleted_bgs;
6848        list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
6849                u64 trimmed = 0;
6850
6851                ret = -EROFS;
6852                if (!trans->aborted)
6853                        ret = btrfs_discard_extent(fs_info,
6854                                                   block_group->key.objectid,
6855                                                   block_group->key.offset,
6856                                                   &trimmed);
6857
6858                list_del_init(&block_group->bg_list);
6859                btrfs_put_block_group_trimming(block_group);
6860                btrfs_put_block_group(block_group);
6861
6862                if (ret) {
6863                        const char *errstr = btrfs_decode_error(ret);
6864                        btrfs_warn(fs_info,
6865                           "discard failed while removing blockgroup: errno=%d %s",
6866                                   ret, errstr);
6867                }
6868        }
6869
6870        return 0;
6871}
6872
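    /*
     * Drop refs_to_drop references to the extent described by @node.  The
     * matching backref item is removed and, once the last reference is gone,
     * the extent item itself is deleted, data checksums are dropped, the
     * free space tree is updated and the block group usage is adjusted.
     */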
6873static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6874                                struct btrfs_fs_info *info,
6875                                struct btrfs_delayed_ref_node *node, u64 parent,
6876                                u64 root_objectid, u64 owner_objectid,
6877                                u64 owner_offset, int refs_to_drop,
6878                                struct btrfs_delayed_extent_op *extent_op)
6879{
6880        struct btrfs_key key;
6881        struct btrfs_path *path;
6882        struct btrfs_root *extent_root = info->extent_root;
6883        struct extent_buffer *leaf;
6884        struct btrfs_extent_item *ei;
6885        struct btrfs_extent_inline_ref *iref;
6886        int ret;
6887        int is_data;
6888        int extent_slot = 0;
6889        int found_extent = 0;
6890        int num_to_del = 1;
6891        u32 item_size;
6892        u64 refs;
6893        u64 bytenr = node->bytenr;
6894        u64 num_bytes = node->num_bytes;
6895        int last_ref = 0;
6896        bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
6897
6898        path = btrfs_alloc_path();
6899        if (!path)
6900                return -ENOMEM;
6901
6902        path->reada = READA_FORWARD;
6903        path->leave_spinning = 1;
6904
6905        is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
6906        BUG_ON(!is_data && refs_to_drop != 1);
6907
6908        if (is_data)
6909                skinny_metadata = false;
6910
6911        ret = lookup_extent_backref(trans, info, path, &iref,
6912                                    bytenr, num_bytes, parent,
6913                                    root_objectid, owner_objectid,
6914                                    owner_offset);
6915        if (ret == 0) {
6916                extent_slot = path->slots[0];
6917                while (extent_slot >= 0) {
6918                        btrfs_item_key_to_cpu(path->nodes[0], &key,
6919                                              extent_slot);
6920                        if (key.objectid != bytenr)
6921                                break;
6922                        if (key.type == BTRFS_EXTENT_ITEM_KEY &&
6923                            key.offset == num_bytes) {
6924                                found_extent = 1;
6925                                break;
6926                        }
6927                        if (key.type == BTRFS_METADATA_ITEM_KEY &&
6928                            key.offset == owner_objectid) {
6929                                found_extent = 1;
6930                                break;
6931                        }
6932                        if (path->slots[0] - extent_slot > 5)
6933                                break;
6934                        extent_slot--;
6935                }
6936#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
6937                item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
6938                if (found_extent && item_size < sizeof(*ei))
6939                        found_extent = 0;
6940#endif
6941                if (!found_extent) {
6942                        BUG_ON(iref);
6943                        ret = remove_extent_backref(trans, info, path, NULL,
6944                                                    refs_to_drop,
6945                                                    is_data, &last_ref);
6946                        if (ret) {
6947                                btrfs_abort_transaction(trans, ret);
6948                                goto out;
6949                        }
6950                        btrfs_release_path(path);
6951                        path->leave_spinning = 1;
6952
6953                        key.objectid = bytenr;
6954                        key.type = BTRFS_EXTENT_ITEM_KEY;
6955                        key.offset = num_bytes;
6956
6957                        if (!is_data && skinny_metadata) {
6958                                key.type = BTRFS_METADATA_ITEM_KEY;
6959                                key.offset = owner_objectid;
6960                        }
6961
6962                        ret = btrfs_search_slot(trans, extent_root,
6963                                                &key, path, -1, 1);
6964                        if (ret > 0 && skinny_metadata && path->slots[0]) {
6965                                /*
6966                                 * Couldn't find our skinny metadata item,
6967                                 * see if we have ye olde extent item.
6968                                 */
6969                                path->slots[0]--;
6970                                btrfs_item_key_to_cpu(path->nodes[0], &key,
6971                                                      path->slots[0]);
6972                                if (key.objectid == bytenr &&
6973                                    key.type == BTRFS_EXTENT_ITEM_KEY &&
6974                                    key.offset == num_bytes)
6975                                        ret = 0;
6976                        }
6977
6978                        if (ret > 0 && skinny_metadata) {
6979                                skinny_metadata = false;
6980                                key.objectid = bytenr;
6981                                key.type = BTRFS_EXTENT_ITEM_KEY;
6982                                key.offset = num_bytes;
6983                                btrfs_release_path(path);
6984                                ret = btrfs_search_slot(trans, extent_root,
6985                                                        &key, path, -1, 1);
6986                        }
6987
6988                        if (ret) {
6989                                btrfs_err(info,
6990                                          "umm, got %d back from search, was looking for %llu",
6991                                          ret, bytenr);
6992                                if (ret > 0)
6993                                        btrfs_print_leaf(path->nodes[0]);
6994                        }
6995                        if (ret < 0) {
6996                                btrfs_abort_transaction(trans, ret);
6997                                goto out;
6998                        }
6999                        extent_slot = path->slots[0];
7000                }
7001        } else if (WARN_ON(ret == -ENOENT)) {
7002                btrfs_print_leaf(path->nodes[0]);
7003                btrfs_err(info,
7004                        "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
7005                        bytenr, parent, root_objectid, owner_objectid,
7006                        owner_offset);
7007                btrfs_abort_transaction(trans, ret);
7008                goto out;
7009        } else {
7010                btrfs_abort_transaction(trans, ret);
7011                goto out;
7012        }
7013
7014        leaf = path->nodes[0];
7015        item_size = btrfs_item_size_nr(leaf, extent_slot);
7016#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
7017        if (item_size < sizeof(*ei)) {
7018                BUG_ON(found_extent || extent_slot != path->slots[0]);
7019                ret = convert_extent_item_v0(trans, info, path, owner_objectid,
7020                                             0);
7021                if (ret < 0) {
7022                        btrfs_abort_transaction(trans, ret);
7023                        goto out;
7024                }
7025
7026                btrfs_release_path(path);
7027                path->leave_spinning = 1;
7028
7029                key.objectid = bytenr;
7030                key.type = BTRFS_EXTENT_ITEM_KEY;
7031                key.offset = num_bytes;
7032
7033                ret = btrfs_search_slot(trans, extent_root, &key, path,
7034                                        -1, 1);
7035                if (ret) {
7036                        btrfs_err(info,
7037                                  "umm, got %d back from search, was looking for %llu",
7038                                ret, bytenr);
7039                        btrfs_print_leaf(path->nodes[0]);
7040                }
7041                if (ret < 0) {
7042                        btrfs_abort_transaction(trans, ret);
7043                        goto out;
7044                }
7045
7046                extent_slot = path->slots[0];
7047                leaf = path->nodes[0];
7048                item_size = btrfs_item_size_nr(leaf, extent_slot);
7049        }
7050#endif
7051        BUG_ON(item_size < sizeof(*ei));
7052        ei = btrfs_item_ptr(leaf, extent_slot,
7053                            struct btrfs_extent_item);
7054        if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
7055            key.type == BTRFS_EXTENT_ITEM_KEY) {
7056                struct btrfs_tree_block_info *bi;
7057                BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
7058                bi = (struct btrfs_tree_block_info *)(ei + 1);
7059                WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
7060        }
7061
7062        refs = btrfs_extent_refs(leaf, ei);
7063        if (refs < refs_to_drop) {
7064                btrfs_err(info,
7065                          "trying to drop %d refs but we only have %Lu for bytenr %Lu",
7066                          refs_to_drop, refs, bytenr);
7067                ret = -EINVAL;
7068                btrfs_abort_transaction(trans, ret);
7069                goto out;
7070        }
7071        refs -= refs_to_drop;
7072
7073        if (refs > 0) {
7074                if (extent_op)
7075                        __run_delayed_extent_op(extent_op, leaf, ei);
7076                /*
7077                 * In the case of an inline back ref, the reference count
7078                 * will be updated by remove_extent_backref()
7079                 */
7080                if (iref) {
7081                        BUG_ON(!found_extent);
7082                } else {
7083                        btrfs_set_extent_refs(leaf, ei, refs);
7084                        btrfs_mark_buffer_dirty(leaf);
7085                }
7086                if (found_extent) {
7087                        ret = remove_extent_backref(trans, info, path,
7088                                                    iref, refs_to_drop,
7089                                                    is_data, &last_ref);
7090                        if (ret) {
7091                                btrfs_abort_transaction(trans, ret);
7092                                goto out;
7093                        }
7094                }
7095        } else {
7096                if (found_extent) {
7097                        BUG_ON(is_data && refs_to_drop !=
7098                               extent_data_ref_count(path, iref));
7099                        if (iref) {
7100                                BUG_ON(path->slots[0] != extent_slot);
7101                        } else {
7102                                BUG_ON(path->slots[0] != extent_slot + 1);
7103                                path->slots[0] = extent_slot;
7104                                num_to_del = 2;
7105                        }
7106                }
7107
7108                last_ref = 1;
7109                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
7110                                      num_to_del);
7111                if (ret) {
7112                        btrfs_abort_transaction(trans, ret);
7113                        goto out;
7114                }
7115                btrfs_release_path(path);
7116
7117                if (is_data) {
7118                        ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
7119                        if (ret) {
7120                                btrfs_abort_transaction(trans, ret);
7121                                goto out;
7122                        }
7123                }
7124
7125                ret = add_to_free_space_tree(trans, info, bytenr, num_bytes);
7126                if (ret) {
7127                        btrfs_abort_transaction(trans, ret);
7128                        goto out;
7129                }
7130
7131                ret = update_block_group(trans, info, bytenr, num_bytes, 0);
7132                if (ret) {
7133                        btrfs_abort_transaction(trans, ret);
7134                        goto out;
7135                }
7136        }
7137        btrfs_release_path(path);
7138
7139out:
7140        btrfs_free_path(path);
7141        return ret;
7142}
7143
7144/*
7145 * when we free a block, it is possible (and likely) that we free the last
7146 * delayed ref for that extent as well.  This searches the delayed ref tree for
7147 * a given extent, and if there are no other delayed refs to be processed, it
7148 * removes it from the tree.
7149 */
7150static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
7151                                      u64 bytenr)
7152{
7153        struct btrfs_delayed_ref_head *head;
7154        struct btrfs_delayed_ref_root *delayed_refs;
7155        int ret = 0;
7156
7157        delayed_refs = &trans->transaction->delayed_refs;
7158        spin_lock(&delayed_refs->lock);
7159        head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
7160        if (!head)
7161                goto out_delayed_unlock;
7162
7163        spin_lock(&head->lock);
7164        if (!RB_EMPTY_ROOT(&head->ref_tree))
7165                goto out;
7166
7167        if (head->extent_op) {
7168                if (!head->must_insert_reserved)
7169                        goto out;
7170                btrfs_free_delayed_extent_op(head->extent_op);
7171                head->extent_op = NULL;
7172        }
7173
7174        /*
7175         * waiting for the lock here would deadlock.  If someone else has it
7176         * locked, they are already in the process of dropping it anyway
7177         */
7178        if (!mutex_trylock(&head->mutex))
7179                goto out;
7180
7181        /*
7182         * at this point we have a head with no other entries.  Go
7183         * ahead and process it.
7184         */
7185        rb_erase(&head->href_node, &delayed_refs->href_root);
7186        RB_CLEAR_NODE(&head->href_node);
7187        atomic_dec(&delayed_refs->num_entries);
7188
7189        /*
7190         * we don't take a ref on the node because we're removing it from the
7191         * tree, so we just steal the ref the tree was holding.
7192         */
7193        delayed_refs->num_heads--;
7194        if (head->processing == 0)
7195                delayed_refs->num_heads_ready--;
7196        head->processing = 0;
7197        spin_unlock(&head->lock);
7198        spin_unlock(&delayed_refs->lock);
7199
7200        BUG_ON(head->extent_op);
7201        if (head->must_insert_reserved)
7202                ret = 1;
7203
7204        mutex_unlock(&head->mutex);
7205        btrfs_put_delayed_ref_head(head);
7206        return ret;
7207out:
7208        spin_unlock(&head->lock);
7209
7210out_delayed_unlock:
7211        spin_unlock(&delayed_refs->lock);
7212        return 0;
7213}
7214
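    /*
     * Drop a reference on a tree block that is being freed.  For non-log
     * trees a delayed ref is queued.  If this was the last reference and the
     * block was allocated in the current transaction, the reservation is
     * returned directly, or the block is pinned if it was already written;
     * otherwise the bytes are accounted as pinned when the delayed ref ends
     * up dropping the extent.
     */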
7215void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
7216                           struct btrfs_root *root,
7217                           struct extent_buffer *buf,
7218                           u64 parent, int last_ref)
7219{
7220        struct btrfs_fs_info *fs_info = root->fs_info;
7221        int pin = 1;
7222        int ret;
7223
7224        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7225                int old_ref_mod, new_ref_mod;
7226
7227                btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
7228                                   root->root_key.objectid,
7229                                   btrfs_header_level(buf), 0,
7230                                   BTRFS_DROP_DELAYED_REF);
7231                ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start,
7232                                                 buf->len, parent,
7233                                                 root->root_key.objectid,
7234                                                 btrfs_header_level(buf),
7235                                                 BTRFS_DROP_DELAYED_REF, NULL,
7236                                                 &old_ref_mod, &new_ref_mod);
7237                BUG_ON(ret); /* -ENOMEM */
7238                pin = old_ref_mod >= 0 && new_ref_mod < 0;
7239        }
7240
7241        if (last_ref && btrfs_header_generation(buf) == trans->transid) {
7242                struct btrfs_block_group_cache *cache;
7243
7244                if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7245                        ret = check_ref_cleanup(trans, buf->start);
7246                        if (!ret)
7247                                goto out;
7248                }
7249
7250                pin = 0;
7251                cache = btrfs_lookup_block_group(fs_info, buf->start);
7252
7253                if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
7254                        pin_down_extent(fs_info, cache, buf->start,
7255                                        buf->len, 1);
7256                        btrfs_put_block_group(cache);
7257                        goto out;
7258                }
7259
7260                WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
7261
7262                btrfs_add_free_space(cache, buf->start, buf->len);
7263                btrfs_free_reserved_bytes(cache, buf->len, 0);
7264                btrfs_put_block_group(cache);
7265                trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
7266        }
7267out:
7268        if (pin)
7269                add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf),
7270                                 root->root_key.objectid);
7271
7272        if (last_ref) {
7273                /*
7274                 * Deleting the buffer, clear the corrupt flag since it doesn't
7275                 * matter anymore.
7276                 */
7277                clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
7278        }
7279}
7280
7281/* Can return -ENOMEM */
7282int btrfs_free_extent(struct btrfs_trans_handle *trans,
7283                      struct btrfs_root *root,
7284                      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
7285                      u64 owner, u64 offset)
7286{
7287        struct btrfs_fs_info *fs_info = root->fs_info;
7288        int old_ref_mod, new_ref_mod;
7289        int ret;
7290
7291        if (btrfs_is_testing(fs_info))
7292                return 0;
7293
7294        if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
7295                btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
7296                                   root_objectid, owner, offset,
7297                                   BTRFS_DROP_DELAYED_REF);
7298
7299        /*
7300         * tree log blocks never actually go into the extent allocation
7301         * tree, just update pinning info and exit early.
7302         */
7303        if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
7304                WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
7305                /* unlocks the pinned mutex */
7306                btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
7307                old_ref_mod = new_ref_mod = 0;
7308                ret = 0;
7309        } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
7310                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
7311                                                 num_bytes, parent,
7312                                                 root_objectid, (int)owner,
7313                                                 BTRFS_DROP_DELAYED_REF, NULL,
7314                                                 &old_ref_mod, &new_ref_mod);
7315        } else {
7316                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
7317                                                 num_bytes, parent,
7318                                                 root_objectid, owner, offset,
7319                                                 0, BTRFS_DROP_DELAYED_REF,
7320                                                 &old_ref_mod, &new_ref_mod);
7321        }
7322
7323        if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
7324                add_pinned_bytes(fs_info, num_bytes, owner, root_objectid);
7325
7326        return ret;
7327}
7328
7329/*
7330 * when we wait for progress in the block group caching, it's because
7331 * our allocation attempt failed at least once.  So, we must sleep
7332 * and let some progress happen before we try again.
7333 *
7334 * This function will sleep at least once waiting for new free space to
7335 * show up, and then it will check the block group free space numbers
7336 * for our min num_bytes.  Another option is to have it go ahead
7337 * and look in the rbtree for a free extent of a given size, but this
7338 * is a good start.
7339 *
7340 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
7341 * any of the information in this block group.
7342 */
7343static noinline void
7344wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
7345                                u64 num_bytes)
7346{
7347        struct btrfs_caching_control *caching_ctl;
7348
7349        caching_ctl = get_caching_control(cache);
7350        if (!caching_ctl)
7351                return;
7352
7353        wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
7354                   (cache->free_space_ctl->free_space >= num_bytes));
7355
7356        put_caching_control(caching_ctl);
7357}
7358
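    /*
     * Wait until caching of the block group has completed and return -EIO
     * if the caching kthread hit an error.
     */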
7359static noinline int
7360wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
7361{
7362        struct btrfs_caching_control *caching_ctl;
7363        int ret = 0;
7364
7365        caching_ctl = get_caching_control(cache);
7366        if (!caching_ctl)
7367                return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
7368
7369        wait_event(caching_ctl->wait, block_group_cache_done(cache));
7370        if (cache->cached == BTRFS_CACHE_ERROR)
7371                ret = -EIO;
7372        put_caching_control(caching_ctl);
7373        return ret;
7374}
7375
7376static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
7377        [BTRFS_RAID_RAID10]     = "raid10",
7378        [BTRFS_RAID_RAID1]      = "raid1",
7379        [BTRFS_RAID_DUP]        = "dup",
7380        [BTRFS_RAID_RAID0]      = "raid0",
7381        [BTRFS_RAID_SINGLE]     = "single",
7382        [BTRFS_RAID_RAID5]      = "raid5",
7383        [BTRFS_RAID_RAID6]      = "raid6",
7384};
7385
7386static const char *get_raid_name(enum btrfs_raid_types type)
7387{
7388        if (type >= BTRFS_NR_RAID_TYPES)
7389                return NULL;
7390
7391        return btrfs_raid_type_names[type];
7392}
7393
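    /*
     * Stages of find_free_extent(), from least to most aggressive.  See the
     * comment above the loop handling near the end of find_free_extent()
     * for what each stage does.
     */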
7394enum btrfs_loop_type {
7395        LOOP_CACHING_NOWAIT = 0,
7396        LOOP_CACHING_WAIT = 1,
7397        LOOP_ALLOC_CHUNK = 2,
7398        LOOP_NO_EMPTY_SIZE = 3,
7399};
7400
7401static inline void
7402btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
7403                       int delalloc)
7404{
7405        if (delalloc)
7406                down_read(&cache->data_rwsem);
7407}
7408
7409static inline void
7410btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
7411                       int delalloc)
7412{
7413        btrfs_get_block_group(cache);
7414        if (delalloc)
7415                down_read(&cache->data_rwsem);
7416}
7417
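    /*
     * Return the block group that currently backs @cluster, with a reference
     * held and, for delalloc allocations, its data_rwsem taken.  The
     * cluster's refill_lock is acquired here and is still held on return;
     * if we have to sleep on the rwsem, the refill_lock is dropped and
     * retaken and we recheck that the cluster still points at the same
     * block group.
     */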
7418static struct btrfs_block_group_cache *
7419btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
7420                   struct btrfs_free_cluster *cluster,
7421                   int delalloc)
7422{
7423        struct btrfs_block_group_cache *used_bg = NULL;
7424
7425        spin_lock(&cluster->refill_lock);
7426        while (1) {
7427                used_bg = cluster->block_group;
7428                if (!used_bg)
7429                        return NULL;
7430
7431                if (used_bg == block_group)
7432                        return used_bg;
7433
7434                btrfs_get_block_group(used_bg);
7435
7436                if (!delalloc)
7437                        return used_bg;
7438
7439                if (down_read_trylock(&used_bg->data_rwsem))
7440                        return used_bg;
7441
7442                spin_unlock(&cluster->refill_lock);
7443
7444                /* We should only have one-level nested. */
7445                down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
7446
7447                spin_lock(&cluster->refill_lock);
7448                if (used_bg == cluster->block_group)
7449                        return used_bg;
7450
7451                up_read(&used_bg->data_rwsem);
7452                btrfs_put_block_group(used_bg);
7453        }
7454}
7455
7456static inline void
7457btrfs_release_block_group(struct btrfs_block_group_cache *cache,
7458                         int delalloc)
7459{
7460        if (delalloc)
7461                up_read(&cache->data_rwsem);
7462        btrfs_put_block_group(cache);
7463}
7464
7465/*
7466 * walks the btree of allocated extents and finds a hole of a given size.
7467 * The key ins is changed to record the hole:
7468 * ins->objectid == start position
7469 * ins->flags = BTRFS_EXTENT_ITEM_KEY
7470 * ins->offset == the size of the hole.
7471 * Any available blocks before search_start are skipped.
7472 *
7473 * If there is no suitable free space, we will record the max size of
7474 * the free space extent currently available.
7475 */
7476static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
7477                                u64 ram_bytes, u64 num_bytes, u64 empty_size,
7478                                u64 hint_byte, struct btrfs_key *ins,
7479                                u64 flags, int delalloc)
7480{
7481        int ret = 0;
7482        struct btrfs_root *root = fs_info->extent_root;
7483        struct btrfs_free_cluster *last_ptr = NULL;
7484        struct btrfs_block_group_cache *block_group = NULL;
7485        u64 search_start = 0;
7486        u64 max_extent_size = 0;
7487        u64 empty_cluster = 0;
7488        struct btrfs_space_info *space_info;
7489        int loop = 0;
7490        int index = btrfs_bg_flags_to_raid_index(flags);
7491        bool failed_cluster_refill = false;
7492        bool failed_alloc = false;
7493        bool use_cluster = true;
7494        bool have_caching_bg = false;
7495        bool orig_have_caching_bg = false;
7496        bool full_search = false;
7497
7498        WARN_ON(num_bytes < fs_info->sectorsize);
7499        ins->type = BTRFS_EXTENT_ITEM_KEY;
7500        ins->objectid = 0;
7501        ins->offset = 0;
7502
7503        trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
7504
7505        space_info = __find_space_info(fs_info, flags);
7506        if (!space_info) {
7507                btrfs_err(fs_info, "No space info for %llu", flags);
7508                return -ENOSPC;
7509        }
7510
7511        /*
7512         * If our free space is heavily fragmented we may not be able to make
7513         * big contiguous allocations, so instead of doing the expensive search
7514         * for free space, simply return ENOSPC with our max_extent_size so we
7515         * can go ahead and search for a more manageable chunk.
7516         *
7517         * If our max_extent_size is large enough for our allocation simply
7518         * disable clustering since we will likely not be able to find enough
7519         * space to create a cluster and induce latency trying.
7520         */
7521        if (unlikely(space_info->max_extent_size)) {
7522                spin_lock(&space_info->lock);
7523                if (space_info->max_extent_size &&
7524                    num_bytes > space_info->max_extent_size) {
7525                        ins->offset = space_info->max_extent_size;
7526                        spin_unlock(&space_info->lock);
7527                        return -ENOSPC;
7528                } else if (space_info->max_extent_size) {
7529                        use_cluster = false;
7530                }
7531                spin_unlock(&space_info->lock);
7532        }
7533
7534        last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster);
7535        if (last_ptr) {
7536                spin_lock(&last_ptr->lock);
7537                if (last_ptr->block_group)
7538                        hint_byte = last_ptr->window_start;
7539                if (last_ptr->fragmented) {
7540                        /*
7541                         * We still set window_start so we can keep track of the
7542                         * last place we found an allocation to try and save
7543                         * some time.
7544                         */
7545                        hint_byte = last_ptr->window_start;
7546                        use_cluster = false;
7547                }
7548                spin_unlock(&last_ptr->lock);
7549        }
7550
7551        search_start = max(search_start, first_logical_byte(fs_info, 0));
7552        search_start = max(search_start, hint_byte);
7553        if (search_start == hint_byte) {
7554                block_group = btrfs_lookup_block_group(fs_info, search_start);
7555                /*
7556                 * we don't want to use the block group if it doesn't match our
7557                 * allocation bits, or if it's not cached.
7558                 *
7559                 * However if we are re-searching with an ideal block group
7560                 * picked out then we don't care that the block group is cached.
7561                 */
7562                if (block_group && block_group_bits(block_group, flags) &&
7563                    block_group->cached != BTRFS_CACHE_NO) {
7564                        down_read(&space_info->groups_sem);
7565                        if (list_empty(&block_group->list) ||
7566                            block_group->ro) {
7567                                /*
7568                                 * someone is removing this block group,
7569                                 * we can't jump into the have_block_group
7570                                 * target because our list pointers are not
7571                                 * valid
7572                                 */
7573                                btrfs_put_block_group(block_group);
7574                                up_read(&space_info->groups_sem);
7575                        } else {
7576                                index = btrfs_bg_flags_to_raid_index(
7577                                                block_group->flags);
7578                                btrfs_lock_block_group(block_group, delalloc);
7579                                goto have_block_group;
7580                        }
7581                } else if (block_group) {
7582                        btrfs_put_block_group(block_group);
7583                }
7584        }
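            /*
             * Scan the block groups of the current raid index.  We jump
             * back here whenever we move on to the next raid index or
             * escalate to a more aggressive loop stage.
             */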
7585search:
7586        have_caching_bg = false;
7587        if (index == 0 || index == btrfs_bg_flags_to_raid_index(flags))
7588                full_search = true;
7589        down_read(&space_info->groups_sem);
7590        list_for_each_entry(block_group, &space_info->block_groups[index],
7591                            list) {
7592                u64 offset;
7593                int cached;
7594
7595                /* If the block group is read-only, we can skip it entirely. */
7596                if (unlikely(block_group->ro))
7597                        continue;
7598
7599                btrfs_grab_block_group(block_group, delalloc);
7600                search_start = block_group->key.objectid;
7601
7602                /*
7603                 * this can happen if we end up cycling through all the
7604                 * raid types, but we want to make sure we only allocate
7605                 * for the proper type.
7606                 */
7607                if (!block_group_bits(block_group, flags)) {
7608                        u64 extra = BTRFS_BLOCK_GROUP_DUP |
7609                                    BTRFS_BLOCK_GROUP_RAID1 |
7610                                    BTRFS_BLOCK_GROUP_RAID5 |
7611                                    BTRFS_BLOCK_GROUP_RAID6 |
7612                                    BTRFS_BLOCK_GROUP_RAID10;
7613
7614                        /*
7615                         * if they asked for extra copies and this block group
7616                         * doesn't provide them, bail.  This does allow us to
7617                         * fill raid0 from raid1.
7618                         */
7619                        if ((flags & extra) && !(block_group->flags & extra))
7620                                goto loop;
7621                }
7622
7623have_block_group:
7624                cached = block_group_cache_done(block_group);
7625                if (unlikely(!cached)) {
7626                        have_caching_bg = true;
7627                        ret = cache_block_group(block_group, 0);
7628                        BUG_ON(ret < 0);
7629                        ret = 0;
7630                }
7631
7632                if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
7633                        goto loop;
7634
7635                /*
7636                 * Ok we want to try and use the cluster allocator, so
7637                 * let's look there
7638                 */
7639                if (last_ptr && use_cluster) {
7640                        struct btrfs_block_group_cache *used_block_group;
7641                        unsigned long aligned_cluster;
7642                        /*
7643                         * the refill lock keeps out other
7644                         * people trying to start a new cluster
7645                         */
7646                        used_block_group = btrfs_lock_cluster(block_group,
7647                                                              last_ptr,
7648                                                              delalloc);
7649                        if (!used_block_group)
7650                                goto refill_cluster;
7651
7652                        if (used_block_group != block_group &&
7653                            (used_block_group->ro ||
7654                             !block_group_bits(used_block_group, flags)))
7655                                goto release_cluster;
7656
7657                        offset = btrfs_alloc_from_cluster(used_block_group,
7658                                                last_ptr,
7659                                                num_bytes,
7660                                                used_block_group->key.objectid,
7661                                                &max_extent_size);
7662                        if (offset) {
7663                                /* we have a block, we're done */
7664                                spin_unlock(&last_ptr->refill_lock);
7665                                trace_btrfs_reserve_extent_cluster(fs_info,
7666                                                used_block_group,
7667                                                search_start, num_bytes);
7668                                if (used_block_group != block_group) {
7669                                        btrfs_release_block_group(block_group,
7670                                                                  delalloc);
7671                                        block_group = used_block_group;
7672                                }
7673                                goto checks;
7674                        }
7675
7676                        WARN_ON(last_ptr->block_group != used_block_group);
7677release_cluster:
7678                        /* If we are on LOOP_NO_EMPTY_SIZE, we can't
7679                         * set up a new cluster, so let's just skip it
7680                         * and let the allocator find whatever block
7681                         * it can find.  If we reach this point, we
7682                         * will have tried the cluster allocator
7683                         * plenty of times and not have found
7684                         * anything, so we are likely way too
7685                         * fragmented for the clustering stuff to find
7686                         * anything.
7687                         *
7688                         * However, if the cluster is taken from the
7689                         * current block group, release the cluster
7690                         * first, so that we stand a better chance of
7691                         * succeeding in the unclustered
7692                         * allocation.  */
7693                        if (loop >= LOOP_NO_EMPTY_SIZE &&
7694                            used_block_group != block_group) {
7695                                spin_unlock(&last_ptr->refill_lock);
7696                                btrfs_release_block_group(used_block_group,
7697                                                          delalloc);
7698                                goto unclustered_alloc;
7699                        }
7700
7701                        /*
7702                         * this cluster didn't work out, free it and
7703                         * start over
7704                         */
7705                        btrfs_return_cluster_to_free_space(NULL, last_ptr);
7706
7707                        if (used_block_group != block_group)
7708                                btrfs_release_block_group(used_block_group,
7709                                                          delalloc);
7710refill_cluster:
7711                        if (loop >= LOOP_NO_EMPTY_SIZE) {
7712                                spin_unlock(&last_ptr->refill_lock);
7713                                goto unclustered_alloc;
7714                        }
7715
7716                        aligned_cluster = max_t(unsigned long,
7717                                                empty_cluster + empty_size,
7718                                              block_group->full_stripe_len);
7719
7720                        /* allocate a cluster in this block group */
7721                        ret = btrfs_find_space_cluster(fs_info, block_group,
7722                                                       last_ptr, search_start,
7723                                                       num_bytes,
7724                                                       aligned_cluster);
7725                        if (ret == 0) {
7726                                /*
7727                                 * now pull our allocation out of this
7728                                 * cluster
7729                                 */
7730                                offset = btrfs_alloc_from_cluster(block_group,
7731                                                        last_ptr,
7732                                                        num_bytes,
7733                                                        search_start,
7734                                                        &max_extent_size);
7735                                if (offset) {
7736                                        /* we found one, proceed */
7737                                        spin_unlock(&last_ptr->refill_lock);
7738                                        trace_btrfs_reserve_extent_cluster(fs_info,
7739                                                block_group, search_start,
7740                                                num_bytes);
7741                                        goto checks;
7742                                }
7743                        } else if (!cached && loop > LOOP_CACHING_NOWAIT
7744                                   && !failed_cluster_refill) {
7745                                spin_unlock(&last_ptr->refill_lock);
7746
7747                                failed_cluster_refill = true;
7748                                wait_block_group_cache_progress(block_group,
7749                                       num_bytes + empty_cluster + empty_size);
7750                                goto have_block_group;
7751                        }
7752
7753                        /*
7754                         * at this point we either didn't find a cluster
7755                         * or we weren't able to allocate a block from our
7756                         * cluster.  Free the cluster we've been trying
7757                         * to use, and go to the next block group
7758                         */
7759                        btrfs_return_cluster_to_free_space(NULL, last_ptr);
7760                        spin_unlock(&last_ptr->refill_lock);
7761                        goto loop;
7762                }
7763
7764unclustered_alloc:
7765                /*
7766                 * We are doing an unclustered alloc, set the fragmented flag so
7767                 * we don't bother trying to set up a cluster again until we get
7768                 * more space.
7769                 */
7770                if (unlikely(last_ptr)) {
7771                        spin_lock(&last_ptr->lock);
7772                        last_ptr->fragmented = 1;
7773                        spin_unlock(&last_ptr->lock);
7774                }
7775                if (cached) {
7776                        struct btrfs_free_space_ctl *ctl =
7777                                block_group->free_space_ctl;
7778
7779                        spin_lock(&ctl->tree_lock);
7780                        if (ctl->free_space <
7781                            num_bytes + empty_cluster + empty_size) {
7782                                if (ctl->free_space > max_extent_size)
7783                                        max_extent_size = ctl->free_space;
7784                                spin_unlock(&ctl->tree_lock);
7785                                goto loop;
7786                        }
7787                        spin_unlock(&ctl->tree_lock);
7788                }
7789
7790                offset = btrfs_find_space_for_alloc(block_group, search_start,
7791                                                    num_bytes, empty_size,
7792                                                    &max_extent_size);
7793                /*
7794                 * If we didn't find a chunk, and we haven't failed on this
7795                 * block group before, and this block group is in the middle of
7796                 * caching and we are ok with waiting, then go ahead and wait
7797                 * for progress to be made, and set failed_alloc to true.
7798                 *
7799                 * If failed_alloc is true then we've already waited on this
7800                 * block group once and should move on to the next block group.
7801                 */
7802                if (!offset && !failed_alloc && !cached &&
7803                    loop > LOOP_CACHING_NOWAIT) {
7804                        wait_block_group_cache_progress(block_group,
7805                                                num_bytes + empty_size);
7806                        failed_alloc = true;
7807                        goto have_block_group;
7808                } else if (!offset) {
7809                        goto loop;
7810                }
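                    /*
                     * We have a candidate offset: align it, make sure the
                     * allocation still fits inside this block group, give
                     * back any alignment slack and reserve the bytes.
                     */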
7811checks:
7812                search_start = ALIGN(offset, fs_info->stripesize);
7813
7814                /* move on to the next group */
7815                if (search_start + num_bytes >
7816                    block_group->key.objectid + block_group->key.offset) {
7817                        btrfs_add_free_space(block_group, offset, num_bytes);
7818                        goto loop;
7819                }
7820
7821                if (offset < search_start)
7822                        btrfs_add_free_space(block_group, offset,
7823                                             search_start - offset);
7824                BUG_ON(offset > search_start);
7825
7826                ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
7827                                num_bytes, delalloc);
7828                if (ret == -EAGAIN) {
7829                        btrfs_add_free_space(block_group, offset, num_bytes);
7830                        goto loop;
7831                }
7832                btrfs_inc_block_group_reservations(block_group);
7833
7834                /* we are all good, let's return */
7835                ins->objectid = search_start;
7836                ins->offset = num_bytes;
7837
7838                trace_btrfs_reserve_extent(fs_info, block_group,
7839                                           search_start, num_bytes);
7840                btrfs_release_block_group(block_group, delalloc);
7841                break;
7842loop:
7843                failed_cluster_refill = false;
7844                failed_alloc = false;
7845                BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
7846                       index);
7847                btrfs_release_block_group(block_group, delalloc);
7848                cond_resched();
7849        }
7850        up_read(&space_info->groups_sem);
7851
7852        if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
7853                && !orig_have_caching_bg)
7854                orig_have_caching_bg = true;
7855
7856        if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
7857                goto search;
7858
7859        if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
7860                goto search;
7861
7862        /*
7863         * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7864         *                      caching kthreads as we move along
7865         * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7866         * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7867         * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7868         *                      again
7869         */
7870        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
7871                index = 0;
7872                if (loop == LOOP_CACHING_NOWAIT) {
7873                        /*
7874                         * We want to skip the LOOP_CACHING_WAIT step if we
7875                         * don't have any uncached bgs and we've already done a
7876                         * full search through.
7877                         */
7878                        if (orig_have_caching_bg || !full_search)
7879                                loop = LOOP_CACHING_WAIT;
7880                        else
7881                                loop = LOOP_ALLOC_CHUNK;
7882                } else {
7883                        loop++;
7884                }
7885
7886                if (loop == LOOP_ALLOC_CHUNK) {
7887                        struct btrfs_trans_handle *trans;
7888                        int exist = 0;
7889
7890                        trans = current->journal_info;
7891                        if (trans)
7892                                exist = 1;
7893                        else
7894                                trans = btrfs_join_transaction(root);
7895
7896                        if (IS_ERR(trans)) {
7897                                ret = PTR_ERR(trans);
7898                                goto out;
7899                        }
7900
7901                        ret = do_chunk_alloc(trans, fs_info, flags,
7902                                             CHUNK_ALLOC_FORCE);
7903
7904                        /*
7905                         * If we can't allocate a new chunk, then since we've
7906                         * already looped through at least once, move on to the
7907                         * NO_EMPTY_SIZE case.
7908                         */
7909                        if (ret == -ENOSPC)
7910                                loop = LOOP_NO_EMPTY_SIZE;
7911
7912                        /*
7913                         * Do not bail out on ENOSPC since we
7914                         * can do more things.
7915                         */
7916                        if (ret < 0 && ret != -ENOSPC)
7917                                btrfs_abort_transaction(trans, ret);
7918                        else
7919                                ret = 0;
7920                        if (!exist)
7921                                btrfs_end_transaction(trans);
7922                        if (ret)
7923                                goto out;
7924                }
7925
7926                if (loop == LOOP_NO_EMPTY_SIZE) {
7927                        /*
7928                         * Don't loop again if we already have no empty_size and
7929                         * no empty_cluster.
7930                         */
7931                        if (empty_size == 0 &&
7932                            empty_cluster == 0) {
7933                                ret = -ENOSPC;
7934                                goto out;
7935                        }
7936                        empty_size = 0;
7937                        empty_cluster = 0;
7938                }
7939
7940                goto search;
7941        } else if (!ins->objectid) {
7942                ret = -ENOSPC;
7943        } else if (ins->objectid) {
7944                if (!use_cluster && last_ptr) {
7945                        spin_lock(&last_ptr->lock);
7946                        last_ptr->window_start = ins->objectid;
7947                        spin_unlock(&last_ptr->lock);
7948                }
7949                ret = 0;
7950        }
7951out:
7952        if (ret == -ENOSPC) {
7953                spin_lock(&space_info->lock);
7954                space_info->max_extent_size = max_extent_size;
7955                spin_unlock(&space_info->lock);
7956                ins->offset = max_extent_size;
7957        }
7958        return ret;
7959}
7960
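    /*
     * Dump the usage counters of a space_info and, optionally, of each of
     * its block groups, to help debug ENOSPC problems.
     */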
7961static void dump_space_info(struct btrfs_fs_info *fs_info,
7962                            struct btrfs_space_info *info, u64 bytes,
7963                            int dump_block_groups)
7964{
7965        struct btrfs_block_group_cache *cache;
7966        int index = 0;
7967
7968        spin_lock(&info->lock);
7969        btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
7970                   info->flags,
7971                   info->total_bytes - btrfs_space_info_used(info, true),
7972                   info->full ? "" : "not ");
7973        btrfs_info(fs_info,
7974                "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
7975                info->total_bytes, info->bytes_used, info->bytes_pinned,
7976                info->bytes_reserved, info->bytes_may_use,
7977                info->bytes_readonly);
7978        spin_unlock(&info->lock);
7979
7980        if (!dump_block_groups)
7981                return;
7982
7983        down_read(&info->groups_sem);
7984again:
7985        list_for_each_entry(cache, &info->block_groups[index], list) {
7986                spin_lock(&cache->lock);
7987                btrfs_info(fs_info,
7988                        "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
7989                        cache->key.objectid, cache->key.offset,
7990                        btrfs_block_group_used(&cache->item), cache->pinned,
7991                        cache->reserved, cache->ro ? "[readonly]" : "");
7992                btrfs_dump_free_space(cache, bytes);
7993                spin_unlock(&cache->lock);
7994        }
7995        if (++index < BTRFS_NR_RAID_TYPES)
7996                goto again;
7997        up_read(&info->groups_sem);
7998}
7999
8000/*
8001 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
8002 *                        hole that is at least as big as @num_bytes.
8003 *
8004 * @root           -    The root that will contain this extent
8005 *
8006 * @ram_bytes      -    The amount of space in RAM that @num_bytes takes. This
8007 *                      is used for accounting purposes. This value differs
8008 *                      from @num_bytes only in the case of compressed extents.
8009 *
8010 * @num_bytes      -    Number of bytes to allocate on-disk.
8011 *
8012 * @min_alloc_size -    The minimum amount of space that the allocator
8013 *                      should try to satisfy. In some cases @num_bytes may
8014 *                      be larger than what is strictly required and, if the
8015 *                      filesystem is fragmented, that larger allocation
8016 *                      fails. @min_alloc_size then gives the allocator a
8017 *                      chance to satisfy the smaller allocation instead.
8018 *
8019 * @empty_size     -    A hint that you plan on doing more COW. This is the
8020 *                      size in bytes the allocator should try to find free
8021 *                      next to the block it returns.  This is just a hint and
8022 *                      may be ignored by the allocator.
8023 *
8024 * @hint_byte      -    Hint to the allocator to start searching above the byte
8025 *                      address passed. It might be ignored.
8026 *
8027 * @ins            -    This key is modified to record the found hole. It will
8028 *                      have the following values:
8029 *                      ins->objectid == start position
8030 *                      ins->flags == BTRFS_EXTENT_ITEM_KEY
8031 *                      ins->offset == the size of the hole.
8032 *
8033 * @is_data        -    Boolean flag indicating whether an extent is
8034 *                      allocated for data (true) or metadata (false)
8035 *
8036 * @delalloc       -    Boolean flag indicating whether this allocation is for
8037 *                      delalloc or not. If 'true', the data_rwsem of the
8038 *                      block groups is going to be acquired.
8039 *
8041 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
8042 * case -ENOSPC is returned then @ins->offset will contain the size of the
8043 * largest available hole the allocator managed to find.
8044 */
8045int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
8046                         u64 num_bytes, u64 min_alloc_size,
8047                         u64 empty_size, u64 hint_byte,
8048                         struct btrfs_key *ins, int is_data, int delalloc)
8049{
8050        struct btrfs_fs_info *fs_info = root->fs_info;
8051        bool final_tried = num_bytes == min_alloc_size;
8052        u64 flags;
8053        int ret;
8054
8055        flags = get_alloc_profile_by_root(root, is_data);
8056again:
8057        WARN_ON(num_bytes < fs_info->sectorsize);
8058        ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
8059                               hint_byte, ins, flags, delalloc);
8060        if (!ret && !is_data) {
8061                btrfs_dec_block_group_reservations(fs_info, ins->objectid);
8062        } else if (ret == -ENOSPC) {
8063                if (!final_tried && ins->offset) {
8064                        num_bytes = min(num_bytes >> 1, ins->offset);
8065                        num_bytes = round_down(num_bytes,
8066                                               fs_info->sectorsize);
8067                        num_bytes = max(num_bytes, min_alloc_size);
8068                        ram_bytes = num_bytes;
8069                        if (num_bytes == min_alloc_size)
8070                                final_tried = true;
8071                        goto again;
8072                } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8073                        struct btrfs_space_info *sinfo;
8074
8075                        sinfo = __find_space_info(fs_info, flags);
8076                        btrfs_err(fs_info,
8077                                  "allocation failed flags %llu, wanted %llu",
8078                                  flags, num_bytes);
8079                        if (sinfo)
8080                                dump_space_info(fs_info, sinfo, num_bytes, 1);
8081                }
8082        }
8083
8084        return ret;
8085}
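
/*
 * Illustrative caller sketch, assuming a 1MiB data allocation that may fall
 * back to a single sector; 'root', 'fs_info' and SZ_1M stand in for values
 * the real caller already has:
 *
 *	struct btrfs_key ins = {};
 *	int ret;
 *
 *	ret = btrfs_reserve_extent(root, SZ_1M, SZ_1M, fs_info->sectorsize,
 *				   0, 0, &ins, 1, 1);
 *	if (ret)
 *		return ret;
 *
 * Worked retry example: if the first 1MiB attempt fails internally with
 * ins->offset == 384KiB, the loop above retries with
 * num_bytes = min(1MiB >> 1, 384KiB) = 384KiB, rounded down to the sector
 * size and clamped to at least @min_alloc_size, before -ENOSPC is reported.
 */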
8086
8087static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
8088                                        u64 start, u64 len,
8089                                        int pin, int delalloc)
8090{
8091        struct btrfs_block_group_cache *cache;
8092        int ret = 0;
8093
8094        cache = btrfs_lookup_block_group(fs_info, start);
8095        if (!cache) {
8096                btrfs_err(fs_info, "Unable to find block group for %llu",
8097                          start);
8098                return -ENOSPC;
8099        }
8100
8101        if (pin) {
8102                pin_down_extent(fs_info, cache, start, len, 1);
8103        } else {
8104                if (btrfs_test_opt(fs_info, DISCARD))
8105                        ret = btrfs_discard_extent(fs_info, start, len, NULL);
8106                btrfs_add_free_space(cache, start, len);
8107                btrfs_free_reserved_bytes(cache, len, delalloc);
8108                trace_btrfs_reserved_extent_free(fs_info, start, len);
8109        }
8110
8111        btrfs_put_block_group(cache);
8112        return ret;
8113}
8114
8115int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
8116                               u64 start, u64 len, int delalloc)
8117{
8118        return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
8119}
8120
8121int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
8122                                       u64 start, u64 len)
8123{
8124        return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
8125}
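
/*
 * Minimal sketch of undoing a reservation when a later step fails; this
 * mirrors the out_free_reserved error path of btrfs_alloc_tree_block()
 * below, and do_something_with() is a placeholder:
 *
 *	ret = btrfs_reserve_extent(root, len, len, len, 0, hint, &ins, 0, 0);
 *	if (ret)
 *		return ret;
 *	ret = do_something_with(ins.objectid, ins.offset);
 *	if (ret)
 *		btrfs_free_reserved_extent(fs_info, ins.objectid,
 *					   ins.offset, 0);
 */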
8126
8127static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8128                                      struct btrfs_fs_info *fs_info,
8129                                      u64 parent, u64 root_objectid,
8130                                      u64 flags, u64 owner, u64 offset,
8131                                      struct btrfs_key *ins, int ref_mod)
8132{
8133        int ret;
8134        struct btrfs_extent_item *extent_item;
8135        struct btrfs_extent_inline_ref *iref;
8136        struct btrfs_path *path;
8137        struct extent_buffer *leaf;
8138        int type;
8139        u32 size;
8140
8141        if (parent > 0)
8142                type = BTRFS_SHARED_DATA_REF_KEY;
8143        else
8144                type = BTRFS_EXTENT_DATA_REF_KEY;
8145
8146        size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
8147
8148        path = btrfs_alloc_path();
8149        if (!path)
8150                return -ENOMEM;
8151
8152        path->leave_spinning = 1;
8153        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8154                                      ins, size);
8155        if (ret) {
8156                btrfs_free_path(path);
8157                return ret;
8158        }
8159
8160        leaf = path->nodes[0];
8161        extent_item = btrfs_item_ptr(leaf, path->slots[0],
8162                                     struct btrfs_extent_item);
8163        btrfs_set_extent_refs(leaf, extent_item, ref_mod);
8164        btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8165        btrfs_set_extent_flags(leaf, extent_item,
8166                               flags | BTRFS_EXTENT_FLAG_DATA);
8167
8168        iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8169        btrfs_set_extent_inline_ref_type(leaf, iref, type);
8170        if (parent > 0) {
8171                struct btrfs_shared_data_ref *ref;
8172                ref = (struct btrfs_shared_data_ref *)(iref + 1);
8173                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8174                btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
8175        } else {
8176                struct btrfs_extent_data_ref *ref;
8177                ref = (struct btrfs_extent_data_ref *)(&iref->offset);
8178                btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
8179                btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
8180                btrfs_set_extent_data_ref_offset(leaf, ref, offset);
8181                btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
8182        }
8183
8184        btrfs_mark_buffer_dirty(path->nodes[0]);
8185        btrfs_free_path(path);
8186
8187        ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
8188                                          ins->offset);
8189        if (ret)
8190                return ret;
8191
8192        ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
8193        if (ret) { /* -ENOENT, logic error */
8194                btrfs_err(fs_info, "update block group failed for %llu %llu",
8195                        ins->objectid, ins->offset);
8196                BUG();
8197        }
8198        trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
8199        return ret;
8200}
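
/*
 * Rough sketch of the item created above, inferred from the setters in this
 * function (field widths not to scale):
 *
 *	key:  (ins->objectid = bytenr, BTRFS_EXTENT_ITEM_KEY, ins->offset = size)
 *
 *	parent > 0 (shared):
 *	  [ btrfs_extent_item | inline ref SHARED_DATA_REF_KEY, offset = parent
 *	    | btrfs_shared_data_ref.count = ref_mod ]
 *
 *	parent == 0:
 *	  [ btrfs_extent_item | inline ref EXTENT_DATA_REF_KEY
 *	    | btrfs_extent_data_ref: root, objectid = owner, offset, count ]
 */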
8201
8202static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
8203                                     struct btrfs_fs_info *fs_info,
8204                                     u64 parent, u64 root_objectid,
8205                                     u64 flags, struct btrfs_disk_key *key,
8206                                     int level, struct btrfs_key *ins)
8207{
8208        int ret;
8209        struct btrfs_extent_item *extent_item;
8210        struct btrfs_tree_block_info *block_info;
8211        struct btrfs_extent_inline_ref *iref;
8212        struct btrfs_path *path;
8213        struct extent_buffer *leaf;
8214        u32 size = sizeof(*extent_item) + sizeof(*iref);
8215        u64 num_bytes = ins->offset;
8216        bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8217
8218        if (!skinny_metadata)
8219                size += sizeof(*block_info);
8220
8221        path = btrfs_alloc_path();
8222        if (!path) {
8223                btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid,
8224                                                   fs_info->nodesize);
8225                return -ENOMEM;
8226        }
8227
8228        path->leave_spinning = 1;
8229        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8230                                      ins, size);
8231        if (ret) {
8232                btrfs_free_path(path);
8233                btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid,
8234                                                   fs_info->nodesize);
8235                return ret;
8236        }
8237
8238        leaf = path->nodes[0];
8239        extent_item = btrfs_item_ptr(leaf, path->slots[0],
8240                                     struct btrfs_extent_item);
8241        btrfs_set_extent_refs(leaf, extent_item, 1);
8242        btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8243        btrfs_set_extent_flags(leaf, extent_item,
8244                               flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
8245
8246        if (skinny_metadata) {
8247                iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8248                num_bytes = fs_info->nodesize;
8249        } else {
8250                block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
8251                btrfs_set_tree_block_key(leaf, block_info, key);
8252                btrfs_set_tree_block_level(leaf, block_info, level);
8253                iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
8254        }
8255
8256        if (parent > 0) {
8257                BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
8258                btrfs_set_extent_inline_ref_type(leaf, iref,
8259                                                 BTRFS_SHARED_BLOCK_REF_KEY);
8260                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8261        } else {
8262                btrfs_set_extent_inline_ref_type(leaf, iref,
8263                                                 BTRFS_TREE_BLOCK_REF_KEY);
8264                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
8265        }
8266
8267        btrfs_mark_buffer_dirty(leaf);
8268        btrfs_free_path(path);
8269
8270        ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
8271                                          num_bytes);
8272        if (ret)
8273                return ret;
8274
8275        ret = update_block_group(trans, fs_info, ins->objectid,
8276                                 fs_info->nodesize, 1);
8277        if (ret) { /* -ENOENT, logic error */
8278                btrfs_err(fs_info, "update block group failed for %llu %llu",
8279                        ins->objectid, ins->offset);
8280                BUG();
8281        }
8282
8283        trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid,
8284                                          fs_info->nodesize);
8285        return ret;
8286}
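
/*
 * Rough sketch of the item created above, again inferred from the setters:
 *
 *	SKINNY_METADATA set:
 *	  [ btrfs_extent_item | inline ref ]	(no tree_block_info;
 *						 num_bytes = nodesize)
 *	otherwise:
 *	  [ btrfs_extent_item | btrfs_tree_block_info (key, level) | inline ref ]
 *
 * The inline ref is SHARED_BLOCK_REF_KEY with offset = parent when
 * parent > 0, otherwise TREE_BLOCK_REF_KEY with offset = root_objectid.
 */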
8287
8288int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8289                                     struct btrfs_root *root, u64 owner,
8290                                     u64 offset, u64 ram_bytes,
8291                                     struct btrfs_key *ins)
8292{
8293        struct btrfs_fs_info *fs_info = root->fs_info;
8294        int ret;
8295
8296        BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
8297
8298        btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
8299                           root->root_key.objectid, owner, offset,
8300                           BTRFS_ADD_DELAYED_EXTENT);
8301
8302        ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid,
8303                                         ins->offset, 0,
8304                                         root->root_key.objectid, owner,
8305                                         offset, ram_bytes,
8306                                         BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
8307        return ret;
8308}
8309
8310/*
8311 * this is used by the tree logging recovery code.  It records that
8312 * an extent has been allocated and makes sure to clear the free
8313 * space cache bits as well
8314 */
8315int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
8316                                   struct btrfs_fs_info *fs_info,
8317                                   u64 root_objectid, u64 owner, u64 offset,
8318                                   struct btrfs_key *ins)
8319{
8320        int ret;
8321        struct btrfs_block_group_cache *block_group;
8322        struct btrfs_space_info *space_info;
8323
8324        /*
8325         * Mixed block groups have their extents excluded before the log
8326         * is processed, so we only need the exclude dance if this fs isn't mixed.
8327         */
8328        if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
8329                ret = __exclude_logged_extent(fs_info, ins->objectid,
8330                                              ins->offset);
8331                if (ret)
8332                        return ret;
8333        }
8334
8335        block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
8336        if (!block_group)
8337                return -EINVAL;
8338
8339        space_info = block_group->space_info;
8340        spin_lock(&space_info->lock);
8341        spin_lock(&block_group->lock);
8342        space_info->bytes_reserved += ins->offset;
8343        block_group->reserved += ins->offset;
8344        spin_unlock(&block_group->lock);
8345        spin_unlock(&space_info->lock);
8346
8347        ret = alloc_reserved_file_extent(trans, fs_info, 0, root_objectid,
8348                                         0, owner, offset, ins, 1);
8349        btrfs_put_block_group(block_group);
8350        return ret;
8351}
8352
8353static struct extent_buffer *
8354btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8355                      u64 bytenr, int level)
8356{
8357        struct btrfs_fs_info *fs_info = root->fs_info;
8358        struct extent_buffer *buf;
8359
8360        buf = btrfs_find_create_tree_block(fs_info, bytenr);
8361        if (IS_ERR(buf))
8362                return buf;
8363
8364        btrfs_set_header_generation(buf, trans->transid);
8365        btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
8366        btrfs_tree_lock(buf);
8367        clean_tree_block(fs_info, buf);
8368        clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
8369
8370        btrfs_set_lock_blocking(buf);
8371        set_extent_buffer_uptodate(buf);
8372
8373        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
8374                buf->log_index = root->log_transid % 2;
8375                /*
8376                 * We allow two log transactions at a time; use a different
8377                 * EXTENT bit to differentiate dirty pages.
8378                 */
8379                if (buf->log_index == 0)
8380                        set_extent_dirty(&root->dirty_log_pages, buf->start,
8381                                        buf->start + buf->len - 1, GFP_NOFS);
8382                else
8383                        set_extent_new(&root->dirty_log_pages, buf->start,
8384                                        buf->start + buf->len - 1);
8385        } else {
8386                buf->log_index = -1;
8387                set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
8388                         buf->start + buf->len - 1, GFP_NOFS);
8389        }
8390        trans->dirty = true;
8391        /* this returns a buffer locked for blocking */
8392        return buf;
8393}
8394
8395static struct btrfs_block_rsv *
8396use_block_rsv(struct btrfs_trans_handle *trans,
8397              struct btrfs_root *root, u32 blocksize)
8398{
8399        struct btrfs_fs_info *fs_info = root->fs_info;
8400        struct btrfs_block_rsv *block_rsv;
8401        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
8402        int ret;
8403        bool global_updated = false;
8404
8405        block_rsv = get_block_rsv(trans, root);
8406
8407        if (unlikely(block_rsv->size == 0))
8408                goto try_reserve;
8409again:
8410        ret = block_rsv_use_bytes(block_rsv, blocksize);
8411        if (!ret)
8412                return block_rsv;
8413
8414        if (block_rsv->failfast)
8415                return ERR_PTR(ret);
8416
8417        if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
8418                global_updated = true;
8419                update_global_block_rsv(fs_info);
8420                goto again;
8421        }
8422
8423        if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8424                static DEFINE_RATELIMIT_STATE(_rs,
8425                                DEFAULT_RATELIMIT_INTERVAL * 10,
8426                                /*DEFAULT_RATELIMIT_BURST*/ 1);
8427                if (__ratelimit(&_rs))
8428                        WARN(1, KERN_DEBUG
8429                                "BTRFS: block rsv returned %d\n", ret);
8430        }
8431try_reserve:
8432        ret = reserve_metadata_bytes(root, block_rsv, blocksize,
8433                                     BTRFS_RESERVE_NO_FLUSH);
8434        if (!ret)
8435                return block_rsv;
8436        /*
8437         * If we couldn't reserve metadata bytes, try to use some from
8438         * the global reserve, provided its space_info is the same as
8439         * that of the global reservation.
8440         */
8441        if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
8442            block_rsv->space_info == global_rsv->space_info) {
8443                ret = block_rsv_use_bytes(global_rsv, blocksize);
8444                if (!ret)
8445                        return global_rsv;
8446        }
8447        return ERR_PTR(ret);
8448}
8449
8450static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
8451                            struct btrfs_block_rsv *block_rsv, u32 blocksize)
8452{
8453        block_rsv_add_bytes(block_rsv, blocksize, 0);
8454        block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
8455}
8456
8457/*
8458 * Finds a free extent and does all the dirty work required for allocation.
8459 * Returns the tree buffer or an ERR_PTR on error.
8460 */
8461struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
8462                                             struct btrfs_root *root,
8463                                             u64 parent, u64 root_objectid,
8464                                             const struct btrfs_disk_key *key,
8465                                             int level, u64 hint,
8466                                             u64 empty_size)
8467{
8468        struct btrfs_fs_info *fs_info = root->fs_info;
8469        struct btrfs_key ins;
8470        struct btrfs_block_rsv *block_rsv;
8471        struct extent_buffer *buf;
8472        struct btrfs_delayed_extent_op *extent_op;
8473        u64 flags = 0;
8474        int ret;
8475        u32 blocksize = fs_info->nodesize;
8476        bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8477
8478#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8479        if (btrfs_is_testing(fs_info)) {
8480                buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
8481                                            level);
8482                if (!IS_ERR(buf))
8483                        root->alloc_bytenr += blocksize;
8484                return buf;
8485        }
8486#endif
8487
8488        block_rsv = use_block_rsv(trans, root, blocksize);
8489        if (IS_ERR(block_rsv))
8490                return ERR_CAST(block_rsv);
8491
8492        ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
8493                                   empty_size, hint, &ins, 0, 0);
8494        if (ret)
8495                goto out_unuse;
8496
8497        buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
8498        if (IS_ERR(buf)) {
8499                ret = PTR_ERR(buf);
8500                goto out_free_reserved;
8501        }
8502
8503        if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
8504                if (parent == 0)
8505                        parent = ins.objectid;
8506                flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
8507        } else
8508                BUG_ON(parent > 0);
8509
8510        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
8511                extent_op = btrfs_alloc_delayed_extent_op();
8512                if (!extent_op) {
8513                        ret = -ENOMEM;
8514                        goto out_free_buf;
8515                }
8516                if (key)
8517                        memcpy(&extent_op->key, key, sizeof(extent_op->key));
8518                else
8519                        memset(&extent_op->key, 0, sizeof(extent_op->key));
8520                extent_op->flags_to_set = flags;
8521                extent_op->update_key = !skinny_metadata;
8522                extent_op->update_flags = true;
8523                extent_op->is_data = false;
8524                extent_op->level = level;
8525
8526                btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
8527                                   root_objectid, level, 0,
8528                                   BTRFS_ADD_DELAYED_EXTENT);
8529                ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid,
8530                                                 ins.offset, parent,
8531                                                 root_objectid, level,
8532                                                 BTRFS_ADD_DELAYED_EXTENT,
8533                                                 extent_op, NULL, NULL);
8534                if (ret)
8535                        goto out_free_delayed;
8536        }
8537        return buf;
8538
8539out_free_delayed:
8540        btrfs_free_delayed_extent_op(extent_op);
8541out_free_buf:
8542        free_extent_buffer(buf);
8543out_free_reserved:
8544        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
8545out_unuse:
8546        unuse_block_rsv(fs_info, block_rsv, blocksize);
8547        return ERR_PTR(ret);
8548}
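
/*
 * Illustrative use of btrfs_alloc_tree_block(), loosely modelled on a COW of
 * a node in a regular (non-reloc, non-log) tree; 'disk_key', 'level' and
 * 'hint' are placeholders the real caller already has:
 *
 *	struct extent_buffer *cow;
 *
 *	cow = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
 *				     &disk_key, level, hint, 0);
 *	if (IS_ERR(cow))
 *		return PTR_ERR(cow);
 *
 * The buffer comes back locked for blocking, see btrfs_init_new_buffer().
 */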
8549
8550struct walk_control {
8551        u64 refs[BTRFS_MAX_LEVEL];
8552        u64 flags[BTRFS_MAX_LEVEL];
8553        struct btrfs_key update_progress;
8554        int stage;
8555        int level;
8556        int shared_level;
8557        int update_ref;
8558        int keep_locks;
8559        int reada_slot;
8560        int reada_count;
8561        int for_reloc;
8562};
8563
8564#define DROP_REFERENCE  1
8565#define UPDATE_BACKREF  2
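
/*
 * Minimal initialization sketch for walk_control, mirroring what
 * btrfs_drop_subtree() below sets up for a plain DROP_REFERENCE walk:
 *
 *	wc = kzalloc(sizeof(*wc), GFP_NOFS);
 *	wc->level = btrfs_header_level(node);
 *	wc->shared_level = -1;
 *	wc->stage = DROP_REFERENCE;
 *	wc->update_ref = 0;
 *	wc->keep_locks = 1;
 *	wc->for_reloc = 1;
 *	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
 */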
8566
8567static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
8568                                     struct btrfs_root *root,
8569                                     struct walk_control *wc,
8570                                     struct btrfs_path *path)
8571{
8572        struct btrfs_fs_info *fs_info = root->fs_info;
8573        u64 bytenr;
8574        u64 generation;
8575        u64 refs;
8576        u64 flags;
8577        u32 nritems;
8578        struct btrfs_key key;
8579        struct extent_buffer *eb;
8580        int ret;
8581        int slot;
8582        int nread = 0;
8583
8584        if (path->slots[wc->level] < wc->reada_slot) {
8585                wc->reada_count = wc->reada_count * 2 / 3;
8586                wc->reada_count = max(wc->reada_count, 2);
8587        } else {
8588                wc->reada_count = wc->reada_count * 3 / 2;
8589                wc->reada_count = min_t(int, wc->reada_count,
8590                                        BTRFS_NODEPTRS_PER_BLOCK(fs_info));
8591        }
8592
8593        eb = path->nodes[wc->level];
8594        nritems = btrfs_header_nritems(eb);
8595
8596        for (slot = path->slots[wc->level]; slot < nritems; slot++) {
8597                if (nread >= wc->reada_count)
8598                        break;
8599
8600                cond_resched();
8601                bytenr = btrfs_node_blockptr(eb, slot);
8602                generation = btrfs_node_ptr_generation(eb, slot);
8603
8604                if (slot == path->slots[wc->level])
8605                        goto reada;
8606
8607                if (wc->stage == UPDATE_BACKREF &&
8608                    generation <= root->root_key.offset)
8609                        continue;
8610
8611                /* We don't lock the tree block, it's OK to be racy here */
8612                ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
8613                                               wc->level - 1, 1, &refs,
8614                                               &flags);
8615                /* We don't care about errors in readahead. */
8616                if (ret < 0)
8617                        continue;
8618                BUG_ON(refs == 0);
8619
8620                if (wc->stage == DROP_REFERENCE) {
8621                        if (refs == 1)
8622                                goto reada;
8623
8624                        if (wc->level == 1 &&
8625                            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8626                                continue;
8627                        if (!wc->update_ref ||
8628                            generation <= root->root_key.offset)
8629                                continue;
8630                        btrfs_node_key_to_cpu(eb, &key, slot);
8631                        ret = btrfs_comp_cpu_keys(&key,
8632                                                  &wc->update_progress);
8633                        if (ret < 0)
8634                                continue;
8635                } else {
8636                        if (wc->level == 1 &&
8637                            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8638                                continue;
8639                }
8640reada:
8641                readahead_tree_block(fs_info, bytenr);
8642                nread++;
8643        }
8644        wc->reada_slot = slot;
8645}
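
/*
 * Worked example of the reada_count adjustment above: with a window of 12,
 * path->slots[level] still being below wc->reada_slot shrinks it to
 * 12 * 2 / 3 = 8 (never below 2), otherwise it grows to 12 * 3 / 2 = 18,
 * capped at BTRFS_NODEPTRS_PER_BLOCK(fs_info).
 */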
8646
8647/*
8648 * helper to process tree block while walking down the tree.
8649 *
8650 * when wc->stage == UPDATE_BACKREF, this function updates
8651 * back refs for pointers in the block.
8652 *
8653 * NOTE: return value 1 means we should stop walking down.
8654 */
8655static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
8656                                   struct btrfs_root *root,
8657                                   struct btrfs_path *path,
8658                                   struct walk_control *wc, int lookup_info)
8659{
8660        struct btrfs_fs_info *fs_info = root->fs_info;
8661        int level = wc->level;
8662        struct extent_buffer *eb = path->nodes[level];
8663        u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8664        int ret;
8665
8666        if (wc->stage == UPDATE_BACKREF &&
8667            btrfs_header_owner(eb) != root->root_key.objectid)
8668                return 1;
8669
8670        /*
8671         * When the reference count of a tree block is 1, it won't increase
8672         * again. Once the full backref flag is set, we never clear it.
8673         */
8674        if (lookup_info &&
8675            ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8676             (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8677                BUG_ON(!path->locks[level]);
8678                ret = btrfs_lookup_extent_info(trans, fs_info,
8679                                               eb->start, level, 1,
8680                                               &wc->refs[level],
8681                                               &wc->flags[level]);
8682                BUG_ON(ret == -ENOMEM);
8683                if (ret)
8684                        return ret;
8685                BUG_ON(wc->refs[level] == 0);
8686        }
8687
8688        if (wc->stage == DROP_REFERENCE) {
8689                if (wc->refs[level] > 1)
8690                        return 1;
8691
8692                if (path->locks[level] && !wc->keep_locks) {
8693                        btrfs_tree_unlock_rw(eb, path->locks[level]);
8694                        path->locks[level] = 0;
8695                }
8696                return 0;
8697        }
8698
8699        /* wc->stage == UPDATE_BACKREF */
8700        if (!(wc->flags[level] & flag)) {
8701                BUG_ON(!path->locks[level]);
8702                ret = btrfs_inc_ref(trans, root, eb, 1);
8703                BUG_ON(ret); /* -ENOMEM */
8704                ret = btrfs_dec_ref(trans, root, eb, 0);
8705                BUG_ON(ret); /* -ENOMEM */
8706                ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
8707                                                  eb->len, flag,
8708                                                  btrfs_header_level(eb), 0);
8709                BUG_ON(ret); /* -ENOMEM */
8710                wc->flags[level] |= flag;
8711        }
8712
8713        /*
8714         * the block is shared by multiple trees, so it's not good to
8715         * keep the tree lock
8716         */
8717        if (path->locks[level] && level > 0) {
8718                btrfs_tree_unlock_rw(eb, path->locks[level]);
8719                path->locks[level] = 0;
8720        }
8721        return 0;
8722}
8723
8724/*
8725 * helper to process tree block pointer.
8726 *
8727 * when wc->stage == DROP_REFERENCE, this function checks the
8728 * reference count of the block pointed to. if the block
8729 * is shared and we need to update back refs for the subtree
8730 * rooted at the block, this function changes wc->stage to
8731 * UPDATE_BACKREF. if the block is shared and there is no
8732 * need to update back refs, this function drops the reference
8733 * to the block.
8734 *
8735 * NOTE: return value 1 means we should stop walking down.
8736 */
8737static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8738                                 struct btrfs_root *root,
8739                                 struct btrfs_path *path,
8740                                 struct walk_control *wc, int *lookup_info)
8741{
8742        struct btrfs_fs_info *fs_info = root->fs_info;
8743        u64 bytenr;
8744        u64 generation;
8745        u64 parent;
8746        u32 blocksize;
8747        struct btrfs_key key;
8748        struct btrfs_key first_key;
8749        struct extent_buffer *next;
8750        int level = wc->level;
8751        int reada = 0;
8752        int ret = 0;
8753        bool need_account = false;
8754
8755        generation = btrfs_node_ptr_generation(path->nodes[level],
8756                                               path->slots[level]);
8757        /*
8758         * if the lower level block was created before the snapshot
8759         * was created, we know there is no need to update back refs
8760         * for the subtree
8761         */
8762        if (wc->stage == UPDATE_BACKREF &&
8763            generation <= root->root_key.offset) {
8764                *lookup_info = 1;
8765                return 1;
8766        }
8767
8768        bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
8769        btrfs_node_key_to_cpu(path->nodes[level], &first_key,
8770                              path->slots[level]);
8771        blocksize = fs_info->nodesize;
8772
8773        next = find_extent_buffer(fs_info, bytenr);
8774        if (!next) {
8775                next = btrfs_find_create_tree_block(fs_info, bytenr);
8776                if (IS_ERR(next))
8777                        return PTR_ERR(next);
8778
8779                btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
8780                                               level - 1);
8781                reada = 1;
8782        }
8783        btrfs_tree_lock(next);
8784        btrfs_set_lock_blocking(next);
8785
8786        ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
8787                                       &wc->refs[level - 1],
8788                                       &wc->flags[level - 1]);
8789        if (ret < 0)
8790                goto out_unlock;
8791
8792        if (unlikely(wc->refs[level - 1] == 0)) {
8793                btrfs_err(fs_info, "Missing references.");
8794                ret = -EIO;
8795                goto out_unlock;
8796        }
8797        *lookup_info = 0;
8798
8799        if (wc->stage == DROP_REFERENCE) {
8800                if (wc->refs[level - 1] > 1) {
8801                        need_account = true;
8802                        if (level == 1 &&
8803                            (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8804                                goto skip;
8805
8806                        if (!wc->update_ref ||
8807                            generation <= root->root_key.offset)
8808                                goto skip;
8809
8810                        btrfs_node_key_to_cpu(path->nodes[level], &key,
8811                                              path->slots[level]);
8812                        ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
8813                        if (ret < 0)
8814                                goto skip;
8815
8816                        wc->stage = UPDATE_BACKREF;
8817                        wc->shared_level = level - 1;
8818                }
8819        } else {
8820                if (level == 1 &&
8821                    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8822                        goto skip;
8823        }
8824
8825        if (!btrfs_buffer_uptodate(next, generation, 0)) {
8826                btrfs_tree_unlock(next);
8827                free_extent_buffer(next);
8828                next = NULL;
8829                *lookup_info = 1;
8830        }
8831
8832        if (!next) {
8833                if (reada && level == 1)
8834                        reada_walk_down(trans, root, wc, path);
8835                next = read_tree_block(fs_info, bytenr, generation, level - 1,
8836                                       &first_key);
8837                if (IS_ERR(next)) {
8838                        return PTR_ERR(next);
8839                } else if (!extent_buffer_uptodate(next)) {
8840                        free_extent_buffer(next);
8841                        return -EIO;
8842                }
8843                btrfs_tree_lock(next);
8844                btrfs_set_lock_blocking(next);
8845        }
8846
8847        level--;
8848        ASSERT(level == btrfs_header_level(next));
8849        if (level != btrfs_header_level(next)) {
8850                btrfs_err(root->fs_info, "mismatched level");
8851                ret = -EIO;
8852                goto out_unlock;
8853        }
8854        path->nodes[level] = next;
8855        path->slots[level] = 0;
8856        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8857        wc->level = level;
8858        if (wc->level == 1)
8859                wc->reada_slot = 0;
8860        return 0;
8861skip:
8862        wc->refs[level - 1] = 0;
8863        wc->flags[level - 1] = 0;
8864        if (wc->stage == DROP_REFERENCE) {
8865                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
8866                        parent = path->nodes[level]->start;
8867                } else {
8868                        ASSERT(root->root_key.objectid ==
8869                               btrfs_header_owner(path->nodes[level]));
8870                        if (root->root_key.objectid !=
8871                            btrfs_header_owner(path->nodes[level])) {
8872                                btrfs_err(root->fs_info,
8873                                                "mismatched block owner");
8874                                ret = -EIO;
8875                                goto out_unlock;
8876                        }
8877                        parent = 0;
8878                }
8879
8880                if (need_account) {
8881                        ret = btrfs_qgroup_trace_subtree(trans, root, next,
8882                                                         generation, level - 1);
8883                        if (ret) {
8884                                btrfs_err_rl(fs_info,
8885                                             "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
8886                                             ret);
8887                        }
8888                }
8889                ret = btrfs_free_extent(trans, root, bytenr, blocksize,
8890                                        parent, root->root_key.objectid,
8891                                        level - 1, 0);
8892                if (ret)
8893                        goto out_unlock;
8894        }
8895
8896        *lookup_info = 1;
8897        ret = 1;
8898
8899out_unlock:
8900        btrfs_tree_unlock(next);
8901        free_extent_buffer(next);
8902
8903        return ret;
8904}
8905
8906/*
8907 * helper to process tree block while walking up the tree.
8908 *
8909 * when wc->stage == DROP_REFERENCE, this function drops
8910 * reference count on the block.
8911 *
8912 * when wc->stage == UPDATE_BACKREF, this function changes
8913 * wc->stage back to DROP_REFERENCE if we changed wc->stage
8914 * to UPDATE_BACKREF previously while processing the block.
8915 *
8916 * NOTE: return value 1 means we should stop walking up.
8917 */
8918static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
8919                                 struct btrfs_root *root,
8920                                 struct btrfs_path *path,
8921                                 struct walk_control *wc)
8922{
8923        struct btrfs_fs_info *fs_info = root->fs_info;
8924        int ret;
8925        int level = wc->level;
8926        struct extent_buffer *eb = path->nodes[level];
8927        u64 parent = 0;
8928
8929        if (wc->stage == UPDATE_BACKREF) {
8930                BUG_ON(wc->shared_level < level);
8931                if (level < wc->shared_level)
8932                        goto out;
8933
8934                ret = find_next_key(path, level + 1, &wc->update_progress);
8935                if (ret > 0)
8936                        wc->update_ref = 0;
8937
8938                wc->stage = DROP_REFERENCE;
8939                wc->shared_level = -1;
8940                path->slots[level] = 0;
8941
8942                /*
8943                 * Check the reference count again if the block isn't locked.
8944                 * We should start walking down the tree again if the
8945                 * reference count is one.
8946                 */
8947                if (!path->locks[level]) {
8948                        BUG_ON(level == 0);
8949                        btrfs_tree_lock(eb);
8950                        btrfs_set_lock_blocking(eb);
8951                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8952
8953                        ret = btrfs_lookup_extent_info(trans, fs_info,
8954                                                       eb->start, level, 1,
8955                                                       &wc->refs[level],
8956                                                       &wc->flags[level]);
8957                        if (ret < 0) {
8958                                btrfs_tree_unlock_rw(eb, path->locks[level]);
8959                                path->locks[level] = 0;
8960                                return ret;
8961                        }
8962                        BUG_ON(wc->refs[level] == 0);
8963                        if (wc->refs[level] == 1) {
8964                                btrfs_tree_unlock_rw(eb, path->locks[level]);
8965                                path->locks[level] = 0;
8966                                return 1;
8967                        }
8968                }
8969        }
8970
8971        /* wc->stage == DROP_REFERENCE */
8972        BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
8973
8974        if (wc->refs[level] == 1) {
8975                if (level == 0) {
8976                        if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
8977                                ret = btrfs_dec_ref(trans, root, eb, 1);
8978                        else
8979                                ret = btrfs_dec_ref(trans, root, eb, 0);
8980                        BUG_ON(ret); /* -ENOMEM */
8981                        ret = btrfs_qgroup_trace_leaf_items(trans, fs_info, eb);
8982                        if (ret) {
8983                                btrfs_err_rl(fs_info,
8984                                             "error %d accounting leaf items. Quota is out of sync, rescan required.",
8985                                             ret);
8986                        }
8987                }
8988                /* make block locked assertion in clean_tree_block happy */
8989                if (!path->locks[level] &&
8990                    btrfs_header_generation(eb) == trans->transid) {
8991                        btrfs_tree_lock(eb);
8992                        btrfs_set_lock_blocking(eb);
8993                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8994                }
8995                clean_tree_block(fs_info, eb);
8996        }
8997
8998        if (eb == root->node) {
8999                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9000                        parent = eb->start;
9001                else
9002                        BUG_ON(root->root_key.objectid !=
9003                               btrfs_header_owner(eb));
9004        } else {
9005                if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9006                        parent = path->nodes[level + 1]->start;
9007                else
9008                        BUG_ON(root->root_key.objectid !=
9009                               btrfs_header_owner(path->nodes[level + 1]));
9010        }
9011
9012        btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
9013out:
9014        wc->refs[level] = 0;
9015        wc->flags[level] = 0;
9016        return 0;
9017}
9018
9019static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
9020                                   struct btrfs_root *root,
9021                                   struct btrfs_path *path,
9022                                   struct walk_control *wc)
9023{
9024        int level = wc->level;
9025        int lookup_info = 1;
9026        int ret;
9027
9028        while (level >= 0) {
9029                ret = walk_down_proc(trans, root, path, wc, lookup_info);
9030                if (ret > 0)
9031                        break;
9032
9033                if (level == 0)
9034                        break;
9035
9036                if (path->slots[level] >=
9037                    btrfs_header_nritems(path->nodes[level]))
9038                        break;
9039
9040                ret = do_walk_down(trans, root, path, wc, &lookup_info);
9041                if (ret > 0) {
9042                        path->slots[level]++;
9043                        continue;
9044                } else if (ret < 0)
9045                        return ret;
9046                level = wc->level;
9047        }
9048        return 0;
9049}
9050
9051static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
9052                                 struct btrfs_root *root,
9053                                 struct btrfs_path *path,
9054                                 struct walk_control *wc, int max_level)
9055{
9056        int level = wc->level;
9057        int ret;
9058
9059        path->slots[level] = btrfs_header_nritems(path->nodes[level]);
9060        while (level < max_level && path->nodes[level]) {
9061                wc->level = level;
9062                if (path->slots[level] + 1 <
9063                    btrfs_header_nritems(path->nodes[level])) {
9064                        path->slots[level]++;
9065                        return 0;
9066                } else {
9067                        ret = walk_up_proc(trans, root, path, wc);
9068                        if (ret > 0)
9069                                return 0;
9070
9071                        if (path->locks[level]) {
9072                                btrfs_tree_unlock_rw(path->nodes[level],
9073                                                     path->locks[level]);
9074                                path->locks[level] = 0;
9075                        }
9076                        free_extent_buffer(path->nodes[level]);
9077                        path->nodes[level] = NULL;
9078                        level++;
9079                }
9080        }
9081        return 1;
9082}
9083
9084/*
9085 * drop a subvolume tree.
9086 *
9087 * this function traverses the tree, freeing any blocks that are only
9088 * referenced by the tree.
9089 *
9090 * when a shared tree block is found, this function decreases its
9091 * reference count by one. if update_ref is true, this function
9092 * also makes sure backrefs for the shared block and all lower level
9093 * blocks are properly updated.
9094 *
9095 * If called with for_reloc == 0, may exit early with -EAGAIN
9096 */
9097int btrfs_drop_snapshot(struct btrfs_root *root,
9098                         struct btrfs_block_rsv *block_rsv, int update_ref,
9099                         int for_reloc)
9100{
9101        struct btrfs_fs_info *fs_info = root->fs_info;
9102        struct btrfs_path *path;
9103        struct btrfs_trans_handle *trans;
9104        struct btrfs_root *tree_root = fs_info->tree_root;
9105        struct btrfs_root_item *root_item = &root->root_item;
9106        struct walk_control *wc;
9107        struct btrfs_key key;
9108        int err = 0;
9109        int ret;
9110        int level;
9111        bool root_dropped = false;
9112
9113        btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid);
9114
9115        path = btrfs_alloc_path();
9116        if (!path) {
9117                err = -ENOMEM;
9118                goto out;
9119        }
9120
9121        wc = kzalloc(sizeof(*wc), GFP_NOFS);
9122        if (!wc) {
9123                btrfs_free_path(path);
9124                err = -ENOMEM;
9125                goto out;
9126        }
9127
9128        trans = btrfs_start_transaction(tree_root, 0);
9129        if (IS_ERR(trans)) {
9130                err = PTR_ERR(trans);
9131                goto out_free;
9132        }
9133
9134        if (block_rsv)
9135                trans->block_rsv = block_rsv;
9136
9137        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
9138                level = btrfs_header_level(root->node);
9139                path->nodes[level] = btrfs_lock_root_node(root);
9140                btrfs_set_lock_blocking(path->nodes[level]);
9141                path->slots[level] = 0;
9142                path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9143                memset(&wc->update_progress, 0,
9144                       sizeof(wc->update_progress));
9145        } else {
9146                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
9147                memcpy(&wc->update_progress, &key,
9148                       sizeof(wc->update_progress));
9149
9150                level = root_item->drop_level;
9151                BUG_ON(level == 0);
9152                path->lowest_level = level;
9153                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
9154                path->lowest_level = 0;
9155                if (ret < 0) {
9156                        err = ret;
9157                        goto out_end_trans;
9158                }
9159                WARN_ON(ret > 0);
9160
9161                /*
9162                 * unlock our path, this is safe because only this
9163                 * function is allowed to delete this snapshot
9164                 */
9165                btrfs_unlock_up_safe(path, 0);
9166
9167                level = btrfs_header_level(root->node);
9168                while (1) {
9169                        btrfs_tree_lock(path->nodes[level]);
9170                        btrfs_set_lock_blocking(path->nodes[level]);
9171                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9172
9173                        ret = btrfs_lookup_extent_info(trans, fs_info,
9174                                                path->nodes[level]->start,
9175                                                level, 1, &wc->refs[level],
9176                                                &wc->flags[level]);
9177                        if (ret < 0) {
9178                                err = ret;
9179                                goto out_end_trans;
9180                        }
9181                        BUG_ON(wc->refs[level] == 0);
9182
9183                        if (level == root_item->drop_level)
9184                                break;
9185
9186                        btrfs_tree_unlock(path->nodes[level]);
9187                        path->locks[level] = 0;
9188                        WARN_ON(wc->refs[level] != 1);
9189                        level--;
9190                }
9191        }
9192
9193        wc->level = level;
9194        wc->shared_level = -1;
9195        wc->stage = DROP_REFERENCE;
9196        wc->update_ref = update_ref;
9197        wc->keep_locks = 0;
9198        wc->for_reloc = for_reloc;
9199        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9200
9201        while (1) {
9203                ret = walk_down_tree(trans, root, path, wc);
9204                if (ret < 0) {
9205                        err = ret;
9206                        break;
9207                }
9208
9209                ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
9210                if (ret < 0) {
9211                        err = ret;
9212                        break;
9213                }
9214
9215                if (ret > 0) {
9216                        BUG_ON(wc->stage != DROP_REFERENCE);
9217                        break;
9218                }
9219
9220                if (wc->stage == DROP_REFERENCE) {
9221                        level = wc->level;
9222                        btrfs_node_key(path->nodes[level],
9223                                       &root_item->drop_progress,
9224                                       path->slots[level]);
9225                        root_item->drop_level = level;
9226                }
9227
9228                BUG_ON(wc->level == 0);
9229                if (btrfs_should_end_transaction(trans) ||
9230                    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
9231                        ret = btrfs_update_root(trans, tree_root,
9232                                                &root->root_key,
9233                                                root_item);
9234                        if (ret) {
9235                                btrfs_abort_transaction(trans, ret);
9236                                err = ret;
9237                                goto out_end_trans;
9238                        }
9239
9240                        btrfs_end_transaction_throttle(trans);
9241                        if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
9242                                btrfs_debug(fs_info,
9243                                            "drop snapshot early exit");
9244                                err = -EAGAIN;
9245                                goto out_free;
9246                        }
9247
9248                        trans = btrfs_start_transaction(tree_root, 0);
9249                        if (IS_ERR(trans)) {
9250                                err = PTR_ERR(trans);
9251                                goto out_free;
9252                        }
9253                        if (block_rsv)
9254                                trans->block_rsv = block_rsv;
9255                }
9256        }
9257        btrfs_release_path(path);
9258        if (err)
9259                goto out_end_trans;
9260
9261        ret = btrfs_del_root(trans, fs_info, &root->root_key);
9262        if (ret) {
9263                btrfs_abort_transaction(trans, ret);
9264                err = ret;
9265                goto out_end_trans;
9266        }
9267
9268        if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
9269                ret = btrfs_find_root(tree_root, &root->root_key, path,
9270                                      NULL, NULL);
9271                if (ret < 0) {
9272                        btrfs_abort_transaction(trans, ret);
9273                        err = ret;
9274                        goto out_end_trans;
9275                } else if (ret > 0) {
9276                        /* if we fail to delete the orphan item this time
9277                         * around, it'll get picked up the next time.
9278                         *
9279                         * The most common failure here is just -ENOENT.
9280                         */
9281                        btrfs_del_orphan_item(trans, tree_root,
9282                                              root->root_key.objectid);
9283                }
9284        }
9285
9286        if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
9287                btrfs_add_dropped_root(trans, root);
9288        } else {
9289                free_extent_buffer(root->node);
9290                free_extent_buffer(root->commit_root);
9291                btrfs_put_fs_root(root);
9292        }
9293        root_dropped = true;
9294out_end_trans:
9295        btrfs_end_transaction_throttle(trans);
9296out_free:
9297        kfree(wc);
9298        btrfs_free_path(path);
9299out:
9300        /*
9301         * So if we need to stop dropping the snapshot for whatever reason, we
9302         * need to make sure to add it back to the dead root list so that we
9303         * keep trying to do the work later.  This also cleans up roots that
9304         * aren't in the radix (like when we recover after a power failure
9305         * or unmount) so we don't leak memory.
9306         */
9307        if (!for_reloc && !root_dropped)
9308                btrfs_add_dead_root(root);
9309        if (err && err != -EAGAIN)
9310                btrfs_handle_fs_error(fs_info, err, NULL);
9311        return err;
9312}
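
/*
 * Illustrative invocation for the common "delete a dead subvolume" case,
 * with no extra block reservation and no backref updates (the cleaner
 * thread issues calls of this shape; the exact update_ref value depends on
 * the backref scheme of the root):
 *
 *	err = btrfs_drop_snapshot(root, NULL, 0, 0);
 *	if (err == -EAGAIN)
 *		return;
 *
 * On -EAGAIN the root has been re-added to the dead roots list above and
 * the drop will be retried later.
 */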
9313
9314/*
9315 * drop subtree rooted at tree block 'node'.
9316 *
9317 * NOTE: this function will unlock and release tree block 'node'.
9318 * Only used by the relocation code.
9319 */
9320int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
9321                        struct btrfs_root *root,
9322                        struct extent_buffer *node,
9323                        struct extent_buffer *parent)
9324{
9325        struct btrfs_fs_info *fs_info = root->fs_info;
9326        struct btrfs_path *path;
9327        struct walk_control *wc;
9328        int level;
9329        int parent_level;
9330        int ret = 0;
9331        int wret;
9332
9333        BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
9334
9335        path = btrfs_alloc_path();
9336        if (!path)
9337                return -ENOMEM;
9338
9339        wc = kzalloc(sizeof(*wc), GFP_NOFS);
9340        if (!wc) {
9341                btrfs_free_path(path);
9342                return -ENOMEM;
9343        }
9344
9345        btrfs_assert_tree_locked(parent);
9346        parent_level = btrfs_header_level(parent);
9347        extent_buffer_get(parent);
9348        path->nodes[parent_level] = parent;
9349        path->slots[parent_level] = btrfs_header_nritems(parent);
9350
9351        btrfs_assert_tree_locked(node);
9352        level = btrfs_header_level(node);
9353        path->nodes[level] = node;
9354        path->slots[level] = 0;
9355        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9356
9357        wc->refs[parent_level] = 1;
9358        wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
9359        wc->level = level;
9360        wc->shared_level = -1;
9361        wc->stage = DROP_REFERENCE;
9362        wc->update_ref = 0;
9363        wc->keep_locks = 1;
9364        wc->for_reloc = 1;
9365        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9366
9367        while (1) {
9368                wret = walk_down_tree(trans, root, path, wc);
9369                if (wret < 0) {
9370                        ret = wret;
9371                        break;
9372                }
9373
9374                wret = walk_up_tree(trans, root, path, wc, parent_level);
9375                if (wret < 0)
9376                        ret = wret;
9377                if (wret != 0)
9378                        break;
9379        }
9380
9381        kfree(wc);
9382        btrfs_free_path(path);
9383        return ret;
9384}
9385
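/*
 * Work out which profile a block group with @flags should be converted to,
 * given the current number of writable devices: honour a pending restripe
 * target if one is set, otherwise degrade mirrored profiles on a single
 * device (RAID1/RAID10 -> DUP, RAID0 -> single) or promote DUP -> RAID1
 * when more devices are available.
 */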
9386static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
9387{
9388        u64 num_devices;
9389        u64 stripped;
9390
9391        /*
9392         * If restripe for this chunk_type is on, pick the target profile and
9393         * return it; otherwise do the usual balance.
9394         */
9395        stripped = get_restripe_target(fs_info, flags);
9396        if (stripped)
9397                return extended_to_chunk(stripped);
9398
9399        num_devices = fs_info->fs_devices->rw_devices;
9400
9401        stripped = BTRFS_BLOCK_GROUP_RAID0 |
9402                BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
9403                BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
9404
9405        if (num_devices == 1) {
9406                stripped |= BTRFS_BLOCK_GROUP_DUP;
9407                stripped = flags & ~stripped;
9408
9409                /* turn raid0 into single device chunks */
9410                if (flags & BTRFS_BLOCK_GROUP_RAID0)
9411                        return stripped;
9412
9413                /* turn mirroring into duplication */
9414                if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
9415                             BTRFS_BLOCK_GROUP_RAID10))
9416                        return stripped | BTRFS_BLOCK_GROUP_DUP;
9417        } else {
9418                /* they already had raid on here, just return */
9419                if (flags & stripped)
9420                        return flags;
9421
9422                stripped |= BTRFS_BLOCK_GROUP_DUP;
9423                stripped = flags & ~stripped;
9424
9425                /* switch duplicated blocks with raid1 */
9426                if (flags & BTRFS_BLOCK_GROUP_DUP)
9427                        return stripped | BTRFS_BLOCK_GROUP_RAID1;
9428
9429                /* this is drive concat, leave it alone */
9430        }
9431
9432        return flags;
9433}
9434
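/*
 * Mark a block group read-only if its space_info can spare the group's
 * unused bytes.  Bumps cache->ro, accounts the unused space as
 * bytes_readonly and puts the group on the space_info's ro_bgs list.
 * Returns 0 on success or -ENOSPC if the space cannot be spared.
 */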
9435static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9436{
9437        struct btrfs_space_info *sinfo = cache->space_info;
9438        u64 num_bytes;
9439        u64 min_allocable_bytes;
9440        int ret = -ENOSPC;
9441
9442        /*
9443         * We need some metadata space and system metadata space for
9444         * allocating chunks in some corner cases, so unless we are forcing
9445         * the block group read-only, keep a minimum amount allocatable.
9446         */
9447        if ((sinfo->flags &
9448             (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9449            !force)
9450                min_allocable_bytes = SZ_1M;
9451        else
9452                min_allocable_bytes = 0;
9453
9454        spin_lock(&sinfo->lock);
9455        spin_lock(&cache->lock);
9456
9457        if (cache->ro) {
9458                cache->ro++;
9459                ret = 0;
9460                goto out;
9461        }
9462
9463        num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9464                    cache->bytes_super - btrfs_block_group_used(&cache->item);
9465
9466        if (btrfs_space_info_used(sinfo, true) + num_bytes +
9467            min_allocable_bytes <= sinfo->total_bytes) {
9468                sinfo->bytes_readonly += num_bytes;
9469                cache->ro++;
9470                list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9471                ret = 0;
9472        }
9473out:
9474        spin_unlock(&cache->lock);
9475        spin_unlock(&sinfo->lock);
9476        return ret;
9477}
9478
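/*
 * Public entry point to make a block group read-only.  Joins a transaction,
 * waits out any dirty block group cache writeout that already started, and
 * if needed force-allocates a new chunk so inc_block_group_ro() has room to
 * succeed.
 */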
9479int btrfs_inc_block_group_ro(struct btrfs_fs_info *fs_info,
9480                             struct btrfs_block_group_cache *cache)
9481
9482{
9483        struct btrfs_trans_handle *trans;
9484        u64 alloc_flags;
9485        int ret;
9486
9487again:
9488        trans = btrfs_join_transaction(fs_info->extent_root);
9489        if (IS_ERR(trans))
9490                return PTR_ERR(trans);
9491
9492        /*
9493         * We're not allowed to set block groups read-only after dirty block
9494         * group cache writeout has started.  If it already started, back off
9495         * and let this transaction commit.
9496         */
9497        mutex_lock(&fs_info->ro_block_group_mutex);
9498        if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
9499                u64 transid = trans->transid;
9500
9501                mutex_unlock(&fs_info->ro_block_group_mutex);
9502                btrfs_end_transaction(trans);
9503
9504                ret = btrfs_wait_for_commit(fs_info, transid);
9505                if (ret)
9506                        return ret;
9507                goto again;
9508        }
9509
9510        /*
9511         * if we are changing raid levels, try to allocate a corresponding
9512         * block group with the new raid level.
9513         */
9514        alloc_flags = update_block_group_flags(fs_info, cache->flags);
9515        if (alloc_flags != cache->flags) {
9516                ret = do_chunk_alloc(trans, fs_info, alloc_flags,
9517                                     CHUNK_ALLOC_FORCE);
9518                /*
9519                 * ENOSPC is allowed here, we may have enough space
9520                 * already allocated at the new raid level to
9521                 * carry on
9522                 */
9523                if (ret == -ENOSPC)
9524                        ret = 0;
9525                if (ret < 0)
9526                        goto out;
9527        }
9528
9529        ret = inc_block_group_ro(cache, 0);
9530        if (!ret)
9531                goto out;
9532        alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
9533        ret = do_chunk_alloc(trans, fs_info, alloc_flags,
9534                             CHUNK_ALLOC_FORCE);
9535        if (ret < 0)
9536                goto out;
9537        ret = inc_block_group_ro(cache, 0);
9538out:
9539        if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
9540                alloc_flags = update_block_group_flags(fs_info, cache->flags);
9541                mutex_lock(&fs_info->chunk_mutex);
9542                check_system_chunk(trans, fs_info, alloc_flags);
9543                mutex_unlock(&fs_info->chunk_mutex);
9544        }
9545        mutex_unlock(&fs_info->ro_block_group_mutex);
9546
9547        btrfs_end_transaction(trans);
9548        return ret;
9549}
9550
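/*
 * Force the allocation of a new chunk of @type using the current allocation
 * profile for that type.
 */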
9551int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
9552                            struct btrfs_fs_info *fs_info, u64 type)
9553{
9554        u64 alloc_flags = get_alloc_profile(fs_info, type);
9555
9556        return do_chunk_alloc(trans, fs_info, alloc_flags, CHUNK_ALLOC_FORCE);
9557}
9558
9559/*
9560 * Helper to account the unused space of all the read-only block groups in
9561 * the space_info.  Takes mirrors into account.
9562 */
9563u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
9564{
9565        struct btrfs_block_group_cache *block_group;
9566        u64 free_bytes = 0;
9567        int factor;
9568
9569        /* It's df, we don't care if it's racy */
9570        if (list_empty(&sinfo->ro_bgs))
9571                return 0;
9572
9573        spin_lock(&sinfo->lock);
9574        list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
9575                spin_lock(&block_group->lock);
9576
9577                if (!block_group->ro) {
9578                        spin_unlock(&block_group->lock);
9579                        continue;
9580                }
9581
9582                if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
9583                                          BTRFS_BLOCK_GROUP_RAID10 |
9584                                          BTRFS_BLOCK_GROUP_DUP))
9585                        factor = 2;
9586                else
9587                        factor = 1;
9588
9589                free_bytes += (block_group->key.offset -
9590                               btrfs_block_group_used(&block_group->item)) *
9591                               factor;
9592
9593                spin_unlock(&block_group->lock);
9594        }
9595        spin_unlock(&sinfo->lock);
9596
9597        return free_bytes;
9598}
9599
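/*
 * Drop one read-only reference on a block group.  When the last reference
 * goes away, the unused bytes are returned to the space_info and the group
 * is removed from the ro_bgs list.
 */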
9600void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
9601{
9602        struct btrfs_space_info *sinfo = cache->space_info;
9603        u64 num_bytes;
9604
9605        BUG_ON(!cache->ro);
9606
9607        spin_lock(&sinfo->lock);
9608        spin_lock(&cache->lock);
9609        if (!--cache->ro) {
9610                num_bytes = cache->key.offset - cache->reserved -
9611                            cache->pinned - cache->bytes_super -
9612                            btrfs_block_group_used(&cache->item);
9613                sinfo->bytes_readonly -= num_bytes;
9614                list_del_init(&cache->ro_list);
9615        }
9616        spin_unlock(&cache->lock);
9617        spin_unlock(&sinfo->lock);
9618}
9619
9620/*
9621 * Checks to see if it's even possible to relocate this block group.
9622 *
9623 * @return - -1 if it's not a good idea to relocate this block group, 0 if
9624 * it's ok to go ahead and try.
9625 */
9626int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
9627{
9628        struct btrfs_root *root = fs_info->extent_root;
9629        struct btrfs_block_group_cache *block_group;
9630        struct btrfs_space_info *space_info;
9631        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
9632        struct btrfs_device *device;
9633        struct btrfs_trans_handle *trans;
9634        u64 min_free;
9635        u64 dev_min = 1;
9636        u64 dev_nr = 0;
9637        u64 target;
9638        int debug;
9639        int index;
9640        int full = 0;
9641        int ret = 0;
9642
9643        debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
9644
9645        block_group = btrfs_lookup_block_group(fs_info, bytenr);
9646
9647        /* odd, couldn't find the block group, leave it alone */
9648        if (!block_group) {
9649                if (debug)
9650                        btrfs_warn(fs_info,
9651                                   "can't find block group for bytenr %llu",
9652                                   bytenr);
9653                return -1;
9654        }
9655
9656        min_free = btrfs_block_group_used(&block_group->item);
9657
9658        /* no bytes used, we're good */
9659        if (!min_free)
9660                goto out;
9661
9662        space_info = block_group->space_info;
9663        spin_lock(&space_info->lock);
9664
9665        full = space_info->full;
9666
9667        /*
9668         * if this is the last block group we have in this space, we can't
9669         * relocate it unless we're able to allocate a new chunk below.
9670         *
9671         * Otherwise, we need to make sure we have room in the space to handle
9672         * all of the extents from this block group.  If we can, we're good
9673         */
9674        if ((space_info->total_bytes != block_group->key.offset) &&
9675            (btrfs_space_info_used(space_info, false) + min_free <
9676             space_info->total_bytes)) {
9677                spin_unlock(&space_info->lock);
9678                goto out;
9679        }
9680        spin_unlock(&space_info->lock);
9681
9682        /*
9683         * ok we don't have enough space, but maybe we have free space on our
9684         * devices to allocate new chunks for relocation, so loop through our
9685         * alloc devices and guess if we have enough space.  if this block
9686         * group is going to be restriped, run checks against the target
9687         * profile instead of the current one.
9688         */
9689        ret = -1;
9690
9691        /*
9692         * index:
9693         *      0: raid10
9694         *      1: raid1
9695         *      2: dup
9696         *      3: raid0
9697         *      4: single
9698         */
9699        target = get_restripe_target(fs_info, block_group->flags);
9700        if (target) {
9701                index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
9702        } else {
9703                /*
9704                 * this is just a balance, so if we were marked as full
9705                 * we know there is no space for a new chunk
9706                 */
9707                if (full) {
9708                        if (debug)
9709                                btrfs_warn(fs_info,
9710                                           "no space to alloc new chunk for block group %llu",
9711                                           block_group->key.objectid);
9712                        goto out;
9713                }
9714
9715                index = btrfs_bg_flags_to_raid_index(block_group->flags);
9716        }
9717
9718        if (index == BTRFS_RAID_RAID10) {
9719                dev_min = 4;
9720                /* Divide by 2 */
9721                min_free >>= 1;
9722        } else if (index == BTRFS_RAID_RAID1) {
9723                dev_min = 2;
9724        } else if (index == BTRFS_RAID_DUP) {
9725                /* Multiply by 2 */
9726                min_free <<= 1;
9727        } else if (index == BTRFS_RAID_RAID0) {
9728                dev_min = fs_devices->rw_devices;
9729                min_free = div64_u64(min_free, dev_min);
9730        }
9731
9732        /* We need to do this so that we can look at pending chunks */
9733        trans = btrfs_join_transaction(root);
9734        if (IS_ERR(trans)) {
9735                ret = PTR_ERR(trans);
9736                goto out;
9737        }
9738
9739        mutex_lock(&fs_info->chunk_mutex);
9740        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9741                u64 dev_offset;
9742
9743                /*
9744                 * check to make sure we can actually find a chunk with enough
9745                 * space to fit our block group in.
9746                 */
9747                if (device->total_bytes > device->bytes_used + min_free &&
9748                    !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
9749                        ret = find_free_dev_extent(trans, device, min_free,
9750                                                   &dev_offset, NULL);
9751                        if (!ret)
9752                                dev_nr++;
9753
9754                        if (dev_nr >= dev_min)
9755                                break;
9756
9757                        ret = -1;
9758                }
9759        }
9760        if (debug && ret == -1)
9761                btrfs_warn(fs_info,
9762                           "no space to allocate a new chunk for block group %llu",
9763                           block_group->key.objectid);
9764        mutex_unlock(&fs_info->chunk_mutex);
9765        btrfs_end_transaction(trans);
9766out:
9767        btrfs_put_block_group(block_group);
9768        return ret;
9769}
9770
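/*
 * Find the first BLOCK_GROUP_ITEM at or after @key in the extent tree and
 * leave @path pointing at it.  Returns 0 on success (after verifying that a
 * chunk mapping exists for the block group), -ENOENT if the mapping is
 * missing, > 0 if there are no more block group items, or < 0 on error.
 */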
9771static int find_first_block_group(struct btrfs_fs_info *fs_info,
9772                                  struct btrfs_path *path,
9773                                  struct btrfs_key *key)
9774{
9775        struct btrfs_root *root = fs_info->extent_root;
9776        int ret = 0;
9777        struct btrfs_key found_key;
9778        struct extent_buffer *leaf;
9779        int slot;
9780
9781        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9782        if (ret < 0)
9783                goto out;
9784
9785        while (1) {
9786                slot = path->slots[0];
9787                leaf = path->nodes[0];
9788                if (slot >= btrfs_header_nritems(leaf)) {
9789                        ret = btrfs_next_leaf(root, path);
9790                        if (ret == 0)
9791                                continue;
9792                        if (ret < 0)
9793                                goto out;
9794                        break;
9795                }
9796                btrfs_item_key_to_cpu(leaf, &found_key, slot);
9797
9798                if (found_key.objectid >= key->objectid &&
9799                    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9800                        struct extent_map_tree *em_tree;
9801                        struct extent_map *em;
9802
9803                        em_tree = &root->fs_info->mapping_tree.map_tree;
9804                        read_lock(&em_tree->lock);
9805                        em = lookup_extent_mapping(em_tree, found_key.objectid,
9806                                                   found_key.offset);
9807                        read_unlock(&em_tree->lock);
9808                        if (!em) {
9809                                btrfs_err(fs_info,
9810                        "logical %llu len %llu found bg but no related chunk",
9811                                          found_key.objectid, found_key.offset);
9812                                ret = -ENOENT;
9813                        } else {
9814                                ret = 0;
9815                        }
9816                        free_extent_map(em);
9817                        goto out;
9818                }
9819                path->slots[0]++;
9820        }
9821out:
9822        return ret;
9823}
9824
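/*
 * Walk all block groups and drop the inode reference (iref) each of them
 * may still hold on its free space cache inode.
 */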
9825void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
9826{
9827        struct btrfs_block_group_cache *block_group;
9828        u64 last = 0;
9829
9830        while (1) {
9831                struct inode *inode;
9832
9833                block_group = btrfs_lookup_first_block_group(info, last);
9834                while (block_group) {
9835                        spin_lock(&block_group->lock);
9836                        if (block_group->iref)
9837                                break;
9838                        spin_unlock(&block_group->lock);
9839                        block_group = next_block_group(info, block_group);
9840                }
9841                if (!block_group) {
9842                        if (last == 0)
9843                                break;
9844                        last = 0;
9845                        continue;
9846                }
9847
9848                inode = block_group->inode;
9849                block_group->iref = 0;
9850                block_group->inode = NULL;
9851                spin_unlock(&block_group->lock);
9852                ASSERT(block_group->io_ctl.inode == NULL);
9853                iput(inode);
9854                last = block_group->key.objectid + block_group->key.offset;
9855                btrfs_put_block_group(block_group);
9856        }
9857}
9858
9859/*
9860 * Must be called only after stopping all workers, since we could have block
9861 * group caching kthreads running, and therefore they could race with us if we
9862 * freed the block groups before stopping them.
9863 */
9864int btrfs_free_block_groups(struct btrfs_fs_info *info)
9865{
9866        struct btrfs_block_group_cache *block_group;
9867        struct btrfs_space_info *space_info;
9868        struct btrfs_caching_control *caching_ctl;
9869        struct rb_node *n;
9870
9871        down_write(&info->commit_root_sem);
9872        while (!list_empty(&info->caching_block_groups)) {
9873                caching_ctl = list_entry(info->caching_block_groups.next,
9874                                         struct btrfs_caching_control, list);
9875                list_del(&caching_ctl->list);
9876                put_caching_control(caching_ctl);
9877        }
9878        up_write(&info->commit_root_sem);
9879
9880        spin_lock(&info->unused_bgs_lock);
9881        while (!list_empty(&info->unused_bgs)) {
9882                block_group = list_first_entry(&info->unused_bgs,
9883                                               struct btrfs_block_group_cache,
9884                                               bg_list);
9885                list_del_init(&block_group->bg_list);
9886                btrfs_put_block_group(block_group);
9887        }
9888        spin_unlock(&info->unused_bgs_lock);
9889
9890        spin_lock(&info->block_group_cache_lock);
9891        while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
9892                block_group = rb_entry(n, struct btrfs_block_group_cache,
9893                                       cache_node);
9894                rb_erase(&block_group->cache_node,
9895                         &info->block_group_cache_tree);
9896                RB_CLEAR_NODE(&block_group->cache_node);
9897                spin_unlock(&info->block_group_cache_lock);
9898
9899                down_write(&block_group->space_info->groups_sem);
9900                list_del(&block_group->list);
9901                up_write(&block_group->space_info->groups_sem);
9902
9903                /*
9904                 * We haven't cached this block group, which means we could
9905                 * possibly have excluded extents on this block group.
9906                 */
9907                if (block_group->cached == BTRFS_CACHE_NO ||
9908                    block_group->cached == BTRFS_CACHE_ERROR)
9909                        free_excluded_extents(info, block_group);
9910
9911                btrfs_remove_free_space_cache(block_group);
9912                ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
9913                ASSERT(list_empty(&block_group->dirty_list));
9914                ASSERT(list_empty(&block_group->io_list));
9915                ASSERT(list_empty(&block_group->bg_list));
9916                ASSERT(atomic_read(&block_group->count) == 1);
9917                btrfs_put_block_group(block_group);
9918
9919                spin_lock(&info->block_group_cache_lock);
9920        }
9921        spin_unlock(&info->block_group_cache_lock);
9922
9923        /* now that all the block groups are freed, go through and
9924         * free all the space_info structs.  This is only called during
9925         * the final stages of unmount, and so we know nobody is
9926         * using them.  We call synchronize_rcu() once before we start,
9927         * just to be on the safe side.
9928         */
9929        synchronize_rcu();
9930
9931        release_global_block_rsv(info);
9932
9933        while (!list_empty(&info->space_info)) {
9934                int i;
9935
9936                space_info = list_entry(info->space_info.next,
9937                                        struct btrfs_space_info,
9938                                        list);
9939
9940                /*
9941                 * Do not hide this behind enospc_debug, this is actually
9942                 * important and indicates a real bug if this happens.
9943                 */
9944                if (WARN_ON(space_info->bytes_pinned > 0 ||
9945                            space_info->bytes_reserved > 0 ||
9946                            space_info->bytes_may_use > 0))
9947                        dump_space_info(info, space_info, 0, 0);
9948                list_del(&space_info->list);
9949                for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
9950                        struct kobject *kobj;
9951                        kobj = space_info->block_group_kobjs[i];
9952                        space_info->block_group_kobjs[i] = NULL;
9953                        if (kobj) {
9954                                kobject_del(kobj);
9955                                kobject_put(kobj);
9956                        }
9957                }
9958                kobject_del(&space_info->kobj);
9959                kobject_put(&space_info->kobj);
9960        }
9961        return 0;
9962}
9963
9964/* link_block_group will queue up kobjects to add when we're reclaim-safe */
9965void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
9966{
9967        struct btrfs_space_info *space_info;
9968        struct raid_kobject *rkobj;
9969        LIST_HEAD(list);
9970        int index;
9971        int ret = 0;
9972
9973        spin_lock(&fs_info->pending_raid_kobjs_lock);
9974        list_splice_init(&fs_info->pending_raid_kobjs, &list);
9975        spin_unlock(&fs_info->pending_raid_kobjs_lock);
9976
9977        list_for_each_entry(rkobj, &list, list) {
9978                space_info = __find_space_info(fs_info, rkobj->flags);
9979                index = btrfs_bg_flags_to_raid_index(rkobj->flags);
9980
9981                ret = kobject_add(&rkobj->kobj, &space_info->kobj,
9982                                  "%s", get_raid_name(index));
9983                if (ret) {
9984                        kobject_put(&rkobj->kobj);
9985                        break;
9986                }
9987        }
9988        if (ret)
9989                btrfs_warn(fs_info,
9990                           "failed to add kobject for block cache, ignoring");
9991}
9992
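/*
 * Add a block group to its space_info's per-RAID-index list.  The first
 * group of a given profile also queues a raid kobject so that the sysfs
 * directory can be added later by btrfs_add_raid_kobjects().
 */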
9993static void link_block_group(struct btrfs_block_group_cache *cache)
9994{
9995        struct btrfs_space_info *space_info = cache->space_info;
9996        struct btrfs_fs_info *fs_info = cache->fs_info;
9997        int index = btrfs_bg_flags_to_raid_index(cache->flags);
9998        bool first = false;
9999
10000        down_write(&space_info->groups_sem);
10001        if (list_empty(&space_info->block_groups[index]))
10002                first = true;
10003        list_add_tail(&cache->list, &space_info->block_groups[index]);
10004        up_write(&space_info->groups_sem);
10005
10006        if (first) {
10007                struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
10008                if (!rkobj) {
10009                        btrfs_warn(cache->fs_info,
10010                                "couldn't alloc memory for raid level kobject");
10011                        return;
10012                }
10013                rkobj->flags = cache->flags;
10014                kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
10015
10016                spin_lock(&fs_info->pending_raid_kobjs_lock);
10017                list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
10018                spin_unlock(&fs_info->pending_raid_kobjs_lock);
10019                space_info->block_group_kobjs[index] = &rkobj->kobj;
10020        }
10021}
10022
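/*
 * Allocate and initialize an in-memory block group cache for the range
 * [start, start + size).  The caller is responsible for filling in the
 * on-disk item and inserting the cache into the block group rbtree.
 */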
10023static struct btrfs_block_group_cache *
10024btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
10025                               u64 start, u64 size)
10026{
10027        struct btrfs_block_group_cache *cache;
10028
10029        cache = kzalloc(sizeof(*cache), GFP_NOFS);
10030        if (!cache)
10031                return NULL;
10032
10033        cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
10034                                        GFP_NOFS);
10035        if (!cache->free_space_ctl) {
10036                kfree(cache);
10037                return NULL;
10038        }
10039
10040        cache->key.objectid = start;
10041        cache->key.offset = size;
10042        cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10043
10044        cache->fs_info = fs_info;
10045        cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
10046        set_free_space_tree_thresholds(cache);
10047
10048        atomic_set(&cache->count, 1);
10049        spin_lock_init(&cache->lock);
10050        init_rwsem(&cache->data_rwsem);
10051        INIT_LIST_HEAD(&cache->list);
10052        INIT_LIST_HEAD(&cache->cluster_list);
10053        INIT_LIST_HEAD(&cache->bg_list);
10054        INIT_LIST_HEAD(&cache->ro_list);
10055        INIT_LIST_HEAD(&cache->dirty_list);
10056        INIT_LIST_HEAD(&cache->io_list);
10057        btrfs_init_free_space_ctl(cache);
10058        atomic_set(&cache->trimming, 0);
10059        mutex_init(&cache->free_space_lock);
10060        btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
10061
10062        return cache;
10063}
10064
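/*
 * Load every block group item from the extent tree at mount time, create
 * the in-memory caches, hook them up to their space_info, and mark
 * read-only or completely unused block groups accordingly.
 */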
10065int btrfs_read_block_groups(struct btrfs_fs_info *info)
10066{
10067        struct btrfs_path *path;
10068        int ret;
10069        struct btrfs_block_group_cache *cache;
10070        struct btrfs_space_info *space_info;
10071        struct btrfs_key key;
10072        struct btrfs_key found_key;
10073        struct extent_buffer *leaf;
10074        int need_clear = 0;
10075        u64 cache_gen;
10076        u64 feature;
10077        int mixed;
10078
10079        feature = btrfs_super_incompat_flags(info->super_copy);
10080        mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
10081
10082        key.objectid = 0;
10083        key.offset = 0;
10084        key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10085        path = btrfs_alloc_path();
10086        if (!path)
10087                return -ENOMEM;
10088        path->reada = READA_FORWARD;
10089
10090        cache_gen = btrfs_super_cache_generation(info->super_copy);
10091        if (btrfs_test_opt(info, SPACE_CACHE) &&
10092            btrfs_super_generation(info->super_copy) != cache_gen)
10093                need_clear = 1;
10094        if (btrfs_test_opt(info, CLEAR_CACHE))
10095                need_clear = 1;
10096
10097        while (1) {
10098                ret = find_first_block_group(info, path, &key);
10099                if (ret > 0)
10100                        break;
10101                if (ret != 0)
10102                        goto error;
10103
10104                leaf = path->nodes[0];
10105                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
10106
10107                cache = btrfs_create_block_group_cache(info, found_key.objectid,
10108                                                       found_key.offset);
10109                if (!cache) {
10110                        ret = -ENOMEM;
10111                        goto error;
10112                }
10113
10114                if (need_clear) {
10115                        /*
10116                         * When we mount with an old space cache, we need to
10117                         * set BTRFS_DC_CLEAR and the dirty flag.
10118                         *
10119                         * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
10120                         *    truncate the old free space cache inode and
10121                         *    set up a new one.
10122                         * b) Setting the dirty flag makes sure that we flush
10123                         *    the new space cache info onto disk.
10124                         */
10125                        if (btrfs_test_opt(info, SPACE_CACHE))
10126                                cache->disk_cache_state = BTRFS_DC_CLEAR;
10127                }
10128
10129                read_extent_buffer(leaf, &cache->item,
10130                                   btrfs_item_ptr_offset(leaf, path->slots[0]),
10131                                   sizeof(cache->item));
10132                cache->flags = btrfs_block_group_flags(&cache->item);
10133                if (!mixed &&
10134                    ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
10135                    (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
10136                        btrfs_err(info,
10137"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
10138                                  cache->key.objectid);
10139                        ret = -EINVAL;
10140                        goto error;
10141                }
10142
10143                key.objectid = found_key.objectid + found_key.offset;
10144                btrfs_release_path(path);
10145
10146                /*
10147                 * We need to exclude the super stripes now so that the space
10148                 * info has super bytes accounted for, otherwise we'll think
10149                 * we have more space than we actually do.
10150                 */
10151                ret = exclude_super_stripes(info, cache);
10152                if (ret) {
10153                        /*
10154                         * We may have excluded something, so call this just in
10155                         * case.
10156                         */
10157                        free_excluded_extents(info, cache);
10158                        btrfs_put_block_group(cache);
10159                        goto error;
10160                }
10161
10162                /*
10163                 * check for two cases, either we are full, and therefore
10164                 * don't need to bother with the caching work since we won't
10165                 * find any space, or we are empty, and we can just add all
10166                 * the space in and be done with it.  This saves us a lot of
10167                 * time, particularly in the full case.
10168                 */
10169                if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10170                        cache->last_byte_to_unpin = (u64)-1;
10171                        cache->cached = BTRFS_CACHE_FINISHED;
10172                        free_excluded_extents(info, cache);
10173                } else if (btrfs_block_group_used(&cache->item) == 0) {
10174                        cache->last_byte_to_unpin = (u64)-1;
10175                        cache->cached = BTRFS_CACHE_FINISHED;
10176                        add_new_free_space(cache, info,
10177                                           found_key.objectid,
10178                                           found_key.objectid +
10179                                           found_key.offset);
10180                        free_excluded_extents(info, cache);
10181                }
10182
10183                ret = btrfs_add_block_group_cache(info, cache);
10184                if (ret) {
10185                        btrfs_remove_free_space_cache(cache);
10186                        btrfs_put_block_group(cache);
10187                        goto error;
10188                }
10189
10190                trace_btrfs_add_block_group(info, cache, 0);
10191                update_space_info(info, cache->flags, found_key.offset,
10192                                  btrfs_block_group_used(&cache->item),
10193                                  cache->bytes_super, &space_info);
10194
10195                cache->space_info = space_info;
10196
10197                link_block_group(cache);
10198
10199                set_avail_alloc_bits(info, cache->flags);
10200                if (btrfs_chunk_readonly(info, cache->key.objectid)) {
10201                        inc_block_group_ro(cache, 1);
10202                } else if (btrfs_block_group_used(&cache->item) == 0) {
10203                        spin_lock(&info->unused_bgs_lock);
10204                        /* Should always be true but just in case. */
10205                        if (list_empty(&cache->bg_list)) {
10206                                btrfs_get_block_group(cache);
10207                                list_add_tail(&cache->bg_list,
10208                                              &info->unused_bgs);
10209                        }
10210                        spin_unlock(&info->unused_bgs_lock);
10211                }
10212        }
10213
10214        list_for_each_entry_rcu(space_info, &info->space_info, list) {
10215                if (!(get_alloc_profile(info, space_info->flags) &
10216                      (BTRFS_BLOCK_GROUP_RAID10 |
10217                       BTRFS_BLOCK_GROUP_RAID1 |
10218                       BTRFS_BLOCK_GROUP_RAID5 |
10219                       BTRFS_BLOCK_GROUP_RAID6 |
10220                       BTRFS_BLOCK_GROUP_DUP)))
10221                        continue;
10222                /*
10223                 * Avoid allocating from un-mirrored block groups if there are
10224                 * mirrored block groups.
10225                 */
10226                list_for_each_entry(cache,
10227                                &space_info->block_groups[BTRFS_RAID_RAID0],
10228                                list)
10229                        inc_block_group_ro(cache, 1);
10230                list_for_each_entry(cache,
10231                                &space_info->block_groups[BTRFS_RAID_SINGLE],
10232                                list)
10233                        inc_block_group_ro(cache, 1);
10234        }
10235
10236        btrfs_add_raid_kobjects(info);
10237        init_global_block_rsv(info);
10238        ret = 0;
10239error:
10240        btrfs_free_path(path);
10241        return ret;
10242}
10243
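/*
 * Insert the block group items for all block groups created in this
 * transaction (trans->new_bgs) into the extent tree, finish their chunk
 * allocation and add their free space tree entries.
 */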
10244void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
10245{
10246        struct btrfs_fs_info *fs_info = trans->fs_info;
10247        struct btrfs_block_group_cache *block_group, *tmp;
10248        struct btrfs_root *extent_root = fs_info->extent_root;
10249        struct btrfs_block_group_item item;
10250        struct btrfs_key key;
10251        int ret = 0;
10252        bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
10253
10254        trans->can_flush_pending_bgs = false;
10255        list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
10256                if (ret)
10257                        goto next;
10258
10259                spin_lock(&block_group->lock);
10260                memcpy(&item, &block_group->item, sizeof(item));
10261                memcpy(&key, &block_group->key, sizeof(key));
10262                spin_unlock(&block_group->lock);
10263
10264                ret = btrfs_insert_item(trans, extent_root, &key, &item,
10265                                        sizeof(item));
10266                if (ret)
10267                        btrfs_abort_transaction(trans, ret);
10268                ret = btrfs_finish_chunk_alloc(trans, fs_info, key.objectid,
10269                                               key.offset);
10270                if (ret)
10271                        btrfs_abort_transaction(trans, ret);
10272                add_block_group_free_space(trans, fs_info, block_group);
10273                /* already aborted the transaction if it failed. */
10274next:
10275                list_del_init(&block_group->bg_list);
10276        }
10277        trans->can_flush_pending_bgs = can_flush_pending_bgs;
10278}
10279
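/*
 * Create the in-memory block group for a newly allocated chunk, account it
 * in its space_info and queue it on trans->new_bgs so that its item is
 * inserted by btrfs_create_pending_block_groups().
 */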
10280int btrfs_make_block_group(struct btrfs_trans_handle *trans,
10281                           struct btrfs_fs_info *fs_info, u64 bytes_used,
10282                           u64 type, u64 chunk_offset, u64 size)
10283{
10284        struct btrfs_block_group_cache *cache;
10285        int ret;
10286
10287        btrfs_set_log_full_commit(fs_info, trans);
10288
10289        cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
10290        if (!cache)
10291                return -ENOMEM;
10292
10293        btrfs_set_block_group_used(&cache->item, bytes_used);
10294        btrfs_set_block_group_chunk_objectid(&cache->item,
10295                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
10296        btrfs_set_block_group_flags(&cache->item, type);
10297
10298        cache->flags = type;
10299        cache->last_byte_to_unpin = (u64)-1;
10300        cache->cached = BTRFS_CACHE_FINISHED;
10301        cache->needs_free_space = 1;
10302        ret = exclude_super_stripes(fs_info, cache);
10303        if (ret) {
10304                /*
10305                 * We may have excluded something, so call this just in
10306                 * case.
10307                 */
10308                free_excluded_extents(fs_info, cache);
10309                btrfs_put_block_group(cache);
10310                return ret;
10311        }
10312
10313        add_new_free_space(cache, fs_info, chunk_offset, chunk_offset + size);
10314
10315        free_excluded_extents(fs_info, cache);
10316
10317#ifdef CONFIG_BTRFS_DEBUG
10318        if (btrfs_should_fragment_free_space(cache)) {
10319                u64 new_bytes_used = size - bytes_used;
10320
10321                bytes_used += new_bytes_used >> 1;
10322                fragment_free_space(cache);
10323        }
10324#endif
10325        /*
10326         * Ensure the corresponding space_info object is created and
10327         * assigned to our block group. We want our bg to be added to the rbtree
10328         * with its ->space_info set.
10329         */
10330        cache->space_info = __find_space_info(fs_info, cache->flags);
10331        ASSERT(cache->space_info);
10332
10333        ret = btrfs_add_block_group_cache(fs_info, cache);
10334        if (ret) {
10335                btrfs_remove_free_space_cache(cache);
10336                btrfs_put_block_group(cache);
10337                return ret;
10338        }
10339
10340        /*
10341         * Now that our block group has its ->space_info set and is inserted in
10342         * the rbtree, update the space info's counters.
10343         */
10344        trace_btrfs_add_block_group(fs_info, cache, 1);
10345        update_space_info(fs_info, cache->flags, size, bytes_used,
10346                                cache->bytes_super, &cache->space_info);
10347        update_global_block_rsv(fs_info);
10348
10349        link_block_group(cache);
10350
10351        list_add_tail(&cache->bg_list, &trans->new_bgs);
10352
10353        set_avail_alloc_bits(fs_info, type);
10354        return 0;
10355}
10356
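/*
 * Clear the extended profile bits of @flags from the per-type
 * avail_*_alloc_bits masks; called when the last block group of a profile
 * goes away.
 */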
10357static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10358{
10359        u64 extra_flags = chunk_to_extended(flags) &
10360                                BTRFS_EXTENDED_PROFILE_MASK;
10361
10362        write_seqlock(&fs_info->profiles_lock);
10363        if (flags & BTRFS_BLOCK_GROUP_DATA)
10364                fs_info->avail_data_alloc_bits &= ~extra_flags;
10365        if (flags & BTRFS_BLOCK_GROUP_METADATA)
10366                fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10367        if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10368                fs_info->avail_system_alloc_bits &= ~extra_flags;
10369        write_sequnlock(&fs_info->profiles_lock);
10370}
10371
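/*
 * Remove a read-only block group that is no longer in use: drop its free
 * space cache inode, take it out of the lookup structures and space_info
 * accounting, delete its item from the extent tree and, unless a trim is
 * still running on it, drop its chunk mapping as well.
 */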
10372int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10373                             struct btrfs_fs_info *fs_info, u64 group_start,
10374                             struct extent_map *em)
10375{
10376        struct btrfs_root *root = fs_info->extent_root;
10377        struct btrfs_path *path;
10378        struct btrfs_block_group_cache *block_group;
10379        struct btrfs_free_cluster *cluster;
10380        struct btrfs_root *tree_root = fs_info->tree_root;
10381        struct btrfs_key key;
10382        struct inode *inode;
10383        struct kobject *kobj = NULL;
10384        int ret;
10385        int index;
10386        int factor;
10387        struct btrfs_caching_control *caching_ctl = NULL;
10388        bool remove_em;
10389
10390        block_group = btrfs_lookup_block_group(fs_info, group_start);
10391        BUG_ON(!block_group);
10392        BUG_ON(!block_group->ro);
10393
10394        /*
10395         * Free the reserved super bytes from this block group before
10396         * removing it.
10397         */
10398        free_excluded_extents(fs_info, block_group);
10399        btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
10400                                  block_group->key.offset);
10401
10402        memcpy(&key, &block_group->key, sizeof(key));
10403        index = btrfs_bg_flags_to_raid_index(block_group->flags);
10404        if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
10405                                  BTRFS_BLOCK_GROUP_RAID1 |
10406                                  BTRFS_BLOCK_GROUP_RAID10))
10407                factor = 2;
10408        else
10409                factor = 1;
10410
10411        /* make sure this block group isn't part of an allocation cluster */
10412        cluster = &fs_info->data_alloc_cluster;
10413        spin_lock(&cluster->refill_lock);
10414        btrfs_return_cluster_to_free_space(block_group, cluster);
10415        spin_unlock(&cluster->refill_lock);
10416
10417        /*
10418         * make sure this block group isn't part of a metadata
10419         * allocation cluster
10420         */
10421        cluster = &fs_info->meta_alloc_cluster;
10422        spin_lock(&cluster->refill_lock);
10423        btrfs_return_cluster_to_free_space(block_group, cluster);
10424        spin_unlock(&cluster->refill_lock);
10425
10426        path = btrfs_alloc_path();
10427        if (!path) {
10428                ret = -ENOMEM;
10429                goto out;
10430        }
10431
10432        /*
10433         * get the inode first so any iput calls done for the io_list
10434         * aren't the final iput (no unlinks allowed now)
10435         */
10436        inode = lookup_free_space_inode(fs_info, block_group, path);
10437
10438        mutex_lock(&trans->transaction->cache_write_mutex);
10439        /*
10440         * Make sure our free space cache IO is done before removing the
10441         * free space inode.
10442         */
10443        spin_lock(&trans->transaction->dirty_bgs_lock);
10444        if (!list_empty(&block_group->io_list)) {
10445                list_del_init(&block_group->io_list);
10446
10447                WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10448
10449                spin_unlock(&trans->transaction->dirty_bgs_lock);
10450                btrfs_wait_cache_io(trans, block_group, path);
10451                btrfs_put_block_group(block_group);
10452                spin_lock(&trans->transaction->dirty_bgs_lock);
10453        }
10454
10455        if (!list_empty(&block_group->dirty_list)) {
10456                list_del_init(&block_group->dirty_list);
10457                btrfs_put_block_group(block_group);
10458        }
10459        spin_unlock(&trans->transaction->dirty_bgs_lock);
10460        mutex_unlock(&trans->transaction->cache_write_mutex);
10461
10462        if (!IS_ERR(inode)) {
10463                ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10464                if (ret) {
10465                        btrfs_add_delayed_iput(inode);
10466                        goto out;
10467                }
10468                clear_nlink(inode);
10469                /* One for the block groups ref */
10470                spin_lock(&block_group->lock);
10471                if (block_group->iref) {
10472                        block_group->iref = 0;
10473                        block_group->inode = NULL;
10474                        spin_unlock(&block_group->lock);
10475                        iput(inode);
10476                } else {
10477                        spin_unlock(&block_group->lock);
10478                }
10479                /* One for our lookup ref */
10480                btrfs_add_delayed_iput(inode);
10481        }
10482
10483        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10484        key.offset = block_group->key.objectid;
10485        key.type = 0;
10486
10487        ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10488        if (ret < 0)
10489                goto out;
10490        if (ret > 0)
10491                btrfs_release_path(path);
10492        if (ret == 0) {
10493                ret = btrfs_del_item(trans, tree_root, path);
10494                if (ret)
10495                        goto out;
10496                btrfs_release_path(path);
10497        }
10498
10499        spin_lock(&fs_info->block_group_cache_lock);
10500        rb_erase(&block_group->cache_node,
10501                 &fs_info->block_group_cache_tree);
10502        RB_CLEAR_NODE(&block_group->cache_node);
10503
10504        if (fs_info->first_logical_byte == block_group->key.objectid)
10505                fs_info->first_logical_byte = (u64)-1;
10506        spin_unlock(&fs_info->block_group_cache_lock);
10507
10508        down_write(&block_group->space_info->groups_sem);
10509        /*
10510         * we must use list_del_init so people can check to see if they
10511         * are still on the list after taking the semaphore
10512         */
10513        list_del_init(&block_group->list);
10514        if (list_empty(&block_group->space_info->block_groups[index])) {
10515                kobj = block_group->space_info->block_group_kobjs[index];
10516                block_group->space_info->block_group_kobjs[index] = NULL;
10517                clear_avail_alloc_bits(fs_info, block_group->flags);
10518        }
10519        up_write(&block_group->space_info->groups_sem);
10520        if (kobj) {
10521                kobject_del(kobj);
10522                kobject_put(kobj);
10523        }
10524
10525        if (block_group->has_caching_ctl)
10526                caching_ctl = get_caching_control(block_group);
10527        if (block_group->cached == BTRFS_CACHE_STARTED)
10528                wait_block_group_cache_done(block_group);
10529        if (block_group->has_caching_ctl) {
10530                down_write(&fs_info->commit_root_sem);
10531                if (!caching_ctl) {
10532                        struct btrfs_caching_control *ctl;
10533
10534                        list_for_each_entry(ctl,
10535                                    &fs_info->caching_block_groups, list)
10536                                if (ctl->block_group == block_group) {
10537                                        caching_ctl = ctl;
10538                                        refcount_inc(&caching_ctl->count);
10539                                        break;
10540                                }
10541                }
10542                if (caching_ctl)
10543                        list_del_init(&caching_ctl->list);
10544                up_write(&fs_info->commit_root_sem);
10545                if (caching_ctl) {
10546                        /* Once for the caching bgs list and once for us. */
10547                        put_caching_control(caching_ctl);
10548                        put_caching_control(caching_ctl);
10549                }
10550        }
10551
10552        spin_lock(&trans->transaction->dirty_bgs_lock);
10553        WARN_ON(!list_empty(&block_group->dirty_list));
10554        WARN_ON(!list_empty(&block_group->io_list));
10559        spin_unlock(&trans->transaction->dirty_bgs_lock);
10560        btrfs_remove_free_space_cache(block_group);
10561
10562        spin_lock(&block_group->space_info->lock);
10563        list_del_init(&block_group->ro_list);
10564
10565        if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
10566                WARN_ON(block_group->space_info->total_bytes
10567                        < block_group->key.offset);
10568                WARN_ON(block_group->space_info->bytes_readonly
10569                        < block_group->key.offset);
10570                WARN_ON(block_group->space_info->disk_total
10571                        < block_group->key.offset * factor);
10572        }
10573        block_group->space_info->total_bytes -= block_group->key.offset;
10574        block_group->space_info->bytes_readonly -= block_group->key.offset;
10575        block_group->space_info->disk_total -= block_group->key.offset * factor;
10576
10577        spin_unlock(&block_group->space_info->lock);
10578
10579        memcpy(&key, &block_group->key, sizeof(key));
10580
10581        mutex_lock(&fs_info->chunk_mutex);
10582        if (!list_empty(&em->list)) {
10583                /* We're in the transaction->pending_chunks list. */
10584                free_extent_map(em);
10585        }
10586        spin_lock(&block_group->lock);
10587        block_group->removed = 1;
10588        /*
10589         * At this point trimming can't start on this block group, because we
10590         * removed the block group from the tree fs_info->block_group_cache_tree
10591         * so no one can find it anymore, and even if someone already got this
10592         * block group before we removed it from the rbtree, they have already
10593         * incremented block_group->trimming - if they didn't, they won't find
10594         * any free space entries because we already removed them all when we
10595         * called btrfs_remove_free_space_cache().
10596         *
10597         * And we must not remove the extent map from the fs_info->mapping_tree
10598         * to prevent the same logical address range and physical device space
10599         * ranges from being reused for a new block group. This is because our
10600         * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10601         * completely transactionless, so while it is trimming a range the
10602         * currently running transaction might finish and a new one start,
10603         * allowing for new block groups to be created that can reuse the same
10604         * physical device locations unless we take this special care.
10605         *
10606         * There may also be an implicit trim operation if the file system
10607         * is mounted with -odiscard. The same protections must remain
10608         * in place until the extents have been discarded completely when
10609         * the transaction commit has completed.
10610         */
10611        remove_em = (atomic_read(&block_group->trimming) == 0);
10612        /*
10613         * Make sure a trimmer task always sees the em in the pinned_chunks list
10614         * if it sees block_group->removed == 1 (needs to lock block_group->lock
10615         * before checking block_group->removed).
10616         */
10617        if (!remove_em) {
10618                /*
10619                 * Our em might be in trans->transaction->pending_chunks which
10620                 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
10621                 * and so is the fs_info->pinned_chunks list.
10622                 *
10623                 * So at this point we must be holding the chunk_mutex to avoid
10624                 * any races with chunk allocation (more specifically at
10625                 * volumes.c:contains_pending_extent()), to ensure it always
10626                 * sees the em, either in the pending_chunks list or in the
10627                 * pinned_chunks list.
10628                 */
10629                list_move_tail(&em->list, &fs_info->pinned_chunks);
10630        }
10631        spin_unlock(&block_group->lock);
10632
10633        if (remove_em) {
10634                struct extent_map_tree *em_tree;
10635
10636                em_tree = &fs_info->mapping_tree.map_tree;
10637                write_lock(&em_tree->lock);
10638                /*
10639                 * The em might be in the pending_chunks list, so make sure the
10640                 * chunk mutex is locked, since remove_extent_mapping() will
10641                 * delete us from that list.
10642                 */
10643                remove_extent_mapping(em_tree, em);
10644                write_unlock(&em_tree->lock);
10645                /* once for the tree */
10646                free_extent_map(em);
10647        }
10648
10649        mutex_unlock(&fs_info->chunk_mutex);
10650
10651        ret = remove_block_group_free_space(trans, fs_info, block_group);
10652        if (ret)
10653                goto out;
10654
10655        btrfs_put_block_group(block_group);
10656        btrfs_put_block_group(block_group);
10657
10658        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10659        if (ret > 0)
10660                ret = -EIO;
10661        if (ret < 0)
10662                goto out;
10663
10664        ret = btrfs_del_item(trans, root, path);
10665out:
10666        btrfs_free_path(path);
10667        return ret;
10668}
10669
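/*
 * Start a transaction with enough metadata units reserved to remove the
 * block group at @chunk_offset (see the reservation breakdown below).
 */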
10670struct btrfs_trans_handle *
10671btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10672                                     const u64 chunk_offset)
10673{
10674        struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
10675        struct extent_map *em;
10676        struct map_lookup *map;
10677        unsigned int num_items;
10678
10679        read_lock(&em_tree->lock);
10680        em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10681        read_unlock(&em_tree->lock);
10682        ASSERT(em && em->start == chunk_offset);
10683
10684        /*
10685         * We need to reserve 3 + N units from the metadata space info in order
10686         * to remove a block group (done at btrfs_remove_chunk() and at
10687         * btrfs_remove_block_group()), which are used for:
10688         *
10689         * 1 unit for adding the free space inode's orphan (located in the tree
10690         * of tree roots).
10691         * 1 unit for deleting the block group item (located in the extent
10692         * tree).
10693         * 1 unit for deleting the free space item (located in tree of tree
10694         * roots).
10695         * N units for deleting N device extent items corresponding to each
10696         * stripe (located in the device tree).
10697         *
10698         * In order to remove a block group we also need to reserve units in the
10699         * system space info in order to update the chunk tree (update one or
10700         * more device items and remove one chunk item), but this is done at
10701         * btrfs_remove_chunk() through a call to check_system_chunk().
10702         */
10703        map = em->map_lookup;
10704        num_items = 3 + map->num_stripes;
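             /*
              * Illustrative example: a chunk with two stripes (e.g. RAID1
              * across two devices) needs 3 + 2 = 5 metadata units reserved
              * for its removal.
              */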
10705        free_extent_map(em);
10706
10707        return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10708                                                           num_items, 1);
10709}
10710
10711/*
10712 * Process the unused_bgs list and remove any block groups that no longer
10713 * have any allocated space in them.
10714 */
10715void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10716{
10717        struct btrfs_block_group_cache *block_group;
10718        struct btrfs_space_info *space_info;
10719        struct btrfs_trans_handle *trans;
10720        int ret = 0;
10721
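             /* Nothing to do if the filesystem is not fully open or is being shut down. */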
10722        if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
10723                return;
10724
10725        spin_lock(&fs_info->unused_bgs_lock);
10726        while (!list_empty(&fs_info->unused_bgs)) {
10727                u64 start, end;
10728                int trimming;
10729
10730                block_group = list_first_entry(&fs_info->unused_bgs,
10731                                               struct btrfs_block_group_cache,
10732                                               bg_list);
10733                list_del_init(&block_group->bg_list);
10734
10735                space_info = block_group->space_info;
10736
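                     /*
                      * Skip this block group if a previous iteration failed or
                      * if it belongs to a mixed space info (data and metadata
                      * sharing the same block groups); those are never
                      * auto-removed here.
                      */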
10737                if (ret || btrfs_mixed_space_info(space_info)) {
10738                        btrfs_put_block_group(block_group);
10739                        continue;
10740                }
10741                spin_unlock(&fs_info->unused_bgs_lock);
10742
10743                mutex_lock(&fs_info->delete_unused_bgs_mutex);
10744
10745                /* Don't want to race with allocators so take the groups_sem */
10746                down_write(&space_info->groups_sem);
10747                spin_lock(&block_group->lock);
10748                if (block_group->reserved ||
10749                    btrfs_block_group_used(&block_group->item) ||
10750                    block_group->ro ||
10751                    list_is_singular(&block_group->list)) {
10752                        /*
10753                         * We want to bail if we made new allocations or have
10754                         * outstanding allocations in this block group.  We do
10755                         * the ro check in case balance is currently acting on
10756                         * this block group.
10757                         */
10758                        spin_unlock(&block_group->lock);
10759                        up_write(&space_info->groups_sem);
10760                        goto next;
10761                }
10762                spin_unlock(&block_group->lock);
10763
10764                /* Don't force the issue, only set the block group read-only if it's ok. */
10765                ret = inc_block_group_ro(block_group, 0);
10766                up_write(&space_info->groups_sem);
10767                if (ret < 0) {
10768                        ret = 0;
10769                        goto next;
10770                }
10771
10772                /*
10773                 * We want to start the transaction before we do anything
10774                 * else, so we can recover properly if starting it fails.
10775                 */
10776                trans = btrfs_start_trans_remove_block_group(fs_info,
10777                                                     block_group->key.objectid);
10778                if (IS_ERR(trans)) {
10779                        btrfs_dec_block_group_ro(block_group);
10780                        ret = PTR_ERR(trans);
10781                        goto next;
10782                }
10783
10784                /*
10785                 * We could have pending pinned extents for this block group;
10786                 * just clear them, we don't care about them anymore.
10787                 */
10788                start = block_group->key.objectid;
10789                end = start + block_group->key.offset - 1;
10790                /*
10791                 * Hold the unused_bg_unpin_mutex lock to avoid racing with
10792                 * btrfs_finish_extent_commit(). If we are at transaction N,
10793                 * another task might be running finish_extent_commit() for the
10794                 * previous transaction N - 1, and have seen a range belonging
10795                 * to the block group in freed_extents[] before we were able to
10796                 * clear the whole block group range from freed_extents[]. This
10797                 * means that task could look up the block group after we
10798                 * unpinned it from freed_extents[] and removed it, leading to
10799                 * a BUG_ON() at btrfs_unpin_extent_range().
10800                 */
10801                mutex_lock(&fs_info->unused_bg_unpin_mutex);
10802                ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
10803                                  EXTENT_DIRTY);
10804                if (ret) {
10805                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10806                        btrfs_dec_block_group_ro(block_group);
10807                        goto end_trans;
10808                }
10809                ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
10810                                  EXTENT_DIRTY);
10811                if (ret) {
10812                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10813                        btrfs_dec_block_group_ro(block_group);
10814                        goto end_trans;
10815                }
10816                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
10817
10818                /* Reset pinned so btrfs_put_block_group doesn't complain */
10819                spin_lock(&space_info->lock);
10820                spin_lock(&block_group->lock);
10821
10822                space_info->bytes_pinned -= block_group->pinned;
10823                space_info->bytes_readonly += block_group->pinned;
10824                percpu_counter_add(&space_info->total_bytes_pinned,
10825                                   -block_group->pinned);
10826                block_group->pinned = 0;
10827
10828                spin_unlock(&block_group->lock);
10829                spin_unlock(&space_info->lock);
10830
10831                /* DISCARD can flip during remount */
10832                trimming = btrfs_test_opt(fs_info, DISCARD);
10833
10834                /* Implicit trim during transaction commit. */
10835                if (trimming)
10836                        btrfs_get_block_group_trimming(block_group);
10837
10838                /*
10839                 * btrfs_remove_chunk() will abort the transaction if things
10840                 * go horribly wrong.
10841                 */
10842                ret = btrfs_remove_chunk(trans, fs_info,
10843                                         block_group->key.objectid);
10844
10845                if (ret) {
10846                        if (trimming)
10847                                btrfs_put_block_group_trimming(block_group);
10848                        goto end_trans;
10849                }
10850
10851                /*
10852                 * If we're not mounted with -odiscard, we can just forget
10853                 * about this block group. Otherwise we'll need to wait
10854                 * until transaction commit to do the actual discard.
10855                 */
10856                if (trimming) {
10857                        spin_lock(&fs_info->unused_bgs_lock);
10858                        /*
10859                         * A concurrent scrub might have added us to the list
10860                         * fs_info->unused_bgs, so use a list_move operation
10861                         * to add the block group to the deleted_bgs list.
10862                         */
10863                        list_move(&block_group->bg_list,
10864                                  &trans->transaction->deleted_bgs);
10865                        spin_unlock(&fs_info->unused_bgs_lock);
10866                        btrfs_get_block_group(block_group);
10867                }
10868end_trans:
10869                btrfs_end_transaction(trans);
10870next:
10871                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
10872                btrfs_put_block_group(block_group);
10873                spin_lock(&fs_info->unused_bgs_lock);
10874        }
10875        spin_unlock(&fs_info->unused_bgs_lock);
10876}
10877
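     /*
      * Create the basic space info structures: SYSTEM, plus either a single
      * mixed METADATA|DATA space info (when the MIXED_GROUPS incompat feature
      * is set) or separate METADATA and DATA space infos.
      */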
10878int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
10879{
10880        struct btrfs_space_info *space_info;
10881        struct btrfs_super_block *disk_super;
10882        u64 features;
10883        u64 flags;
10884        int mixed = 0;
10885        int ret;
10886
10887        disk_super = fs_info->super_copy;
10888        if (!btrfs_super_root(disk_super))
10889                return -EINVAL;
10890
10891        features = btrfs_super_incompat_flags(disk_super);
10892        if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
10893                mixed = 1;
10894
10895        flags = BTRFS_BLOCK_GROUP_SYSTEM;
10896        ret = create_space_info(fs_info, flags, &space_info);
10897        if (ret)
10898                goto out;
10899
10900        if (mixed) {
10901                flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
10902                ret = create_space_info(fs_info, flags, &space_info);
10903        } else {
10904                flags = BTRFS_BLOCK_GROUP_METADATA;
10905                ret = create_space_info(fs_info, flags, &space_info);
10906                if (ret)
10907                        goto out;
10908
10909                flags = BTRFS_BLOCK_GROUP_DATA;
10910                ret = create_space_info(fs_info, flags, &space_info);
10911        }
10912out:
10913        return ret;
10914}
10915
10916int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
10917                                   u64 start, u64 end)
10918{
10919        return unpin_extent_range(fs_info, start, end, false);
10920}
10921
10922/*
10923 * It used to be that old block groups would be left around forever.
10924 * Iterating over them would be enough to trim unused space.  Since we
10925 * now automatically remove them, we also need to iterate over unallocated
10926 * space.
10927 *
10928 * We don't want a transaction for this since the discard may take a
10929 * substantial amount of time.  We don't require that a transaction be
10930 * running, but we do need to take a running transaction into account
10931 * to ensure that we're not discarding chunks that were released in
10932 * the current transaction.
10933 *
10934 * Holding the chunk mutex will prevent other threads from allocating
10935 * or releasing chunks, but it won't prevent a running transaction
10936 * from committing and releasing the memory that the pending chunks
10937 * list head uses.  For that, we need to take a reference to the
10938 * transaction.
10939 */
10940static int btrfs_trim_free_extents(struct btrfs_device *device,
10941                                   u64 minlen, u64 *trimmed)
10942{
10943        u64 start = 0, len = 0;
10944        int ret;
10945
10946        *trimmed = 0;
10947
10948        /* Not writeable = nothing to do. */
10949        if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
10950                return 0;
10951
10952        /* No free space = nothing to do. */
10953        if (device->total_bytes <= device->bytes_used)
10954                return 0;
10955
10956        ret = 0;
10957
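             /*
              * Walk the device's unallocated space one hole at a time: find
              * the next free device extent at or after 'start' and discard it.
              */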
10958        while (1) {
10959                struct btrfs_fs_info *fs_info = device->fs_info;
10960                struct btrfs_transaction *trans;
10961                u64 bytes;
10962
10963                ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
10964                if (ret)
10965                        return ret;
10966
10967                down_read(&fs_info->commit_root_sem);
10968
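                     /*
                      * Take a reference on the currently running transaction
                      * (if any) so that the free extent search below can
                      * account for its pending chunk allocations, as explained
                      * in the comment above this function.
                      */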
10969                spin_lock(&fs_info->trans_lock);
10970                trans = fs_info->running_transaction;
10971                if (trans)
10972                        refcount_inc(&trans->use_count);
10973                spin_unlock(&fs_info->trans_lock);
10974
10975                ret = find_free_dev_extent_start(trans, device, minlen, start,
10976                                                 &start, &len);
10977                if (trans)
10978                        btrfs_put_transaction(trans);
10979
10980                if (ret) {
10981                        up_read(&fs_info->commit_root_sem);
10982                        mutex_unlock(&fs_info->chunk_mutex);
10983                        if (ret == -ENOSPC)
10984                                ret = 0;
10985                        break;
10986                }
10987
10988                ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
10989                up_read(&fs_info->commit_root_sem);
10990                mutex_unlock(&fs_info->chunk_mutex);
10991
10992                if (ret)
10993                        break;
10994
10995                start += len;
10996                *trimmed += bytes;
10997
10998                if (fatal_signal_pending(current)) {
10999                        ret = -ERESTARTSYS;
11000                        break;
11001                }
11002
11003                cond_resched();
11004        }
11005
11006        return ret;
11007}
11008
11009int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
11010{
11011        struct btrfs_block_group_cache *cache = NULL;
11012        struct btrfs_device *device;
11013        struct list_head *devices;
11014        u64 group_trimmed;
11015        u64 start;
11016        u64 end;
11017        u64 trimmed = 0;
11018        u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
11019        int ret = 0;
11020
11021        /*
11022         * Try to trim all FS space; our first block group may start at a non-zero offset.
11023         */
11024        if (range->len == total_bytes)
11025                cache = btrfs_lookup_first_block_group(fs_info, range->start);
11026        else
11027                cache = btrfs_lookup_block_group(fs_info, range->start);
11028
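             /*
              * First trim the free space of every block group that overlaps
              * the requested range; unallocated device space is trimmed
              * afterwards.
              */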
11029        while (cache) {
11030                if (cache->key.objectid >= (range->start + range->len)) {
11031                        btrfs_put_block_group(cache);
11032                        break;
11033                }
11034
11035                start = max(range->start, cache->key.objectid);
11036                end = min(range->start + range->len,
11037                                cache->key.objectid + cache->key.offset);
11038
11039                if (end - start >= range->minlen) {
11040                        if (!block_group_cache_done(cache)) {
11041                                ret = cache_block_group(cache, 0);
11042                                if (ret) {
11043                                        btrfs_put_block_group(cache);
11044                                        break;
11045                                }
11046                                ret = wait_block_group_cache_done(cache);
11047                                if (ret) {
11048                                        btrfs_put_block_group(cache);
11049                                        break;
11050                                }
11051                        }
11052                        ret = btrfs_trim_block_group(cache,
11053                                                     &group_trimmed,
11054                                                     start,
11055                                                     end,
11056                                                     range->minlen);
11057
11058                        trimmed += group_trimmed;
11059                        if (ret) {
11060                                btrfs_put_block_group(cache);
11061                                break;
11062                        }
11063                }
11064
11065                cache = next_block_group(fs_info, cache);
11066        }
11067
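             /* Now trim the unallocated space on each device in the alloc list. */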
11068        mutex_lock(&fs_info->fs_devices->device_list_mutex);
11069        devices = &fs_info->fs_devices->alloc_list;
11070        list_for_each_entry(device, devices, dev_alloc_list) {
11071                ret = btrfs_trim_free_extents(device, range->minlen,
11072                                              &group_trimmed);
11073                if (ret)
11074                        break;
11075
11076                trimmed += group_trimmed;
11077        }
11078        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
11079
11080        range->len = trimmed;
11081        return ret;
11082}
11083
11084/*
11085 * btrfs_{start,end}_write_no_snapshotting() are similar to
11086 * mnt_{want,drop}_write(). They are used to prevent some tasks from writing
11087 * data into the page cache through nocow before the subvolume is snapshotted
11088 * and only flushing it to disk after the snapshot is created, and to prevent
11089 * operations, while snapshotting is ongoing, that would cause the snapshot
11090 * to be inconsistent (writes followed by expanding truncates, for example).
11091 */
11092void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
11093{
11094        percpu_counter_dec(&root->subv_writers->counter);
11095        /*
11096         * Make sure counter is updated before we wake up waiters.
11097         */
11098        smp_mb();
11099        if (waitqueue_active(&root->subv_writers->wait))
11100                wake_up(&root->subv_writers->wait);
11101}
11102
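     /*
      * Returns 0 if the caller must not proceed with a NOCOW write because a
      * snapshot of the root is about to be created, and 1 if it may proceed;
      * in the latter case the caller must pair this with a call to
      * btrfs_end_write_no_snapshotting().
      */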
11103int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
11104{
11105        if (atomic_read(&root->will_be_snapshotted))
11106                return 0;
11107
11108        percpu_counter_inc(&root->subv_writers->counter);
11109        /*
11110         * Make sure counter is updated before we check for snapshot creation.
11111         */
11112        smp_mb();
11113        if (atomic_read(&root->will_be_snapshotted)) {
11114                btrfs_end_write_no_snapshotting(root);
11115                return 0;
11116        }
11117        return 1;
11118}
11119
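     /*
      * Block until no snapshot of the root is pending: loop until
      * btrfs_start_write_no_snapshotting() succeeds, sleeping while
      * root->will_be_snapshotted is set.
      */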
11120void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
11121{
11122        while (true) {
11123                int ret;
11124
11125                ret = btrfs_start_write_no_snapshotting(root);
11126                if (ret)
11127                        break;
11128                wait_var_event(&root->will_be_snapshotted,
11129                               !atomic_read(&root->will_be_snapshotted));
11130        }
11131}
11132