linux/fs/btrfs/extent-tree.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2007 Oracle.  All rights reserved.
   4 */
   5
   6#include <linux/sched.h>
   7#include <linux/sched/signal.h>
   8#include <linux/pagemap.h>
   9#include <linux/writeback.h>
  10#include <linux/blkdev.h>
  11#include <linux/sort.h>
  12#include <linux/rcupdate.h>
  13#include <linux/kthread.h>
  14#include <linux/slab.h>
  15#include <linux/ratelimit.h>
  16#include <linux/percpu_counter.h>
  17#include <linux/lockdep.h>
  18#include <linux/crc32c.h>
  19#include "tree-log.h"
  20#include "disk-io.h"
  21#include "print-tree.h"
  22#include "volumes.h"
  23#include "raid56.h"
  24#include "locking.h"
  25#include "free-space-cache.h"
  26#include "free-space-tree.h"
  27#include "math.h"
  28#include "sysfs.h"
  29#include "qgroup.h"
  30#include "ref-verify.h"
  31
  32#undef SCRAMBLE_DELAYED_REFS
  33
  34/*
  35 * control flags for do_chunk_alloc's force field
  36 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
  37 * if we really need one.
  38 *
  39 * CHUNK_ALLOC_LIMITED means to only try to allocate one
  40 * if we have very few chunks already allocated.  This is
  41 * used as part of the clustering code to help make sure
  42 * we have a good pool of storage to cluster in, without
  43 * filling the FS with empty chunks
  44 *
  45 * CHUNK_ALLOC_FORCE means it must try to allocate one
  46 *
  47 */
  48enum {
  49        CHUNK_ALLOC_NO_FORCE = 0,
  50        CHUNK_ALLOC_LIMITED = 1,
  51        CHUNK_ALLOC_FORCE = 2,
  52};
  53
  54/*
  55 * Declare a helper function to detect underflow of various space info members
  56 */
  57#define DECLARE_SPACE_INFO_UPDATE(name)                                 \
  58static inline void update_##name(struct btrfs_space_info *sinfo,        \
  59                                 s64 bytes)                             \
  60{                                                                       \
  61        if (bytes < 0 && sinfo->name < -bytes) {                        \
  62                WARN_ON(1);                                             \
  63                sinfo->name = 0;                                        \
  64                return;                                                 \
  65        }                                                               \
  66        sinfo->name += bytes;                                           \
  67}
  68
  69DECLARE_SPACE_INFO_UPDATE(bytes_may_use);
  70DECLARE_SPACE_INFO_UPDATE(bytes_pinned);
  71
  72static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
  73                               struct btrfs_delayed_ref_node *node, u64 parent,
  74                               u64 root_objectid, u64 owner_objectid,
  75                               u64 owner_offset, int refs_to_drop,
  76                               struct btrfs_delayed_extent_op *extra_op);
  77static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
  78                                    struct extent_buffer *leaf,
  79                                    struct btrfs_extent_item *ei);
  80static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
  81                                      u64 parent, u64 root_objectid,
  82                                      u64 flags, u64 owner, u64 offset,
  83                                      struct btrfs_key *ins, int ref_mod);
  84static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
  85                                     struct btrfs_delayed_ref_node *node,
  86                                     struct btrfs_delayed_extent_op *extent_op);
  87static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
  88                          int force);
  89static int find_next_key(struct btrfs_path *path, int level,
  90                         struct btrfs_key *key);
  91static void dump_space_info(struct btrfs_fs_info *fs_info,
  92                            struct btrfs_space_info *info, u64 bytes,
  93                            int dump_block_groups);
  94static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
  95                               u64 num_bytes);
  96static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
  97                                     struct btrfs_space_info *space_info,
  98                                     u64 num_bytes);
  99static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
 100                                     struct btrfs_space_info *space_info,
 101                                     u64 num_bytes);
 102
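    /*
     * Return non-zero once free space caching for this block group has
     * completed, whether it finished successfully or aborted with an error.
     */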
 103static noinline int
 104block_group_cache_done(struct btrfs_block_group_cache *cache)
 105{
 106        smp_mb();
 107        return cache->cached == BTRFS_CACHE_FINISHED ||
 108                cache->cached == BTRFS_CACHE_ERROR;
 109}
 110
 111static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 112{
 113        return (cache->flags & bits) == bits;
 114}
 115
 116void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 117{
 118        atomic_inc(&cache->count);
 119}
 120
 121void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 122{
 123        if (atomic_dec_and_test(&cache->count)) {
 124                WARN_ON(cache->pinned > 0);
 125                WARN_ON(cache->reserved > 0);
 126
 127                /*
 128                 * If not empty, someone is still holding the mutex of
 129                 * full_stripe_lock, which can only be released by the
 130                 * caller.  That would cause a use-after-free when the
 131                 * caller tries to release the full stripe lock.
 132                 *
 133                 * There is no better way to resolve this, so just warn.
 134                 */
 135                WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
 136                kfree(cache->free_space_ctl);
 137                kfree(cache);
 138        }
 139}
 140
 141/*
 142 * this adds the block group to the fs_info rb tree for the block group
 143 * cache
 144 */
 145static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 146                                struct btrfs_block_group_cache *block_group)
 147{
 148        struct rb_node **p;
 149        struct rb_node *parent = NULL;
 150        struct btrfs_block_group_cache *cache;
 151
 152        spin_lock(&info->block_group_cache_lock);
 153        p = &info->block_group_cache_tree.rb_node;
 154
 155        while (*p) {
 156                parent = *p;
 157                cache = rb_entry(parent, struct btrfs_block_group_cache,
 158                                 cache_node);
 159                if (block_group->key.objectid < cache->key.objectid) {
 160                        p = &(*p)->rb_left;
 161                } else if (block_group->key.objectid > cache->key.objectid) {
 162                        p = &(*p)->rb_right;
 163                } else {
 164                        spin_unlock(&info->block_group_cache_lock);
 165                        return -EEXIST;
 166                }
 167        }
 168
 169        rb_link_node(&block_group->cache_node, parent, p);
 170        rb_insert_color(&block_group->cache_node,
 171                        &info->block_group_cache_tree);
 172
 173        if (info->first_logical_byte > block_group->key.objectid)
 174                info->first_logical_byte = block_group->key.objectid;
 175
 176        spin_unlock(&info->block_group_cache_lock);
 177
 178        return 0;
 179}
 180
 181/*
 182 * This will return the block group at or after bytenr if contains is 0, else
 183 * it will return the block group that contains the bytenr
 184 */
 185static struct btrfs_block_group_cache *
 186block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
 187                              int contains)
 188{
 189        struct btrfs_block_group_cache *cache, *ret = NULL;
 190        struct rb_node *n;
 191        u64 end, start;
 192
 193        spin_lock(&info->block_group_cache_lock);
 194        n = info->block_group_cache_tree.rb_node;
 195
 196        while (n) {
 197                cache = rb_entry(n, struct btrfs_block_group_cache,
 198                                 cache_node);
 199                end = cache->key.objectid + cache->key.offset - 1;
 200                start = cache->key.objectid;
 201
 202                if (bytenr < start) {
 203                        if (!contains && (!ret || start < ret->key.objectid))
 204                                ret = cache;
 205                        n = n->rb_left;
 206                } else if (bytenr > start) {
 207                        if (contains && bytenr <= end) {
 208                                ret = cache;
 209                                break;
 210                        }
 211                        n = n->rb_right;
 212                } else {
 213                        ret = cache;
 214                        break;
 215                }
 216        }
 217        if (ret) {
 218                btrfs_get_block_group(ret);
 219                if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
 220                        info->first_logical_byte = ret->key.objectid;
 221        }
 222        spin_unlock(&info->block_group_cache_lock);
 223
 224        return ret;
 225}
 226
 227static int add_excluded_extent(struct btrfs_fs_info *fs_info,
 228                               u64 start, u64 num_bytes)
 229{
 230        u64 end = start + num_bytes - 1;
 231        set_extent_bits(&fs_info->freed_extents[0],
 232                        start, end, EXTENT_UPTODATE);
 233        set_extent_bits(&fs_info->freed_extents[1],
 234                        start, end, EXTENT_UPTODATE);
 235        return 0;
 236}
 237
 238static void free_excluded_extents(struct btrfs_block_group_cache *cache)
 239{
 240        struct btrfs_fs_info *fs_info = cache->fs_info;
 241        u64 start, end;
 242
 243        start = cache->key.objectid;
 244        end = start + cache->key.offset - 1;
 245
 246        clear_extent_bits(&fs_info->freed_extents[0],
 247                          start, end, EXTENT_UPTODATE);
 248        clear_extent_bits(&fs_info->freed_extents[1],
 249                          start, end, EXTENT_UPTODATE);
 250}
 251
 252static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
 253{
 254        struct btrfs_fs_info *fs_info = cache->fs_info;
 255        u64 bytenr;
 256        u64 *logical;
 257        int stripe_len;
 258        int i, nr, ret;
 259
 260        if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
 261                stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
 262                cache->bytes_super += stripe_len;
 263                ret = add_excluded_extent(fs_info, cache->key.objectid,
 264                                          stripe_len);
 265                if (ret)
 266                        return ret;
 267        }
 268
 269        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 270                bytenr = btrfs_sb_offset(i);
 271                ret = btrfs_rmap_block(fs_info, cache->key.objectid,
 272                                       bytenr, &logical, &nr, &stripe_len);
 273                if (ret)
 274                        return ret;
 275
 276                while (nr--) {
 277                        u64 start, len;
 278
 279                        if (logical[nr] > cache->key.objectid +
 280                            cache->key.offset)
 281                                continue;
 282
 283                        if (logical[nr] + stripe_len <= cache->key.objectid)
 284                                continue;
 285
 286                        start = logical[nr];
 287                        if (start < cache->key.objectid) {
 288                                start = cache->key.objectid;
 289                                len = (logical[nr] + stripe_len) - start;
 290                        } else {
 291                                len = min_t(u64, stripe_len,
 292                                            cache->key.objectid +
 293                                            cache->key.offset - start);
 294                        }
 295
 296                        cache->bytes_super += len;
 297                        ret = add_excluded_extent(fs_info, start, len);
 298                        if (ret) {
 299                                kfree(logical);
 300                                return ret;
 301                        }
 302                }
 303
 304                kfree(logical);
 305        }
 306        return 0;
 307}
 308
 309static struct btrfs_caching_control *
 310get_caching_control(struct btrfs_block_group_cache *cache)
 311{
 312        struct btrfs_caching_control *ctl;
 313
 314        spin_lock(&cache->lock);
 315        if (!cache->caching_ctl) {
 316                spin_unlock(&cache->lock);
 317                return NULL;
 318        }
 319
 320        ctl = cache->caching_ctl;
 321        refcount_inc(&ctl->count);
 322        spin_unlock(&cache->lock);
 323        return ctl;
 324}
 325
 326static void put_caching_control(struct btrfs_caching_control *ctl)
 327{
 328        if (refcount_dec_and_test(&ctl->count))
 329                kfree(ctl);
 330}
 331
 332#ifdef CONFIG_BTRFS_DEBUG
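    /*
     * Debug-only helper used when free space fragmenting is enabled: punch
     * out every other chunk-sized range of free space so allocations from
     * this block group are forced to be fragmented.
     */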
 333static void fragment_free_space(struct btrfs_block_group_cache *block_group)
 334{
 335        struct btrfs_fs_info *fs_info = block_group->fs_info;
 336        u64 start = block_group->key.objectid;
 337        u64 len = block_group->key.offset;
 338        u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
 339                fs_info->nodesize : fs_info->sectorsize;
 340        u64 step = chunk << 1;
 341
 342        while (len > chunk) {
 343                btrfs_remove_free_space(block_group, start, chunk);
 344                start += step;
 345                if (len < step)
 346                        len = 0;
 347                else
 348                        len -= step;
 349        }
 350}
 351#endif
 352
 353/*
 354 * This is only called by cache_block_group.  Since we could have freed
 355 * extents, we need to check the pinned_extents for any extents that can't
 356 * be used yet, since their free space will be released as soon as the transaction commits.
 357 */
 358u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 359                       u64 start, u64 end)
 360{
 361        struct btrfs_fs_info *info = block_group->fs_info;
 362        u64 extent_start, extent_end, size, total_added = 0;
 363        int ret;
 364
 365        while (start < end) {
 366                ret = find_first_extent_bit(info->pinned_extents, start,
 367                                            &extent_start, &extent_end,
 368                                            EXTENT_DIRTY | EXTENT_UPTODATE,
 369                                            NULL);
 370                if (ret)
 371                        break;
 372
 373                if (extent_start <= start) {
 374                        start = extent_end + 1;
 375                } else if (extent_start > start && extent_start < end) {
 376                        size = extent_start - start;
 377                        total_added += size;
 378                        ret = btrfs_add_free_space(block_group, start,
 379                                                   size);
 380                        BUG_ON(ret); /* -ENOMEM or logic error */
 381                        start = extent_end + 1;
 382                } else {
 383                        break;
 384                }
 385        }
 386
 387        if (start < end) {
 388                size = end - start;
 389                total_added += size;
 390                ret = btrfs_add_free_space(block_group, start, size);
 391                BUG_ON(ret); /* -ENOMEM or logic error */
 392        }
 393
 394        return total_added;
 395}
 396
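    /*
     * Walk the commit root of the extent tree and add the gaps between
     * allocated extents inside this block group to the free space cache.
     * The commit_root_sem and the caching mutex are dropped and retaken
     * periodically so we don't hold up transaction commit for too long.
     */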
 397static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 398{
 399        struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
 400        struct btrfs_fs_info *fs_info = block_group->fs_info;
 401        struct btrfs_root *extent_root = fs_info->extent_root;
 402        struct btrfs_path *path;
 403        struct extent_buffer *leaf;
 404        struct btrfs_key key;
 405        u64 total_found = 0;
 406        u64 last = 0;
 407        u32 nritems;
 408        int ret;
 409        bool wakeup = true;
 410
 411        path = btrfs_alloc_path();
 412        if (!path)
 413                return -ENOMEM;
 414
 415        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 416
 417#ifdef CONFIG_BTRFS_DEBUG
 418        /*
 419         * If we're fragmenting we don't want to make anybody think we can
 420         * allocate from this block group until we've had a chance to fragment
 421         * the free space.
 422         */
 423        if (btrfs_should_fragment_free_space(block_group))
 424                wakeup = false;
 425#endif
 426        /*
 427         * We don't want to deadlock with somebody trying to allocate a new
 428         * extent for the extent root while also trying to search the extent
 429         * root to add free space.  So we skip locking and search the commit
 430         * root, since it's read-only.
 431         */
 432        path->skip_locking = 1;
 433        path->search_commit_root = 1;
 434        path->reada = READA_FORWARD;
 435
 436        key.objectid = last;
 437        key.offset = 0;
 438        key.type = BTRFS_EXTENT_ITEM_KEY;
 439
 440next:
 441        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 442        if (ret < 0)
 443                goto out;
 444
 445        leaf = path->nodes[0];
 446        nritems = btrfs_header_nritems(leaf);
 447
 448        while (1) {
 449                if (btrfs_fs_closing(fs_info) > 1) {
 450                        last = (u64)-1;
 451                        break;
 452                }
 453
 454                if (path->slots[0] < nritems) {
 455                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 456                } else {
 457                        ret = find_next_key(path, 0, &key);
 458                        if (ret)
 459                                break;
 460
 461                        if (need_resched() ||
 462                            rwsem_is_contended(&fs_info->commit_root_sem)) {
 463                                if (wakeup)
 464                                        caching_ctl->progress = last;
 465                                btrfs_release_path(path);
 466                                up_read(&fs_info->commit_root_sem);
 467                                mutex_unlock(&caching_ctl->mutex);
 468                                cond_resched();
 469                                mutex_lock(&caching_ctl->mutex);
 470                                down_read(&fs_info->commit_root_sem);
 471                                goto next;
 472                        }
 473
 474                        ret = btrfs_next_leaf(extent_root, path);
 475                        if (ret < 0)
 476                                goto out;
 477                        if (ret)
 478                                break;
 479                        leaf = path->nodes[0];
 480                        nritems = btrfs_header_nritems(leaf);
 481                        continue;
 482                }
 483
 484                if (key.objectid < last) {
 485                        key.objectid = last;
 486                        key.offset = 0;
 487                        key.type = BTRFS_EXTENT_ITEM_KEY;
 488
 489                        if (wakeup)
 490                                caching_ctl->progress = last;
 491                        btrfs_release_path(path);
 492                        goto next;
 493                }
 494
 495                if (key.objectid < block_group->key.objectid) {
 496                        path->slots[0]++;
 497                        continue;
 498                }
 499
 500                if (key.objectid >= block_group->key.objectid +
 501                    block_group->key.offset)
 502                        break;
 503
 504                if (key.type == BTRFS_EXTENT_ITEM_KEY ||
 505                    key.type == BTRFS_METADATA_ITEM_KEY) {
 506                        total_found += add_new_free_space(block_group, last,
 507                                                          key.objectid);
 508                        if (key.type == BTRFS_METADATA_ITEM_KEY)
 509                                last = key.objectid +
 510                                        fs_info->nodesize;
 511                        else
 512                                last = key.objectid + key.offset;
 513
 514                        if (total_found > CACHING_CTL_WAKE_UP) {
 515                                total_found = 0;
 516                                if (wakeup)
 517                                        wake_up(&caching_ctl->wait);
 518                        }
 519                }
 520                path->slots[0]++;
 521        }
 522        ret = 0;
 523
 524        total_found += add_new_free_space(block_group, last,
 525                                          block_group->key.objectid +
 526                                          block_group->key.offset);
 527        caching_ctl->progress = (u64)-1;
 528
 529out:
 530        btrfs_free_path(path);
 531        return ret;
 532}
 533
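    /*
     * Worker that populates a block group's free space cache, either from
     * the free space tree or by scanning the extent tree, and then marks
     * the block group BTRFS_CACHE_FINISHED (or BTRFS_CACHE_ERROR on failure).
     */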
 534static noinline void caching_thread(struct btrfs_work *work)
 535{
 536        struct btrfs_block_group_cache *block_group;
 537        struct btrfs_fs_info *fs_info;
 538        struct btrfs_caching_control *caching_ctl;
 539        int ret;
 540
 541        caching_ctl = container_of(work, struct btrfs_caching_control, work);
 542        block_group = caching_ctl->block_group;
 543        fs_info = block_group->fs_info;
 544
 545        mutex_lock(&caching_ctl->mutex);
 546        down_read(&fs_info->commit_root_sem);
 547
 548        if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
 549                ret = load_free_space_tree(caching_ctl);
 550        else
 551                ret = load_extent_tree_free(caching_ctl);
 552
 553        spin_lock(&block_group->lock);
 554        block_group->caching_ctl = NULL;
 555        block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
 556        spin_unlock(&block_group->lock);
 557
 558#ifdef CONFIG_BTRFS_DEBUG
 559        if (btrfs_should_fragment_free_space(block_group)) {
 560                u64 bytes_used;
 561
 562                spin_lock(&block_group->space_info->lock);
 563                spin_lock(&block_group->lock);
 564                bytes_used = block_group->key.offset -
 565                        btrfs_block_group_used(&block_group->item);
 566                block_group->space_info->bytes_used += bytes_used >> 1;
 567                spin_unlock(&block_group->lock);
 568                spin_unlock(&block_group->space_info->lock);
 569                fragment_free_space(block_group);
 570        }
 571#endif
 572
 573        caching_ctl->progress = (u64)-1;
 574
 575        up_read(&fs_info->commit_root_sem);
 576        free_excluded_extents(block_group);
 577        mutex_unlock(&caching_ctl->mutex);
 578
 579        wake_up(&caching_ctl->wait);
 580
 581        put_caching_control(caching_ctl);
 582        btrfs_put_block_group(block_group);
 583}
 584
 585static int cache_block_group(struct btrfs_block_group_cache *cache,
 586                             int load_cache_only)
 587{
 588        DEFINE_WAIT(wait);
 589        struct btrfs_fs_info *fs_info = cache->fs_info;
 590        struct btrfs_caching_control *caching_ctl;
 591        int ret = 0;
 592
 593        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
 594        if (!caching_ctl)
 595                return -ENOMEM;
 596
 597        INIT_LIST_HEAD(&caching_ctl->list);
 598        mutex_init(&caching_ctl->mutex);
 599        init_waitqueue_head(&caching_ctl->wait);
 600        caching_ctl->block_group = cache;
 601        caching_ctl->progress = cache->key.objectid;
 602        refcount_set(&caching_ctl->count, 1);
 603        btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
 604                        caching_thread, NULL, NULL);
 605
 606        spin_lock(&cache->lock);
 607        /*
 608         * This should be a rare occasion, but this could happen I think in the
 609         * case where one thread starts to load the space cache info, and then
 610         * some other thread starts a transaction commit which tries to do an
 611         * allocation while the other thread is still loading the space cache
 612         * info.  The previous loop should have kept us from choosing this block
 613         * group, but if we've moved to the state where we will wait on caching
 614         * block groups we need to first check if we're doing a fast load here,
 615         * so we can wait for it to finish, otherwise we could end up allocating
 616         * from a block group whose cache gets evicted for one reason or
 617         * another.
 618         */
 619        while (cache->cached == BTRFS_CACHE_FAST) {
 620                struct btrfs_caching_control *ctl;
 621
 622                ctl = cache->caching_ctl;
 623                refcount_inc(&ctl->count);
 624                prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
 625                spin_unlock(&cache->lock);
 626
 627                schedule();
 628
 629                finish_wait(&ctl->wait, &wait);
 630                put_caching_control(ctl);
 631                spin_lock(&cache->lock);
 632        }
 633
 634        if (cache->cached != BTRFS_CACHE_NO) {
 635                spin_unlock(&cache->lock);
 636                kfree(caching_ctl);
 637                return 0;
 638        }
 639        WARN_ON(cache->caching_ctl);
 640        cache->caching_ctl = caching_ctl;
 641        cache->cached = BTRFS_CACHE_FAST;
 642        spin_unlock(&cache->lock);
 643
 644        if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
 645                mutex_lock(&caching_ctl->mutex);
 646                ret = load_free_space_cache(fs_info, cache);
 647
 648                spin_lock(&cache->lock);
 649                if (ret == 1) {
 650                        cache->caching_ctl = NULL;
 651                        cache->cached = BTRFS_CACHE_FINISHED;
 652                        cache->last_byte_to_unpin = (u64)-1;
 653                        caching_ctl->progress = (u64)-1;
 654                } else {
 655                        if (load_cache_only) {
 656                                cache->caching_ctl = NULL;
 657                                cache->cached = BTRFS_CACHE_NO;
 658                        } else {
 659                                cache->cached = BTRFS_CACHE_STARTED;
 660                                cache->has_caching_ctl = 1;
 661                        }
 662                }
 663                spin_unlock(&cache->lock);
 664#ifdef CONFIG_BTRFS_DEBUG
 665                if (ret == 1 &&
 666                    btrfs_should_fragment_free_space(cache)) {
 667                        u64 bytes_used;
 668
 669                        spin_lock(&cache->space_info->lock);
 670                        spin_lock(&cache->lock);
 671                        bytes_used = cache->key.offset -
 672                                btrfs_block_group_used(&cache->item);
 673                        cache->space_info->bytes_used += bytes_used >> 1;
 674                        spin_unlock(&cache->lock);
 675                        spin_unlock(&cache->space_info->lock);
 676                        fragment_free_space(cache);
 677                }
 678#endif
 679                mutex_unlock(&caching_ctl->mutex);
 680
 681                wake_up(&caching_ctl->wait);
 682                if (ret == 1) {
 683                        put_caching_control(caching_ctl);
 684                        free_excluded_extents(cache);
 685                        return 0;
 686                }
 687        } else {
 688                /*
 689                 * We're either using the free space tree or no caching at all.
 690                 * Set cached to the appropriate value and wake up any waiters.
 691                 */
 692                spin_lock(&cache->lock);
 693                if (load_cache_only) {
 694                        cache->caching_ctl = NULL;
 695                        cache->cached = BTRFS_CACHE_NO;
 696                } else {
 697                        cache->cached = BTRFS_CACHE_STARTED;
 698                        cache->has_caching_ctl = 1;
 699                }
 700                spin_unlock(&cache->lock);
 701                wake_up(&caching_ctl->wait);
 702        }
 703
 704        if (load_cache_only) {
 705                put_caching_control(caching_ctl);
 706                return 0;
 707        }
 708
 709        down_write(&fs_info->commit_root_sem);
 710        refcount_inc(&caching_ctl->count);
 711        list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
 712        up_write(&fs_info->commit_root_sem);
 713
 714        btrfs_get_block_group(cache);
 715
 716        btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
 717
 718        return ret;
 719}
 720
 721/*
 722 * return the block group that starts at or after bytenr
 723 */
 724static struct btrfs_block_group_cache *
 725btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
 726{
 727        return block_group_cache_tree_search(info, bytenr, 0);
 728}
 729
 730/*
 731 * return the block group that contains the given bytenr
 732 */
 733struct btrfs_block_group_cache *btrfs_lookup_block_group(
 734                                                 struct btrfs_fs_info *info,
 735                                                 u64 bytenr)
 736{
 737        return block_group_cache_tree_search(info, bytenr, 1);
 738}
 739
 740static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 741                                                  u64 flags)
 742{
 743        struct list_head *head = &info->space_info;
 744        struct btrfs_space_info *found;
 745
 746        flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
 747
 748        rcu_read_lock();
 749        list_for_each_entry_rcu(found, head, list) {
 750                if (found->flags & flags) {
 751                        rcu_read_unlock();
 752                        return found;
 753                }
 754        }
 755        rcu_read_unlock();
 756        return NULL;
 757}
 758
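    /*
     * Adjust the total_bytes_pinned counter of the space_info that matches
     * the given extent type (data, metadata, or system for chunk tree
     * blocks).  num_bytes is signed, so callers can subtract as well.
     */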
 759static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
 760                             bool metadata, u64 root_objectid)
 761{
 762        struct btrfs_space_info *space_info;
 763        u64 flags;
 764
 765        if (metadata) {
 766                if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
 767                        flags = BTRFS_BLOCK_GROUP_SYSTEM;
 768                else
 769                        flags = BTRFS_BLOCK_GROUP_METADATA;
 770        } else {
 771                flags = BTRFS_BLOCK_GROUP_DATA;
 772        }
 773
 774        space_info = __find_space_info(fs_info, flags);
 775        ASSERT(space_info);
 776        percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes,
 777                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
 778}
 779
 780/*
 781 * after adding space to the filesystem, we need to clear the full flags
 782 * on all the space infos.
 783 */
 784void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
 785{
 786        struct list_head *head = &info->space_info;
 787        struct btrfs_space_info *found;
 788
 789        rcu_read_lock();
 790        list_for_each_entry_rcu(found, head, list)
 791                found->full = 0;
 792        rcu_read_unlock();
 793}
 794
 795/* simple helper to search for an existing data extent at a given offset */
 796int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
 797{
 798        int ret;
 799        struct btrfs_key key;
 800        struct btrfs_path *path;
 801
 802        path = btrfs_alloc_path();
 803        if (!path)
 804                return -ENOMEM;
 805
 806        key.objectid = start;
 807        key.offset = len;
 808        key.type = BTRFS_EXTENT_ITEM_KEY;
 809        ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
 810        btrfs_free_path(path);
 811        return ret;
 812}
 813
 814/*
 815 * Helper function to look up the reference count and flags of a tree block.
 816 *
 817 * The head node for a delayed ref is used to store the sum of all the
 818 * reference count modifications queued up in the rbtree. The head
 819 * node may also store the extent flags to set. This way you can check
 820 * what the reference count and extent flags will be once all of the
 821 * queued delayed refs have been processed, without waiting for them to run.
 822 */
 823int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 824                             struct btrfs_fs_info *fs_info, u64 bytenr,
 825                             u64 offset, int metadata, u64 *refs, u64 *flags)
 826{
 827        struct btrfs_delayed_ref_head *head;
 828        struct btrfs_delayed_ref_root *delayed_refs;
 829        struct btrfs_path *path;
 830        struct btrfs_extent_item *ei;
 831        struct extent_buffer *leaf;
 832        struct btrfs_key key;
 833        u32 item_size;
 834        u64 num_refs;
 835        u64 extent_flags;
 836        int ret;
 837
 838        /*
 839         * If we don't have skinny metadata, don't bother doing anything
 840         * different
 841         */
 842        if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
 843                offset = fs_info->nodesize;
 844                metadata = 0;
 845        }
 846
 847        path = btrfs_alloc_path();
 848        if (!path)
 849                return -ENOMEM;
 850
 851        if (!trans) {
 852                path->skip_locking = 1;
 853                path->search_commit_root = 1;
 854        }
 855
 856search_again:
 857        key.objectid = bytenr;
 858        key.offset = offset;
 859        if (metadata)
 860                key.type = BTRFS_METADATA_ITEM_KEY;
 861        else
 862                key.type = BTRFS_EXTENT_ITEM_KEY;
 863
 864        ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
 865        if (ret < 0)
 866                goto out_free;
 867
 868        if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
 869                if (path->slots[0]) {
 870                        path->slots[0]--;
 871                        btrfs_item_key_to_cpu(path->nodes[0], &key,
 872                                              path->slots[0]);
 873                        if (key.objectid == bytenr &&
 874                            key.type == BTRFS_EXTENT_ITEM_KEY &&
 875                            key.offset == fs_info->nodesize)
 876                                ret = 0;
 877                }
 878        }
 879
 880        if (ret == 0) {
 881                leaf = path->nodes[0];
 882                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 883                if (item_size >= sizeof(*ei)) {
 884                        ei = btrfs_item_ptr(leaf, path->slots[0],
 885                                            struct btrfs_extent_item);
 886                        num_refs = btrfs_extent_refs(leaf, ei);
 887                        extent_flags = btrfs_extent_flags(leaf, ei);
 888                } else {
 889                        ret = -EINVAL;
 890                        btrfs_print_v0_err(fs_info);
 891                        if (trans)
 892                                btrfs_abort_transaction(trans, ret);
 893                        else
 894                                btrfs_handle_fs_error(fs_info, ret, NULL);
 895
 896                        goto out_free;
 897                }
 898
 899                BUG_ON(num_refs == 0);
 900        } else {
 901                num_refs = 0;
 902                extent_flags = 0;
 903                ret = 0;
 904        }
 905
 906        if (!trans)
 907                goto out;
 908
 909        delayed_refs = &trans->transaction->delayed_refs;
 910        spin_lock(&delayed_refs->lock);
 911        head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
 912        if (head) {
 913                if (!mutex_trylock(&head->mutex)) {
 914                        refcount_inc(&head->refs);
 915                        spin_unlock(&delayed_refs->lock);
 916
 917                        btrfs_release_path(path);
 918
 919                        /*
 920                         * Mutex was contended, block until it's released and try
 921                         * again
 922                         */
 923                        mutex_lock(&head->mutex);
 924                        mutex_unlock(&head->mutex);
 925                        btrfs_put_delayed_ref_head(head);
 926                        goto search_again;
 927                }
 928                spin_lock(&head->lock);
 929                if (head->extent_op && head->extent_op->update_flags)
 930                        extent_flags |= head->extent_op->flags_to_set;
 931                else
 932                        BUG_ON(num_refs == 0);
 933
 934                num_refs += head->ref_mod;
 935                spin_unlock(&head->lock);
 936                mutex_unlock(&head->mutex);
 937        }
 938        spin_unlock(&delayed_refs->lock);
 939out:
 940        WARN_ON(num_refs == 0);
 941        if (refs)
 942                *refs = num_refs;
 943        if (flags)
 944                *flags = extent_flags;
 945out_free:
 946        btrfs_free_path(path);
 947        return ret;
 948}
 949
 950/*
 951 * Back reference rules.  Back refs have three main goals:
 952 *
 953 * 1) differentiate between all holders of references to an extent so that
 954 *    when a reference is dropped we can make sure it was a valid reference
 955 *    before freeing the extent.
 956 *
 957 * 2) Provide enough information to quickly find the holders of an extent
 958 *    if we notice a given block is corrupted or bad.
 959 *
 960 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 961 *    maintenance.  This is actually the same as #2, but with a slightly
 962 *    different use case.
 963 *
 964 * There are two kinds of back refs. Implicit back refs are optimized
 965 * for pointers in non-shared tree blocks. For a given pointer in a block,
 966 * back refs of this kind provide information about the block's owner tree
 967 * and the pointer's key. This information allows us to find the block by
 968 * searching the b-tree. Full back refs are for pointers in tree blocks not
 969 * referenced by their owner trees. The location of the tree block is recorded
 970 * in the back refs. Actually full back refs are generic and can be
 971 * used in all cases where implicit back refs are used. The major shortcoming
 972 * of full back refs is their overhead: every time a tree block gets
 973 * COWed, we have to update the back ref entries for all pointers in it.
 974 *
 975 * For a newly allocated tree block, we use implicit back refs for
 976 * pointers in it. This means most tree related operations only involve
 977 * implicit back refs. For a tree block created in an old transaction, the
 978 * only way to drop a reference to it is to COW it. So we can detect the
 979 * event that a tree block loses its owner tree's reference and do the
 980 * back ref conversion.
 981 *
 982 * When a tree block is COWed through a tree, there are four cases:
 983 *
 984 * The reference count of the block is one and the tree is the block's
 985 * owner tree. Nothing to do in this case.
 986 *
 987 * The reference count of the block is one and the tree is not the
 988 * block's owner tree. In this case, full back refs are used for pointers
 989 * in the block. Remove these full back refs and add implicit back refs for
 990 * every pointer in the new block.
 991 *
 992 * The reference count of the block is greater than one and the tree is
 993 * the block's owner tree. In this case, implicit back refs are used for
 994 * pointers in the block. Add full back refs for every pointer in the
 995 * block and increase the lower level extents' reference counts. The
 996 * original implicit back refs are carried over to the new block.
 997 *
 998 * The reference count of the block is greater than one and the tree is
 999 * not the block's owner tree. Add implicit back refs for every pointer in
1000 * the new block and increase the lower level extents' reference count.
1001 *
1002 * Back Reference Key composing:
1003 *
1004 * The key objectid corresponds to the first byte in the extent,
1005 * The key type is used to differentiate between types of back refs.
1006 * There are different meanings of the key offset for different types
1007 * of back refs.
1008 *
1009 * File extents can be referenced by:
1010 *
1011 * - multiple snapshots, subvolumes, or different generations in one subvol
1012 * - different files inside a single subvolume
1013 * - different offsets inside a file (bookend extents in file.c)
1014 *
1015 * The extent ref structure for the implicit back refs has fields for:
1016 *
1017 * - Objectid of the subvolume root
1018 * - objectid of the file holding the reference
1019 * - original offset in the file
1020 * - how many bookend extents
1021 *
1022 * The key offset for the implicit back refs is the hash of the first
1023 * three fields.
1024 *
1025 * The extent ref structure for the full back refs has a field for:
1026 *
1027 * - number of pointers in the tree leaf
1028 *
1029 * The key offset for the full back refs is the first byte of
1030 * the tree leaf.
1031 *
1032 * When a file extent is allocated, the implicit back refs are used
1033 * and the fields are filled in:
1034 *
1035 *     (root_key.objectid, inode objectid, offset in file, 1)
1036 *
1037 * When a file extent is removed during file truncation, we find the
1038 * corresponding implicit back refs and check the following fields:
1039 *
1040 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
1041 *
1042 * Btree extents can be referenced by:
1043 *
1044 * - Different subvolumes
1045 *
1046 * Both the implicit back refs and the full back refs for tree blocks
1047 * consist of a key only. The key offset for the implicit back refs is the
1048 * objectid of the block's owner tree. The key offset for the full back refs
1049 * is the first byte of the parent block.
1050 *
1051 * When implicit back refs are used, information about the lowest key and
1052 * the level of the tree block is required. This information is stored in
1053 * the tree block info structure.
1054 */
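    /*
     * Example of the two key forms for a data extent at disk bytenr X
     * (the numbers are illustrative): an implicit back ref from inode 257
     * at file offset 0 in subvolume 5 is keyed as
     *     (X, BTRFS_EXTENT_DATA_REF_KEY, hash(5, 257, 0))
     * while a full (shared) back ref from the leaf at bytenr P is keyed as
     *     (X, BTRFS_SHARED_DATA_REF_KEY, P)
     */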
1055
1056/*
1057 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
1058 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
1059 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
1060 */
1061int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
1062                                     struct btrfs_extent_inline_ref *iref,
1063                                     enum btrfs_inline_ref_type is_data)
1064{
1065        int type = btrfs_extent_inline_ref_type(eb, iref);
1066        u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
1067
1068        if (type == BTRFS_TREE_BLOCK_REF_KEY ||
1069            type == BTRFS_SHARED_BLOCK_REF_KEY ||
1070            type == BTRFS_SHARED_DATA_REF_KEY ||
1071            type == BTRFS_EXTENT_DATA_REF_KEY) {
1072                if (is_data == BTRFS_REF_TYPE_BLOCK) {
1073                        if (type == BTRFS_TREE_BLOCK_REF_KEY)
1074                                return type;
1075                        if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1076                                ASSERT(eb->fs_info);
1077                                /*
1078                                 * Every shared one has a parent tree
1079                                 * block, which must be aligned to
1080                                 * nodesize.
1081                                 */
1082                                if (offset &&
1083                                    IS_ALIGNED(offset, eb->fs_info->nodesize))
1084                                        return type;
1085                        }
1086                } else if (is_data == BTRFS_REF_TYPE_DATA) {
1087                        if (type == BTRFS_EXTENT_DATA_REF_KEY)
1088                                return type;
1089                        if (type == BTRFS_SHARED_DATA_REF_KEY) {
1090                                ASSERT(eb->fs_info);
1091                                /*
1092                                 * Every shared one has a parent tree
1093                                 * block, which must be aligned to
1094                                 * nodesize.
1095                                 */
1096                                if (offset &&
1097                                    IS_ALIGNED(offset, eb->fs_info->nodesize))
1098                                        return type;
1099                        }
1100                } else {
1101                        ASSERT(is_data == BTRFS_REF_TYPE_ANY);
1102                        return type;
1103                }
1104        }
1105
1106        btrfs_print_leaf((struct extent_buffer *)eb);
1107        btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
1108                  eb->start, type);
1109        WARN_ON(1);
1110
1111        return BTRFS_REF_TYPE_INVALID;
1112}
1113
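    /*
     * Hash the (root objectid, inode objectid, file offset) triple that
     * identifies an implicit data back ref.  The result is used as the key
     * offset of the corresponding BTRFS_EXTENT_DATA_REF_KEY item.
     */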
1114static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1115{
1116        u32 high_crc = ~(u32)0;
1117        u32 low_crc = ~(u32)0;
1118        __le64 lenum;
1119
1120        lenum = cpu_to_le64(root_objectid);
1121        high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1122        lenum = cpu_to_le64(owner);
1123        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1124        lenum = cpu_to_le64(offset);
1125        low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1126
1127        return ((u64)high_crc << 31) ^ (u64)low_crc;
1128}
1129
1130static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1131                                     struct btrfs_extent_data_ref *ref)
1132{
1133        return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1134                                    btrfs_extent_data_ref_objectid(leaf, ref),
1135                                    btrfs_extent_data_ref_offset(leaf, ref));
1136}
1137
1138static int match_extent_data_ref(struct extent_buffer *leaf,
1139                                 struct btrfs_extent_data_ref *ref,
1140                                 u64 root_objectid, u64 owner, u64 offset)
1141{
1142        if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1143            btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1144            btrfs_extent_data_ref_offset(leaf, ref) != offset)
1145                return 0;
1146        return 1;
1147}
1148
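    /*
     * Find an existing data back ref for the given extent.  A non-zero
     * parent means we look for the shared data ref keyed by the parent
     * block; otherwise we search the implicit refs at the hashed key offset
     * and walk forward over hash collisions.  Returns 0 if found, -ENOENT
     * if not, or a negative error from the search.
     */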
1149static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1150                                           struct btrfs_path *path,
1151                                           u64 bytenr, u64 parent,
1152                                           u64 root_objectid,
1153                                           u64 owner, u64 offset)
1154{
1155        struct btrfs_root *root = trans->fs_info->extent_root;
1156        struct btrfs_key key;
1157        struct btrfs_extent_data_ref *ref;
1158        struct extent_buffer *leaf;
1159        u32 nritems;
1160        int ret;
1161        int recow;
1162        int err = -ENOENT;
1163
1164        key.objectid = bytenr;
1165        if (parent) {
1166                key.type = BTRFS_SHARED_DATA_REF_KEY;
1167                key.offset = parent;
1168        } else {
1169                key.type = BTRFS_EXTENT_DATA_REF_KEY;
1170                key.offset = hash_extent_data_ref(root_objectid,
1171                                                  owner, offset);
1172        }
1173again:
1174        recow = 0;
1175        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1176        if (ret < 0) {
1177                err = ret;
1178                goto fail;
1179        }
1180
1181        if (parent) {
1182                if (!ret)
1183                        return 0;
1184                goto fail;
1185        }
1186
1187        leaf = path->nodes[0];
1188        nritems = btrfs_header_nritems(leaf);
1189        while (1) {
1190                if (path->slots[0] >= nritems) {
1191                        ret = btrfs_next_leaf(root, path);
1192                        if (ret < 0)
1193                                err = ret;
1194                        if (ret)
1195                                goto fail;
1196
1197                        leaf = path->nodes[0];
1198                        nritems = btrfs_header_nritems(leaf);
1199                        recow = 1;
1200                }
1201
1202                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1203                if (key.objectid != bytenr ||
1204                    key.type != BTRFS_EXTENT_DATA_REF_KEY)
1205                        goto fail;
1206
1207                ref = btrfs_item_ptr(leaf, path->slots[0],
1208                                     struct btrfs_extent_data_ref);
1209
1210                if (match_extent_data_ref(leaf, ref, root_objectid,
1211                                          owner, offset)) {
1212                        if (recow) {
1213                                btrfs_release_path(path);
1214                                goto again;
1215                        }
1216                        err = 0;
1217                        break;
1218                }
1219                path->slots[0]++;
1220        }
1221fail:
1222        return err;
1223}
1224
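    /*
     * Insert a new data back ref item or bump the count of an existing one.
     * Implicit refs that collide on the hashed key offset are handled by
     * retrying with key.offset + 1 until a matching or empty slot is found.
     */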
1225static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1226                                           struct btrfs_path *path,
1227                                           u64 bytenr, u64 parent,
1228                                           u64 root_objectid, u64 owner,
1229                                           u64 offset, int refs_to_add)
1230{
1231        struct btrfs_root *root = trans->fs_info->extent_root;
1232        struct btrfs_key key;
1233        struct extent_buffer *leaf;
1234        u32 size;
1235        u32 num_refs;
1236        int ret;
1237
1238        key.objectid = bytenr;
1239        if (parent) {
1240                key.type = BTRFS_SHARED_DATA_REF_KEY;
1241                key.offset = parent;
1242                size = sizeof(struct btrfs_shared_data_ref);
1243        } else {
1244                key.type = BTRFS_EXTENT_DATA_REF_KEY;
1245                key.offset = hash_extent_data_ref(root_objectid,
1246                                                  owner, offset);
1247                size = sizeof(struct btrfs_extent_data_ref);
1248        }
1249
1250        ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1251        if (ret && ret != -EEXIST)
1252                goto fail;
1253
1254        leaf = path->nodes[0];
1255        if (parent) {
1256                struct btrfs_shared_data_ref *ref;
1257                ref = btrfs_item_ptr(leaf, path->slots[0],
1258                                     struct btrfs_shared_data_ref);
1259                if (ret == 0) {
1260                        btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1261                } else {
1262                        num_refs = btrfs_shared_data_ref_count(leaf, ref);
1263                        num_refs += refs_to_add;
1264                        btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1265                }
1266        } else {
1267                struct btrfs_extent_data_ref *ref;
1268                while (ret == -EEXIST) {
1269                        ref = btrfs_item_ptr(leaf, path->slots[0],
1270                                             struct btrfs_extent_data_ref);
1271                        if (match_extent_data_ref(leaf, ref, root_objectid,
1272                                                  owner, offset))
1273                                break;
1274                        btrfs_release_path(path);
1275                        key.offset++;
1276                        ret = btrfs_insert_empty_item(trans, root, path, &key,
1277                                                      size);
1278                        if (ret && ret != -EEXIST)
1279                                goto fail;
1280
1281                        leaf = path->nodes[0];
1282                }
1283                ref = btrfs_item_ptr(leaf, path->slots[0],
1284                                     struct btrfs_extent_data_ref);
1285                if (ret == 0) {
1286                        btrfs_set_extent_data_ref_root(leaf, ref,
1287                                                       root_objectid);
1288                        btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1289                        btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1290                        btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1291                } else {
1292                        num_refs = btrfs_extent_data_ref_count(leaf, ref);
1293                        num_refs += refs_to_add;
1294                        btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1295                }
1296        }
1297        btrfs_mark_buffer_dirty(leaf);
1298        ret = 0;
1299fail:
1300        btrfs_release_path(path);
1301        return ret;
1302}
1303
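    /*
     * Drop refs_to_drop references from the data back ref item the path
     * points at, deleting the item and setting *last_ref once the
     * reference count reaches zero.
     */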
1304static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1305                                           struct btrfs_path *path,
1306                                           int refs_to_drop, int *last_ref)
1307{
1308        struct btrfs_key key;
1309        struct btrfs_extent_data_ref *ref1 = NULL;
1310        struct btrfs_shared_data_ref *ref2 = NULL;
1311        struct extent_buffer *leaf;
1312        u32 num_refs = 0;
1313        int ret = 0;
1314
1315        leaf = path->nodes[0];
1316        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1317
1318        if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1319                ref1 = btrfs_item_ptr(leaf, path->slots[0],
1320                                      struct btrfs_extent_data_ref);
1321                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1322        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1323                ref2 = btrfs_item_ptr(leaf, path->slots[0],
1324                                      struct btrfs_shared_data_ref);
1325                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1326        } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
1327                btrfs_print_v0_err(trans->fs_info);
1328                btrfs_abort_transaction(trans, -EINVAL);
1329                return -EINVAL;
1330        } else {
1331                BUG();
1332        }
1333
1334        BUG_ON(num_refs < refs_to_drop);
1335        num_refs -= refs_to_drop;
1336
1337        if (num_refs == 0) {
1338                ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1339                *last_ref = 1;
1340        } else {
1341                if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1342                        btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1343                else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1344                        btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1345                btrfs_mark_buffer_dirty(leaf);
1346        }
1347        return ret;
1348}
1349
1350static noinline u32 extent_data_ref_count(struct btrfs_path *path,
1351                                          struct btrfs_extent_inline_ref *iref)
1352{
1353        struct btrfs_key key;
1354        struct extent_buffer *leaf;
1355        struct btrfs_extent_data_ref *ref1;
1356        struct btrfs_shared_data_ref *ref2;
1357        u32 num_refs = 0;
1358        int type;
1359
1360        leaf = path->nodes[0];
1361        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1362
1363        BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
1364        if (iref) {
1365                /*
1366                 * If type is invalid, we should have bailed out earlier than
1367                 * this call.
1368                 */
1369                type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
1370                ASSERT(type != BTRFS_REF_TYPE_INVALID);
1371                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1372                        ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1373                        num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1374                } else {
1375                        ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1376                        num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1377                }
1378        } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1379                ref1 = btrfs_item_ptr(leaf, path->slots[0],
1380                                      struct btrfs_extent_data_ref);
1381                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1382        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1383                ref2 = btrfs_item_ptr(leaf, path->slots[0],
1384                                      struct btrfs_shared_data_ref);
1385                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1386        } else {
1387                WARN_ON(1);
1388        }
1389        return num_refs;
1390}
1391
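/*
 * Find the keyed backref item for a tree block: a SHARED_BLOCK_REF keyed on
 * @parent when the block is shared, otherwise a TREE_BLOCK_REF keyed on
 * @root_objectid.  Returns 0 with the path positioned on the item, -ENOENT
 * if it does not exist.
 */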
1392static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1393                                          struct btrfs_path *path,
1394                                          u64 bytenr, u64 parent,
1395                                          u64 root_objectid)
1396{
1397        struct btrfs_root *root = trans->fs_info->extent_root;
1398        struct btrfs_key key;
1399        int ret;
1400
1401        key.objectid = bytenr;
1402        if (parent) {
1403                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1404                key.offset = parent;
1405        } else {
1406                key.type = BTRFS_TREE_BLOCK_REF_KEY;
1407                key.offset = root_objectid;
1408        }
1409
1410        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1411        if (ret > 0)
1412                ret = -ENOENT;
1413        return ret;
1414}
1415
1416static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1417                                          struct btrfs_path *path,
1418                                          u64 bytenr, u64 parent,
1419                                          u64 root_objectid)
1420{
1421        struct btrfs_key key;
1422        int ret;
1423
1424        key.objectid = bytenr;
1425        if (parent) {
1426                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1427                key.offset = parent;
1428        } else {
1429                key.type = BTRFS_TREE_BLOCK_REF_KEY;
1430                key.offset = root_objectid;
1431        }
1432
1433        ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
1434                                      path, &key, 0);
1435        btrfs_release_path(path);
1436        return ret;
1437}
1438
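/*
 * Pick the backref type for an extent.  Tree blocks (owner below
 * BTRFS_FIRST_FREE_OBJECTID, i.e. the owner is a level) use the block ref
 * types, data extents use the data ref types; the shared variants are used
 * whenever a parent block is supplied.
 */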
1439static inline int extent_ref_type(u64 parent, u64 owner)
1440{
1441        int type;
1442        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1443                if (parent > 0)
1444                        type = BTRFS_SHARED_BLOCK_REF_KEY;
1445                else
1446                        type = BTRFS_TREE_BLOCK_REF_KEY;
1447        } else {
1448                if (parent > 0)
1449                        type = BTRFS_SHARED_DATA_REF_KEY;
1450                else
1451                        type = BTRFS_EXTENT_DATA_REF_KEY;
1452        }
1453        return type;
1454}
1455
1456static int find_next_key(struct btrfs_path *path, int level,
1457                         struct btrfs_key *key)
1459{
1460        for (; level < BTRFS_MAX_LEVEL; level++) {
1461                if (!path->nodes[level])
1462                        break;
1463                if (path->slots[level] + 1 >=
1464                    btrfs_header_nritems(path->nodes[level]))
1465                        continue;
1466                if (level == 0)
1467                        btrfs_item_key_to_cpu(path->nodes[level], key,
1468                                              path->slots[level] + 1);
1469                else
1470                        btrfs_node_key_to_cpu(path->nodes[level], key,
1471                                              path->slots[level] + 1);
1472                return 0;
1473        }
1474        return 1;
1475}
1476
1477/*
1478 * look for inline back ref. if back ref is found, *ref_ret is set
1479 * to the address of inline back ref, and 0 is returned.
1480 *
1481 * if back ref isn't found, *ref_ret is set to the address where it
1482 * should be inserted, and -ENOENT is returned.
1483 *
1484 * if insert is true and there are too many inline back refs, the path
1485 * points to the extent item, and -EAGAIN is returned.
1486 *
1487 * NOTE: inline back refs are ordered in the same way that back ref
1488 *       items in the tree are ordered.
1489 */
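/*
 * Rough sketch of the layout walked below (the structures in ctree.h are
 * the authoritative reference):
 *
 *	[ btrfs_extent_item ]
 *	[ btrfs_tree_block_info ]    only for non-skinny tree block items
 *	[ inline ref ][ inline ref ] ...
 *
 * Each inline ref begins with a btrfs_extent_inline_ref.  For
 * EXTENT_DATA_REF the btrfs_extent_data_ref body overlays the offset field,
 * for SHARED_DATA_REF a btrfs_shared_data_ref count follows it, which is
 * why the code below casts &iref->offset and (iref + 1) respectively.
 */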
1490static noinline_for_stack
1491int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1492                                 struct btrfs_path *path,
1493                                 struct btrfs_extent_inline_ref **ref_ret,
1494                                 u64 bytenr, u64 num_bytes,
1495                                 u64 parent, u64 root_objectid,
1496                                 u64 owner, u64 offset, int insert)
1497{
1498        struct btrfs_fs_info *fs_info = trans->fs_info;
1499        struct btrfs_root *root = fs_info->extent_root;
1500        struct btrfs_key key;
1501        struct extent_buffer *leaf;
1502        struct btrfs_extent_item *ei;
1503        struct btrfs_extent_inline_ref *iref;
1504        u64 flags;
1505        u64 item_size;
1506        unsigned long ptr;
1507        unsigned long end;
1508        int extra_size;
1509        int type;
1510        int want;
1511        int ret;
1512        int err = 0;
1513        bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
1514        int needed;
1515
1516        key.objectid = bytenr;
1517        key.type = BTRFS_EXTENT_ITEM_KEY;
1518        key.offset = num_bytes;
1519
1520        want = extent_ref_type(parent, owner);
1521        if (insert) {
1522                extra_size = btrfs_extent_inline_ref_size(want);
1523                path->keep_locks = 1;
1524        } else
1525                extra_size = -1;
1526
1527        /*
1528         * Owner is our level, so we can just add one to get the level for the
1529         * block we are interested in.
1530         */
1531        if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1532                key.type = BTRFS_METADATA_ITEM_KEY;
1533                key.offset = owner;
1534        }
1535
1536again:
1537        ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1538        if (ret < 0) {
1539                err = ret;
1540                goto out;
1541        }
1542
1543        /*
1544         * We may be a newly converted file system which still has the old fat
1545         * extent entries for metadata, so try and see if we have one of those.
1546         */
1547        if (ret > 0 && skinny_metadata) {
1548                skinny_metadata = false;
1549                if (path->slots[0]) {
1550                        path->slots[0]--;
1551                        btrfs_item_key_to_cpu(path->nodes[0], &key,
1552                                              path->slots[0]);
1553                        if (key.objectid == bytenr &&
1554                            key.type == BTRFS_EXTENT_ITEM_KEY &&
1555                            key.offset == num_bytes)
1556                                ret = 0;
1557                }
1558                if (ret) {
1559                        key.objectid = bytenr;
1560                        key.type = BTRFS_EXTENT_ITEM_KEY;
1561                        key.offset = num_bytes;
1562                        btrfs_release_path(path);
1563                        goto again;
1564                }
1565        }
1566
1567        if (ret && !insert) {
1568                err = -ENOENT;
1569                goto out;
1570        } else if (WARN_ON(ret)) {
1571                err = -EIO;
1572                goto out;
1573        }
1574
1575        leaf = path->nodes[0];
1576        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1577        if (unlikely(item_size < sizeof(*ei))) {
1578                err = -EINVAL;
1579                btrfs_print_v0_err(fs_info);
1580                btrfs_abort_transaction(trans, err);
1581                goto out;
1582        }
1583
1584        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1585        flags = btrfs_extent_flags(leaf, ei);
1586
1587        ptr = (unsigned long)(ei + 1);
1588        end = (unsigned long)ei + item_size;
1589
1590        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1591                ptr += sizeof(struct btrfs_tree_block_info);
1592                BUG_ON(ptr > end);
1593        }
1594
1595        if (owner >= BTRFS_FIRST_FREE_OBJECTID)
1596                needed = BTRFS_REF_TYPE_DATA;
1597        else
1598                needed = BTRFS_REF_TYPE_BLOCK;
1599
1600        err = -ENOENT;
1601        while (1) {
1602                if (ptr >= end) {
1603                        WARN_ON(ptr > end);
1604                        break;
1605                }
1606                iref = (struct btrfs_extent_inline_ref *)ptr;
1607                type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
1608                if (type == BTRFS_REF_TYPE_INVALID) {
1609                        err = -EUCLEAN;
1610                        goto out;
1611                }
1612
1613                if (want < type)
1614                        break;
1615                if (want > type) {
1616                        ptr += btrfs_extent_inline_ref_size(type);
1617                        continue;
1618                }
1619
1620                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1621                        struct btrfs_extent_data_ref *dref;
1622                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1623                        if (match_extent_data_ref(leaf, dref, root_objectid,
1624                                                  owner, offset)) {
1625                                err = 0;
1626                                break;
1627                        }
1628                        if (hash_extent_data_ref_item(leaf, dref) <
1629                            hash_extent_data_ref(root_objectid, owner, offset))
1630                                break;
1631                } else {
1632                        u64 ref_offset;
1633                        ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1634                        if (parent > 0) {
1635                                if (parent == ref_offset) {
1636                                        err = 0;
1637                                        break;
1638                                }
1639                                if (ref_offset < parent)
1640                                        break;
1641                        } else {
1642                                if (root_objectid == ref_offset) {
1643                                        err = 0;
1644                                        break;
1645                                }
1646                                if (ref_offset < root_objectid)
1647                                        break;
1648                        }
1649                }
1650                ptr += btrfs_extent_inline_ref_size(type);
1651        }
1652        if (err == -ENOENT && insert) {
1653                if (item_size + extra_size >=
1654                    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1655                        err = -EAGAIN;
1656                        goto out;
1657                }
1658                /*
1659                 * To add a new inline back ref, we have to make sure
1660                 * there is no corresponding back ref item.
1661                 * For simplicity, we just do not add a new inline back
1662                 * ref if there is any kind of item for this block.
1663                 */
1664                if (find_next_key(path, 0, &key) == 0 &&
1665                    key.objectid == bytenr &&
1666                    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1667                        err = -EAGAIN;
1668                        goto out;
1669                }
1670        }
1671        *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1672out:
1673        if (insert) {
1674                path->keep_locks = 0;
1675                btrfs_unlock_up_safe(path, 1);
1676        }
1677        return err;
1678}
1679
1680/*
1681 * helper to add new inline back ref
1682 */
1683static noinline_for_stack
1684void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
1685                                 struct btrfs_path *path,
1686                                 struct btrfs_extent_inline_ref *iref,
1687                                 u64 parent, u64 root_objectid,
1688                                 u64 owner, u64 offset, int refs_to_add,
1689                                 struct btrfs_delayed_extent_op *extent_op)
1690{
1691        struct extent_buffer *leaf;
1692        struct btrfs_extent_item *ei;
1693        unsigned long ptr;
1694        unsigned long end;
1695        unsigned long item_offset;
1696        u64 refs;
1697        int size;
1698        int type;
1699
1700        leaf = path->nodes[0];
1701        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1702        item_offset = (unsigned long)iref - (unsigned long)ei;
1703
1704        type = extent_ref_type(parent, owner);
1705        size = btrfs_extent_inline_ref_size(type);
1706
1707        btrfs_extend_item(fs_info, path, size);
1708
1709        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1710        refs = btrfs_extent_refs(leaf, ei);
1711        refs += refs_to_add;
1712        btrfs_set_extent_refs(leaf, ei, refs);
1713        if (extent_op)
1714                __run_delayed_extent_op(extent_op, leaf, ei);
1715
1716        ptr = (unsigned long)ei + item_offset;
1717        end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1718        if (ptr < end - size)
1719                memmove_extent_buffer(leaf, ptr + size, ptr,
1720                                      end - size - ptr);
1721
1722        iref = (struct btrfs_extent_inline_ref *)ptr;
1723        btrfs_set_extent_inline_ref_type(leaf, iref, type);
1724        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1725                struct btrfs_extent_data_ref *dref;
1726                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1727                btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1728                btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1729                btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1730                btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1731        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1732                struct btrfs_shared_data_ref *sref;
1733                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1734                btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1735                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1736        } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1737                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1738        } else {
1739                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1740        }
1741        btrfs_mark_buffer_dirty(leaf);
1742}
1743
1744static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1745                                 struct btrfs_path *path,
1746                                 struct btrfs_extent_inline_ref **ref_ret,
1747                                 u64 bytenr, u64 num_bytes, u64 parent,
1748                                 u64 root_objectid, u64 owner, u64 offset)
1749{
1750        int ret;
1751
1752        ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1753                                           num_bytes, parent, root_objectid,
1754                                           owner, offset, 0);
1755        if (ret != -ENOENT)
1756                return ret;
1757
1758        btrfs_release_path(path);
1759        *ref_ret = NULL;
1760
1761        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1762                ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1763                                            root_objectid);
1764        } else {
1765                ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1766                                             root_objectid, owner, offset);
1767        }
1768        return ret;
1769}
1770
1771/*
1772 * helper to update/remove inline back ref
1773 */
1774static noinline_for_stack
1775void update_inline_extent_backref(struct btrfs_path *path,
1776                                  struct btrfs_extent_inline_ref *iref,
1777                                  int refs_to_mod,
1778                                  struct btrfs_delayed_extent_op *extent_op,
1779                                  int *last_ref)
1780{
1781        struct extent_buffer *leaf = path->nodes[0];
1782        struct btrfs_fs_info *fs_info = leaf->fs_info;
1783        struct btrfs_extent_item *ei;
1784        struct btrfs_extent_data_ref *dref = NULL;
1785        struct btrfs_shared_data_ref *sref = NULL;
1786        unsigned long ptr;
1787        unsigned long end;
1788        u32 item_size;
1789        int size;
1790        int type;
1791        u64 refs;
1792
1793        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1794        refs = btrfs_extent_refs(leaf, ei);
1795        WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1796        refs += refs_to_mod;
1797        btrfs_set_extent_refs(leaf, ei, refs);
1798        if (extent_op)
1799                __run_delayed_extent_op(extent_op, leaf, ei);
1800
1801        /*
1802         * If type is invalid, we should have bailed out after
1803         * lookup_inline_extent_backref().
1804         */
1805        type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1806        ASSERT(type != BTRFS_REF_TYPE_INVALID);
1807
1808        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1809                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1810                refs = btrfs_extent_data_ref_count(leaf, dref);
1811        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1812                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1813                refs = btrfs_shared_data_ref_count(leaf, sref);
1814        } else {
1815                refs = 1;
1816                BUG_ON(refs_to_mod != -1);
1817        }
1818
1819        BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1820        refs += refs_to_mod;
1821
1822        if (refs > 0) {
1823                if (type == BTRFS_EXTENT_DATA_REF_KEY)
1824                        btrfs_set_extent_data_ref_count(leaf, dref, refs);
1825                else
1826                        btrfs_set_shared_data_ref_count(leaf, sref, refs);
1827        } else {
1828                *last_ref = 1;
1829                size = btrfs_extent_inline_ref_size(type);
1830                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1831                ptr = (unsigned long)iref;
1832                end = (unsigned long)ei + item_size;
1833                if (ptr + size < end)
1834                        memmove_extent_buffer(leaf, ptr, ptr + size,
1835                                              end - ptr - size);
1836                item_size -= size;
1837                btrfs_truncate_item(fs_info, path, item_size, 1);
1838        }
1839        btrfs_mark_buffer_dirty(leaf);
1840}
1841
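/*
 * Add @refs_to_add references for an extent, preferring an inline backref.
 * If lookup reports -EAGAIN there is no room left inline; that is passed
 * back to the caller (__btrfs_inc_extent_ref), which inserts a keyed
 * backref instead.
 */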
1842static noinline_for_stack
1843int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1844                                 struct btrfs_path *path,
1845                                 u64 bytenr, u64 num_bytes, u64 parent,
1846                                 u64 root_objectid, u64 owner,
1847                                 u64 offset, int refs_to_add,
1848                                 struct btrfs_delayed_extent_op *extent_op)
1849{
1850        struct btrfs_extent_inline_ref *iref;
1851        int ret;
1852
1853        ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1854                                           num_bytes, parent, root_objectid,
1855                                           owner, offset, 1);
1856        if (ret == 0) {
1857                BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1858                update_inline_extent_backref(path, iref, refs_to_add,
1859                                             extent_op, NULL);
1860        } else if (ret == -ENOENT) {
1861                setup_inline_extent_backref(trans->fs_info, path, iref, parent,
1862                                            root_objectid, owner, offset,
1863                                            refs_to_add, extent_op);
1864                ret = 0;
1865        }
1866        return ret;
1867}
1868
1869static int insert_extent_backref(struct btrfs_trans_handle *trans,
1870                                 struct btrfs_path *path,
1871                                 u64 bytenr, u64 parent, u64 root_objectid,
1872                                 u64 owner, u64 offset, int refs_to_add)
1873{
1874        int ret;
1875        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1876                BUG_ON(refs_to_add != 1);
1877                ret = insert_tree_block_ref(trans, path, bytenr, parent,
1878                                            root_objectid);
1879        } else {
1880                ret = insert_extent_data_ref(trans, path, bytenr, parent,
1881                                             root_objectid, owner, offset,
1882                                             refs_to_add);
1883        }
1884        return ret;
1885}
1886
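/*
 * Drop @refs_to_drop references from whichever backref @path points at: an
 * inline ref when @iref is set, a keyed data ref item, or a keyed tree
 * block ref item, which is simply deleted.
 */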
1887static int remove_extent_backref(struct btrfs_trans_handle *trans,
1888                                 struct btrfs_path *path,
1889                                 struct btrfs_extent_inline_ref *iref,
1890                                 int refs_to_drop, int is_data, int *last_ref)
1891{
1892        int ret = 0;
1893
1894        BUG_ON(!is_data && refs_to_drop != 1);
1895        if (iref) {
1896                update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
1897                                             last_ref);
1898        } else if (is_data) {
1899                ret = remove_extent_data_ref(trans, path, refs_to_drop,
1900                                             last_ref);
1901        } else {
1902                *last_ref = 1;
1903                ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1904        }
1905        return ret;
1906}
1907
1908#define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
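/*
 * Issue a discard for [start, start + len) on @bdev, skipping over any
 * superblock mirror locations so a discard can never wipe a superblock
 * copy.  The number of bytes actually discarded (possibly less than @len)
 * is returned in *discarded_bytes.
 */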
1909static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1910                               u64 *discarded_bytes)
1911{
1912        int j, ret = 0;
1913        u64 bytes_left, end;
1914        u64 aligned_start = ALIGN(start, 1 << 9);
1915
1916        if (WARN_ON(start != aligned_start)) {
1917                len -= aligned_start - start;
1918                len = round_down(len, 1 << 9);
1919                start = aligned_start;
1920        }
1921
1922        *discarded_bytes = 0;
1923
1924        if (!len)
1925                return 0;
1926
1927        end = start + len;
1928        bytes_left = len;
1929
1930        /* Skip any superblocks on this device. */
1931        for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1932                u64 sb_start = btrfs_sb_offset(j);
1933                u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1934                u64 size = sb_start - start;
1935
1936                if (!in_range(sb_start, start, bytes_left) &&
1937                    !in_range(sb_end, start, bytes_left) &&
1938                    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1939                        continue;
1940
1941                /*
1942                 * Superblock spans beginning of range.  Adjust start and
1943                 * try again.
1944                 */
1945                if (sb_start <= start) {
1946                        start += sb_end - start;
1947                        if (start > end) {
1948                                bytes_left = 0;
1949                                break;
1950                        }
1951                        bytes_left = end - start;
1952                        continue;
1953                }
1954
1955                if (size) {
1956                        ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
1957                                                   GFP_NOFS, 0);
1958                        if (!ret)
1959                                *discarded_bytes += size;
1960                        else if (ret != -EOPNOTSUPP)
1961                                return ret;
1962                }
1963
1964                start = sb_end;
1965                if (start > end) {
1966                        bytes_left = 0;
1967                        break;
1968                }
1969                bytes_left = end - start;
1970        }
1971
1972        if (bytes_left) {
1973                ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
1974                                           GFP_NOFS, 0);
1975                if (!ret)
1976                        *discarded_bytes += bytes_left;
1977        }
1978        return ret;
1979}
1980
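/*
 * Discard a logical byte range: map it to the underlying stripes and issue
 * a discard on every device that has a bdev and advertises discard support.
 * -EOPNOTSUPP from a device is ignored, and *actual_bytes (if provided)
 * reports how many bytes were really trimmed.
 */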
1981int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1982                         u64 num_bytes, u64 *actual_bytes)
1983{
1984        int ret;
1985        u64 discarded_bytes = 0;
1986        struct btrfs_bio *bbio = NULL;
1987
1989        /*
1990         * Avoid races with device replace and make sure our bbio has devices
1991         * associated with its stripes that don't go away while we are discarding.
1992         */
1993        btrfs_bio_counter_inc_blocked(fs_info);
1994        /* Tell the block device(s) that the sectors can be discarded */
1995        ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
1996                              &bbio, 0);
1997        /* Error condition is -ENOMEM */
1998        if (!ret) {
1999                struct btrfs_bio_stripe *stripe = bbio->stripes;
2000                int i;
2001
2003                for (i = 0; i < bbio->num_stripes; i++, stripe++) {
2004                        u64 bytes;
2005                        struct request_queue *req_q;
2006
2007                        if (!stripe->dev->bdev) {
2008                                ASSERT(btrfs_test_opt(fs_info, DEGRADED));
2009                                continue;
2010                        }
2011                        req_q = bdev_get_queue(stripe->dev->bdev);
2012                        if (!blk_queue_discard(req_q))
2013                                continue;
2014
2015                        ret = btrfs_issue_discard(stripe->dev->bdev,
2016                                                  stripe->physical,
2017                                                  stripe->length,
2018                                                  &bytes);
2019                        if (!ret)
2020                                discarded_bytes += bytes;
2021                        else if (ret != -EOPNOTSUPP)
2022                                break; /* Logic errors or -ENOMEM; -EIO is not expected here */
2023
2024                        /*
2025                         * Just in case we get back EOPNOTSUPP for some reason,
2026                         * just ignore the return value so we don't screw up
2027                         * people calling discard_extent.
2028                         */
2029                        ret = 0;
2030                }
2031                btrfs_put_bbio(bbio);
2032        }
2033        btrfs_bio_counter_dec(fs_info);
2034
2035        if (actual_bytes)
2036                *actual_bytes = discarded_bytes;
2037
2039        if (ret == -EOPNOTSUPP)
2040                ret = 0;
2041        return ret;
2042}
2043
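/*
 * Illustrative example only, not lifted from a particular caller: when a
 * file extent becomes shared (e.g. by a clone), the new owner would add a
 * reference along the lines of
 *
 *	btrfs_inc_extent_ref(trans, root, disk_bytenr, disk_num_bytes, 0,
 *			     root->root_key.objectid, ino,
 *			     file_offset - extent_offset);
 *
 * which only queues a delayed ref; the extent tree itself is updated later
 * when the delayed refs are run.
 */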
2044/* Can return -ENOMEM */
2045int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2046                         struct btrfs_root *root,
2047                         u64 bytenr, u64 num_bytes, u64 parent,
2048                         u64 root_objectid, u64 owner, u64 offset)
2049{
2050        struct btrfs_fs_info *fs_info = root->fs_info;
2051        int old_ref_mod, new_ref_mod;
2052        int ret;
2053
2054        BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
2055               root_objectid == BTRFS_TREE_LOG_OBJECTID);
2056
2057        btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
2058                           owner, offset, BTRFS_ADD_DELAYED_REF);
2059
2060        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
2061                ret = btrfs_add_delayed_tree_ref(trans, bytenr,
2062                                                 num_bytes, parent,
2063                                                 root_objectid, (int)owner,
2064                                                 BTRFS_ADD_DELAYED_REF, NULL,
2065                                                 &old_ref_mod, &new_ref_mod);
2066        } else {
2067                ret = btrfs_add_delayed_data_ref(trans, bytenr,
2068                                                 num_bytes, parent,
2069                                                 root_objectid, owner, offset,
2070                                                 0, BTRFS_ADD_DELAYED_REF,
2071                                                 &old_ref_mod, &new_ref_mod);
2072        }
2073
2074        if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) {
2075                bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
2076
2077                add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid);
2078        }
2079
2080        return ret;
2081}
2082
2083/*
2084 * __btrfs_inc_extent_ref - insert backreference for a given extent
2085 *
2086 * @trans:          Handle of transaction
2087 *
2088 * @node:           The delayed ref node used to get the bytenr/length for
2089 *                  extent whose references are incremented.
2090 *
2091 * @parent:         If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
2092 *                  BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
2093 *                  bytenr of the parent block. Since new extents are always
2094 *                  created with indirect references, this will only be the case
2095 *                  when relocating a shared extent. In that case, root_objectid
2096 *                  will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
2097 *                  be 0
2098 *
2099 * @root_objectid:  The id of the root where this modification has originated,
2100 *                  this can be either one of the well-known metadata trees or
2101 *                  the subvolume id which references this extent.
2102 *
2103 * @owner:          For data extents it is the inode number of the owning file.
2104 *                  For metadata extents this parameter holds the level in the
2105 *                  tree of the extent.
2106 *
2107 * @offset:         For metadata extents the offset is ignored and is currently
2108 *                  always passed as 0. For data extents it is the file offset
2109 *                  this extent belongs to.
2110 *
2111 * @refs_to_add:    Number of references to add
2112 *
2113 * @extent_op:      Pointer to a structure holding information necessary when
2114 *                  updating a tree block's flags
2115 *
2116 */
2117static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2118                                  struct btrfs_delayed_ref_node *node,
2119                                  u64 parent, u64 root_objectid,
2120                                  u64 owner, u64 offset, int refs_to_add,
2121                                  struct btrfs_delayed_extent_op *extent_op)
2122{
2123        struct btrfs_path *path;
2124        struct extent_buffer *leaf;
2125        struct btrfs_extent_item *item;
2126        struct btrfs_key key;
2127        u64 bytenr = node->bytenr;
2128        u64 num_bytes = node->num_bytes;
2129        u64 refs;
2130        int ret;
2131
2132        path = btrfs_alloc_path();
2133        if (!path)
2134                return -ENOMEM;
2135
2136        path->reada = READA_FORWARD;
2137        path->leave_spinning = 1;
2138        /* this will set up the path even if it fails to insert the back ref */
2139        ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
2140                                           parent, root_objectid, owner,
2141                                           offset, refs_to_add, extent_op);
2142        if ((ret < 0 && ret != -EAGAIN) || !ret)
2143                goto out;
2144
2145        /*
2146         * Ok we had -EAGAIN which means we didn't have space to insert an
2147         * inline extent ref, so just update the reference count and add a
2148         * normal backref.
2149         */
2150        leaf = path->nodes[0];
2151        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2152        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2153        refs = btrfs_extent_refs(leaf, item);
2154        btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2155        if (extent_op)
2156                __run_delayed_extent_op(extent_op, leaf, item);
2157
2158        btrfs_mark_buffer_dirty(leaf);
2159        btrfs_release_path(path);
2160
2161        path->reada = READA_FORWARD;
2162        path->leave_spinning = 1;
2163        /* now insert the actual backref */
2164        ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
2165                                    owner, offset, refs_to_add);
2166        if (ret)
2167                btrfs_abort_transaction(trans, ret);
2168out:
2169        btrfs_free_path(path);
2170        return ret;
2171}
2172
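/*
 * Apply one delayed data ref: insert the extent item for a freshly
 * allocated extent (ADD with insert_reserved), add a backref to an existing
 * extent item, or drop a reference via __btrfs_free_extent().
 */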
2173static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2174                                struct btrfs_delayed_ref_node *node,
2175                                struct btrfs_delayed_extent_op *extent_op,
2176                                int insert_reserved)
2177{
2178        int ret = 0;
2179        struct btrfs_delayed_data_ref *ref;
2180        struct btrfs_key ins;
2181        u64 parent = 0;
2182        u64 ref_root = 0;
2183        u64 flags = 0;
2184
2185        ins.objectid = node->bytenr;
2186        ins.offset = node->num_bytes;
2187        ins.type = BTRFS_EXTENT_ITEM_KEY;
2188
2189        ref = btrfs_delayed_node_to_data_ref(node);
2190        trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
2191
2192        if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2193                parent = ref->parent;
2194        ref_root = ref->root;
2195
2196        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2197                if (extent_op)
2198                        flags |= extent_op->flags_to_set;
2199                ret = alloc_reserved_file_extent(trans, parent, ref_root,
2200                                                 flags, ref->objectid,
2201                                                 ref->offset, &ins,
2202                                                 node->ref_mod);
2203        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2204                ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2205                                             ref->objectid, ref->offset,
2206                                             node->ref_mod, extent_op);
2207        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2208                ret = __btrfs_free_extent(trans, node, parent,
2209                                          ref_root, ref->objectid,
2210                                          ref->offset, node->ref_mod,
2211                                          extent_op);
2212        } else {
2213                BUG();
2214        }
2215        return ret;
2216}
2217
2218static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2219                                    struct extent_buffer *leaf,
2220                                    struct btrfs_extent_item *ei)
2221{
2222        u64 flags = btrfs_extent_flags(leaf, ei);
2223        if (extent_op->update_flags) {
2224                flags |= extent_op->flags_to_set;
2225                btrfs_set_extent_flags(leaf, ei, flags);
2226        }
2227
2228        if (extent_op->update_key) {
2229                struct btrfs_tree_block_info *bi;
2230                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2231                bi = (struct btrfs_tree_block_info *)(ei + 1);
2232                btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2233        }
2234}
2235
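/*
 * Apply a pending extent op (flag and/or key update) to the on-disk extent
 * item of a ref head.  For metadata this first tries the skinny
 * METADATA_ITEM key and falls back to the old fat EXTENT_ITEM key if the
 * filesystem still carries one for this block.
 */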
2236static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2237                                 struct btrfs_delayed_ref_head *head,
2238                                 struct btrfs_delayed_extent_op *extent_op)
2239{
2240        struct btrfs_fs_info *fs_info = trans->fs_info;
2241        struct btrfs_key key;
2242        struct btrfs_path *path;
2243        struct btrfs_extent_item *ei;
2244        struct extent_buffer *leaf;
2245        u32 item_size;
2246        int ret;
2247        int err = 0;
2248        int metadata = !extent_op->is_data;
2249
2250        if (trans->aborted)
2251                return 0;
2252
2253        if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2254                metadata = 0;
2255
2256        path = btrfs_alloc_path();
2257        if (!path)
2258                return -ENOMEM;
2259
2260        key.objectid = head->bytenr;
2261
2262        if (metadata) {
2263                key.type = BTRFS_METADATA_ITEM_KEY;
2264                key.offset = extent_op->level;
2265        } else {
2266                key.type = BTRFS_EXTENT_ITEM_KEY;
2267                key.offset = head->num_bytes;
2268        }
2269
2270again:
2271        path->reada = READA_FORWARD;
2272        path->leave_spinning = 1;
2273        ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
2274        if (ret < 0) {
2275                err = ret;
2276                goto out;
2277        }
2278        if (ret > 0) {
2279                if (metadata) {
2280                        if (path->slots[0] > 0) {
2281                                path->slots[0]--;
2282                                btrfs_item_key_to_cpu(path->nodes[0], &key,
2283                                                      path->slots[0]);
2284                                if (key.objectid == head->bytenr &&
2285                                    key.type == BTRFS_EXTENT_ITEM_KEY &&
2286                                    key.offset == head->num_bytes)
2287                                        ret = 0;
2288                        }
2289                        if (ret > 0) {
2290                                btrfs_release_path(path);
2291                                metadata = 0;
2292
2293                                key.objectid = head->bytenr;
2294                                key.offset = head->num_bytes;
2295                                key.type = BTRFS_EXTENT_ITEM_KEY;
2296                                goto again;
2297                        }
2298                } else {
2299                        err = -EIO;
2300                        goto out;
2301                }
2302        }
2303
2304        leaf = path->nodes[0];
2305        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2306
2307        if (unlikely(item_size < sizeof(*ei))) {
2308                err = -EINVAL;
2309                btrfs_print_v0_err(fs_info);
2310                btrfs_abort_transaction(trans, err);
2311                goto out;
2312        }
2313
2314        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2315        __run_delayed_extent_op(extent_op, leaf, ei);
2316
2317        btrfs_mark_buffer_dirty(leaf);
2318out:
2319        btrfs_free_path(path);
2320        return err;
2321}
2322
2323static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2324                                struct btrfs_delayed_ref_node *node,
2325                                struct btrfs_delayed_extent_op *extent_op,
2326                                int insert_reserved)
2327{
2328        int ret = 0;
2329        struct btrfs_delayed_tree_ref *ref;
2330        u64 parent = 0;
2331        u64 ref_root = 0;
2332
2333        ref = btrfs_delayed_node_to_tree_ref(node);
2334        trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
2335
2336        if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2337                parent = ref->parent;
2338        ref_root = ref->root;
2339
2340        if (node->ref_mod != 1) {
2341                btrfs_err(trans->fs_info,
2342        "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
2343                          node->bytenr, node->ref_mod, node->action, ref_root,
2344                          parent);
2345                return -EIO;
2346        }
2347        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2348                BUG_ON(!extent_op || !extent_op->update_flags);
2349                ret = alloc_reserved_tree_block(trans, node, extent_op);
2350        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2351                ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
2352                                             ref->level, 0, 1, extent_op);
2353        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2354                ret = __btrfs_free_extent(trans, node, parent, ref_root,
2355                                          ref->level, 0, 1, extent_op);
2356        } else {
2357                BUG();
2358        }
2359        return ret;
2360}
2361
2362/* helper function to actually process a single delayed ref entry */
2363static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2364                               struct btrfs_delayed_ref_node *node,
2365                               struct btrfs_delayed_extent_op *extent_op,
2366                               int insert_reserved)
2367{
2368        int ret = 0;
2369
2370        if (trans->aborted) {
2371                if (insert_reserved)
2372                        btrfs_pin_extent(trans->fs_info, node->bytenr,
2373                                         node->num_bytes, 1);
2374                return 0;
2375        }
2376
2377        if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2378            node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2379                ret = run_delayed_tree_ref(trans, node, extent_op,
2380                                           insert_reserved);
2381        else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2382                 node->type == BTRFS_SHARED_DATA_REF_KEY)
2383                ret = run_delayed_data_ref(trans, node, extent_op,
2384                                           insert_reserved);
2385        else
2386                BUG();
2387        if (ret && insert_reserved)
2388                btrfs_pin_extent(trans->fs_info, node->bytenr,
2389                                 node->num_bytes, 1);
2390        return ret;
2391}
2392
2393static inline struct btrfs_delayed_ref_node *
2394select_delayed_ref(struct btrfs_delayed_ref_head *head)
2395{
2396        struct btrfs_delayed_ref_node *ref;
2397
2398        if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
2399                return NULL;
2400
2401        /*
2402         * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2403         * This is to prevent a ref count from going down to zero, which deletes
2404         * the extent item from the extent tree, when there still are references
2405         * to add, which would fail because they would not find the extent item.
2406         */
2407        if (!list_empty(&head->ref_add_list))
2408                return list_first_entry(&head->ref_add_list,
2409                                struct btrfs_delayed_ref_node, add_list);
2410
2411        ref = rb_entry(rb_first_cached(&head->ref_tree),
2412                       struct btrfs_delayed_ref_node, ref_node);
2413        ASSERT(list_empty(&ref->add_list));
2414        return ref;
2415}
2416
2417static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
2418                                      struct btrfs_delayed_ref_head *head)
2419{
2420        spin_lock(&delayed_refs->lock);
2421        head->processing = 0;
2422        delayed_refs->num_heads_ready++;
2423        spin_unlock(&delayed_refs->lock);
2424        btrfs_delayed_ref_unlock(head);
2425}
2426
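/*
 * Detach the head's pending extent op for the caller to run.  If the head
 * still has must_insert_reserved set the op is freed and NULL is returned
 * instead, as there is no inserted extent item for it to update.
 */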
2427static struct btrfs_delayed_extent_op *cleanup_extent_op(
2428                                struct btrfs_delayed_ref_head *head)
2429{
2430        struct btrfs_delayed_extent_op *extent_op = head->extent_op;
2431
2432        if (!extent_op)
2433                return NULL;
2434
2435        if (head->must_insert_reserved) {
2436                head->extent_op = NULL;
2437                btrfs_free_delayed_extent_op(extent_op);
2438                return NULL;
2439        }
2440        return extent_op;
2441}
2442
2443static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
2444                                     struct btrfs_delayed_ref_head *head)
2445{
2446        struct btrfs_delayed_extent_op *extent_op;
2447        int ret;
2448
2449        extent_op = cleanup_extent_op(head);
2450        if (!extent_op)
2451                return 0;
2452        head->extent_op = NULL;
2453        spin_unlock(&head->lock);
2454        ret = run_delayed_extent_op(trans, head, extent_op);
2455        btrfs_free_delayed_extent_op(extent_op);
2456        return ret ? ret : 1;
2457}
2458
2459void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
2460                                  struct btrfs_delayed_ref_root *delayed_refs,
2461                                  struct btrfs_delayed_ref_head *head)
2462{
2463        int nr_items = 1;       /* Dropping this ref head update. */
2464
2465        if (head->total_ref_mod < 0) {
2466                struct btrfs_space_info *space_info;
2467                u64 flags;
2468
2469                if (head->is_data)
2470                        flags = BTRFS_BLOCK_GROUP_DATA;
2471                else if (head->is_system)
2472                        flags = BTRFS_BLOCK_GROUP_SYSTEM;
2473                else
2474                        flags = BTRFS_BLOCK_GROUP_METADATA;
2475                space_info = __find_space_info(fs_info, flags);
2476                ASSERT(space_info);
2477                percpu_counter_add_batch(&space_info->total_bytes_pinned,
2478                                   -head->num_bytes,
2479                                   BTRFS_TOTAL_BYTES_PINNED_BATCH);
2480
2481                /*
2482                 * We had csum deletions accounted for in our delayed refs rsv,
2483                 * so we need to drop the csum leaves for this update from our
2484                 * delayed_refs_rsv.
2485                 */
2486                if (head->is_data) {
2487                        spin_lock(&delayed_refs->lock);
2488                        delayed_refs->pending_csums -= head->num_bytes;
2489                        spin_unlock(&delayed_refs->lock);
2490                        nr_items += btrfs_csum_bytes_to_leaves(fs_info,
2491                                head->num_bytes);
2492                }
2493        }
2494
2495        /* Also free its reserved qgroup space */
2496        btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
2497                                      head->qgroup_reserved);
2498        btrfs_delayed_refs_rsv_release(fs_info, nr_items);
2499}
2500
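/*
 * Final processing of a ref head once all of its refs have been run: apply
 * any remaining extent op, re-check under the delayed ref lock that no new
 * refs were added, unlink the head, pin the extent if its reserved space
 * was never used, and drop the head's space and qgroup accounting.
 */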
2501static int cleanup_ref_head(struct btrfs_trans_handle *trans,
2502                            struct btrfs_delayed_ref_head *head)
2503{
2505        struct btrfs_fs_info *fs_info = trans->fs_info;
2506        struct btrfs_delayed_ref_root *delayed_refs;
2507        int ret;
2508
2509        delayed_refs = &trans->transaction->delayed_refs;
2510
2511        ret = run_and_cleanup_extent_op(trans, head);
2512        if (ret < 0) {
2513                unselect_delayed_ref_head(delayed_refs, head);
2514                btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2515                return ret;
2516        } else if (ret) {
2517                return ret;
2518        }
2519
2520        /*
2521         * Need to drop our head ref lock and re-acquire the delayed ref lock
2522         * and then re-check to make sure nobody got added.
2523         */
2524        spin_unlock(&head->lock);
2525        spin_lock(&delayed_refs->lock);
2526        spin_lock(&head->lock);
2527        if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
2528                spin_unlock(&head->lock);
2529                spin_unlock(&delayed_refs->lock);
2530                return 1;
2531        }
2532        btrfs_delete_ref_head(delayed_refs, head);
2533        spin_unlock(&head->lock);
2534        spin_unlock(&delayed_refs->lock);
2535
2536        if (head->must_insert_reserved) {
2537                btrfs_pin_extent(fs_info, head->bytenr,
2538                                 head->num_bytes, 1);
2539                if (head->is_data) {
2540                        ret = btrfs_del_csums(trans, fs_info, head->bytenr,
2541                                              head->num_bytes);
2542                }
2543        }
2544
2545        btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
2546
2547        trace_run_delayed_ref_head(fs_info, head, 0);
2548        btrfs_delayed_ref_unlock(head);
2549        btrfs_put_delayed_ref_head(head);
2550        return 0;
2551}
2552
2553static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
2554                                        struct btrfs_trans_handle *trans)
2555{
2556        struct btrfs_delayed_ref_root *delayed_refs =
2557                &trans->transaction->delayed_refs;
2558        struct btrfs_delayed_ref_head *head = NULL;
2559        int ret;
2560
2561        spin_lock(&delayed_refs->lock);
2562        head = btrfs_select_ref_head(delayed_refs);
2563        if (!head) {
2564                spin_unlock(&delayed_refs->lock);
2565                return head;
2566        }
2567
2568        /*
2569         * Grab the lock that says we are going to process all the refs for
2570         * this head
2571         */
2572        ret = btrfs_delayed_ref_lock(delayed_refs, head);
2573        spin_unlock(&delayed_refs->lock);
2574
2575        /*
2576         * We may have dropped the spin lock to get the head mutex lock, and
2577         * that might have given someone else time to free the head.  If that's
2578         * true, it has been removed from our list and we can move on.
2579         */
2580        if (ret == -EAGAIN)
2581                head = ERR_PTR(-EAGAIN);
2582
2583        return head;
2584}
2585
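/*
 * Run every ref queued on an already locked head, re-merging add/drop pairs
 * as it goes.  Returns -EAGAIN (after unlocking the head) when a ref is
 * still blocked by a pending tree mod log sequence, 0 once the head has no
 * more refs to run.
 */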
2586static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
2587                                    struct btrfs_delayed_ref_head *locked_ref,
2588                                    unsigned long *run_refs)
2589{
2590        struct btrfs_fs_info *fs_info = trans->fs_info;
2591        struct btrfs_delayed_ref_root *delayed_refs;
2592        struct btrfs_delayed_extent_op *extent_op;
2593        struct btrfs_delayed_ref_node *ref;
2594        int must_insert_reserved = 0;
2595        int ret;
2596
2597        delayed_refs = &trans->transaction->delayed_refs;
2598
2599        lockdep_assert_held(&locked_ref->mutex);
2600        lockdep_assert_held(&locked_ref->lock);
2601
2602        while ((ref = select_delayed_ref(locked_ref))) {
2603                if (ref->seq &&
2604                    btrfs_check_delayed_seq(fs_info, ref->seq)) {
2605                        spin_unlock(&locked_ref->lock);
2606                        unselect_delayed_ref_head(delayed_refs, locked_ref);
2607                        return -EAGAIN;
2608                }
2609
2610                (*run_refs)++;
2611                ref->in_tree = 0;
2612                rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
2613                RB_CLEAR_NODE(&ref->ref_node);
2614                if (!list_empty(&ref->add_list))
2615                        list_del(&ref->add_list);
2616                /*
2617                 * When we play the delayed ref, also correct the ref_mod on
2618                 * head
2619                 */
2620                switch (ref->action) {
2621                case BTRFS_ADD_DELAYED_REF:
2622                case BTRFS_ADD_DELAYED_EXTENT:
2623                        locked_ref->ref_mod -= ref->ref_mod;
2624                        break;
2625                case BTRFS_DROP_DELAYED_REF:
2626                        locked_ref->ref_mod += ref->ref_mod;
2627                        break;
2628                default:
2629                        WARN_ON(1);
2630                }
2631                atomic_dec(&delayed_refs->num_entries);
2632
2633                /*
2634                 * Record the must_insert_reserved flag before we drop the
2635                 * spin lock.
2636                 */
2637                must_insert_reserved = locked_ref->must_insert_reserved;
2638                locked_ref->must_insert_reserved = 0;
2639
2640                extent_op = locked_ref->extent_op;
2641                locked_ref->extent_op = NULL;
2642                spin_unlock(&locked_ref->lock);
2643
2644                ret = run_one_delayed_ref(trans, ref, extent_op,
2645                                          must_insert_reserved);
2646
2647                btrfs_free_delayed_extent_op(extent_op);
2648                if (ret) {
2649                        unselect_delayed_ref_head(delayed_refs, locked_ref);
2650                        btrfs_put_delayed_ref(ref);
2651                        btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
2652                                    ret);
2653                        return ret;
2654                }
2655
2656                btrfs_put_delayed_ref(ref);
2657                cond_resched();
2658
2659                spin_lock(&locked_ref->lock);
2660                btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2661        }
2662
2663        return 0;
2664}
2665
2666/*
2667 * Returns 0 on success or if called with an already aborted transaction.
2668 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2669 */
2670static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2671                                             unsigned long nr)
2672{
2673        struct btrfs_fs_info *fs_info = trans->fs_info;
2674        struct btrfs_delayed_ref_root *delayed_refs;
2675        struct btrfs_delayed_ref_head *locked_ref = NULL;
2676        ktime_t start = ktime_get();
2677        int ret;
2678        unsigned long count = 0;
2679        unsigned long actual_count = 0;
2680
2681        delayed_refs = &trans->transaction->delayed_refs;
2682        do {
2683                if (!locked_ref) {
2684                        locked_ref = btrfs_obtain_ref_head(trans);
2685                        if (IS_ERR_OR_NULL(locked_ref)) {
2686                                if (PTR_ERR(locked_ref) == -EAGAIN) {
2687                                        continue;
2688                                } else {
2689                                        break;
2690                                }
2691                        }
2692                        count++;
2693                }
2694                /*
2695                 * We need to try and merge add/drops of the same ref since we
2696                 * can run into issues with relocate dropping the implicit ref
2697                 * and then it being added back again before the drop can
2698                 * finish.  If we merged anything we need to re-loop so we can
2699                 * get a good ref.
2700                 * Or we can get node references of the same type that weren't
2701                 * merged when created due to bumps in the tree mod seq, and
2702                 * we need to merge them to prevent adding an inline extent
2703                 * backref before dropping it (triggering a BUG_ON at
2704                 * insert_inline_extent_backref()).
2705                 */
2706                spin_lock(&locked_ref->lock);
2707                btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2708
2709                ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
2710                                                      &actual_count);
2711                if (ret < 0 && ret != -EAGAIN) {
2712                        /*
2713                         * Error, btrfs_run_delayed_refs_for_head already
2714                         * unlocked everything so just bail out
2715                         */
2716                        return ret;
2717                } else if (!ret) {
2718                        /*
2719                         * Success, perform the usual cleanup of a processed
2720                         * head
2721                         */
2722                        ret = cleanup_ref_head(trans, locked_ref);
2723                        if (ret > 0) {
2724                                /* We dropped our lock, we need to loop. */
2725                                ret = 0;
2726                                continue;
2727                        } else if (ret) {
2728                                return ret;
2729                        }
2730                }
2731
2732                /*
2733                 * Either success case or btrfs_run_delayed_refs_for_head
2734                 * returned -EAGAIN, meaning we need to select another head
2735                 */
2736
2737                locked_ref = NULL;
2738                cond_resched();
2739        } while ((nr != -1 && count < nr) || locked_ref);
2740
2741        /*
2742         * We don't want to include ref heads since we can have empty ref heads
2743         * and those will drastically skew our runtime down since we just do
2744         * accounting, no actual extent tree updates.
2745         */
2746        if (actual_count > 0) {
2747                u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2748                u64 avg;
2749
2750                /*
2751                 * We weigh the current average higher than our current runtime
2752                 * to avoid large swings in the average.
2753                 */
2754                spin_lock(&delayed_refs->lock);
2755                avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2756                fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2757                spin_unlock(&delayed_refs->lock);
2758        }
2759        return 0;
2760}
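
/*
 * Worked example of the averaging above (illustrative numbers only): the
 * update is a simple exponentially weighted moving average,
 * new = (3 * old + runtime) / 4, so each batch only moves the stored value a
 * quarter of the way toward the latest sample.  With a stored average of
 * 100,000 ns and a batch that took 200,000 ns, the new average is
 * (3 * 100000 + 200000) / 4 = 125,000 ns.
 */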
2761
2762#ifdef SCRAMBLE_DELAYED_REFS
2763/*
2764 * Normally delayed refs get processed in ascending bytenr order. This
2765 * correlates in most cases to the order added. To expose dependencies on this
2766 * order, we start to process the tree in the middle instead of the beginning
2767 * order, we start to process the tree in the middle instead of the beginning.
2768static u64 find_middle(struct rb_root *root)
2769{
2770        struct rb_node *n = root->rb_node;
2771        struct btrfs_delayed_ref_node *entry;
2772        int alt = 1;
2773        u64 middle;
2774        u64 first = 0, last = 0;
2775
2776        n = rb_first(root);
2777        if (n) {
2778                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2779                first = entry->bytenr;
2780        }
2781        n = rb_last(root);
2782        if (n) {
2783                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2784                last = entry->bytenr;
2785        }
2786        n = root->rb_node;
2787
2788        while (n) {
2789                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2790                WARN_ON(!entry->in_tree);
2791
2792                middle = entry->bytenr;
2793
2794                if (alt)
2795                        n = n->rb_left;
2796                else
2797                        n = n->rb_right;
2798
2799                alt = 1 - alt;
2800        }
2801        return middle;
2802}
2803#endif
2804
2805static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
2806{
2807        u64 num_bytes;
2808
2809        num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2810                             sizeof(struct btrfs_extent_inline_ref));
2811        if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2812                num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2813
2814        /*
2815         * We never fill leaves all the way, so treat the result as a rough
2816         * estimate of the number of leaves these extent items will end up using.
2817         */
2818        return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
2819}
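
/*
 * Rough sizing sketch for the helper above (approximate numbers, assuming
 * skinny metadata and roughly 16KiB of usable space per leaf): each head is
 * charged sizeof(struct btrfs_extent_item) plus
 * sizeof(struct btrfs_extent_inline_ref), on the order of a few tens of
 * bytes, so several hundred heads fit in a single leaf and the returned leaf
 * count stays small even for large batches of delayed refs.
 */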
2820
2821/*
2822 * Takes the number of bytes to be checksummed and figures out how many leaves
2823 * would be required to store the csums for that many bytes.
2824 */
2825u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
2826{
2827        u64 csum_size;
2828        u64 num_csums_per_leaf;
2829        u64 num_csums;
2830
2831        csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
2832        num_csums_per_leaf = div64_u64(csum_size,
2833                        (u64)btrfs_super_csum_size(fs_info->super_copy));
2834        num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
2835        num_csums += num_csums_per_leaf - 1;
2836        num_csums = div64_u64(num_csums, num_csums_per_leaf);
2837        return num_csums;
2838}
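
/*
 * Worked example for btrfs_csum_bytes_to_leaves() (illustrative, assuming
 * 4KiB sectors, 4 byte crc32c checksums and roughly 16KiB of item space per
 * leaf): one csum is needed per sector, so 1GiB of data needs
 * 1GiB / 4KiB = 262144 csums; with roughly 16KiB / 4 = ~4000 csums per leaf
 * that rounds up to roughly 65 leaves.
 */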
2839
2840bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
2841{
2842        struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
2843        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2844        bool ret = false;
2845        u64 reserved;
2846
2847        spin_lock(&global_rsv->lock);
2848        reserved = global_rsv->reserved;
2849        spin_unlock(&global_rsv->lock);
2850
2851        /*
2852         * Since the global reserve is just kind of magic we don't really want
2853         * to rely on it to save our bacon, so if our size is more than the
2854         * delayed_refs_rsv and the global rsv then it's time to think about
2855         * bailing.
2856         */
2857        spin_lock(&delayed_refs_rsv->lock);
2858        reserved += delayed_refs_rsv->reserved;
2859        if (delayed_refs_rsv->size >= reserved)
2860                ret = true;
2861        spin_unlock(&delayed_refs_rsv->lock);
2862        return ret;
2863}
2864
2865int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
2866{
2867        u64 num_entries =
2868                atomic_read(&trans->transaction->delayed_refs.num_entries);
2869        u64 avg_runtime;
2870        u64 val;
2871
2872        smp_mb();
2873        avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
2874        val = num_entries * avg_runtime;
2875        if (val >= NSEC_PER_SEC)
2876                return 1;
2877        if (val >= NSEC_PER_SEC / 2)
2878                return 2;
2879
2880        return btrfs_check_space_for_delayed_refs(trans->fs_info);
2881}
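
/*
 * Illustration of the thresholds above (hypothetical average): the estimated
 * cost of the queued refs is num_entries * avg_delayed_ref_runtime.  With an
 * average of 100,000 ns per ref, 10,000 queued entries estimate to a full
 * second of work and the function returns 1, while 5,000 entries estimate to
 * half a second and it returns 2; anything cheaper falls through to the
 * reservation check in btrfs_check_space_for_delayed_refs().
 */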
2882
2883struct async_delayed_refs {
2884        struct btrfs_root *root;
2885        u64 transid;
2886        int count;
2887        int error;
2888        int sync;
2889        struct completion wait;
2890        struct btrfs_work work;
2891};
2892
2893static inline struct async_delayed_refs *
2894to_async_delayed_refs(struct btrfs_work *work)
2895{
2896        return container_of(work, struct async_delayed_refs, work);
2897}
2898
2899static void delayed_ref_async_start(struct btrfs_work *work)
2900{
2901        struct async_delayed_refs *async = to_async_delayed_refs(work);
2902        struct btrfs_trans_handle *trans;
2903        struct btrfs_fs_info *fs_info = async->root->fs_info;
2904        int ret;
2905
2906        /* if the commit is already started, we don't need to wait here */
2907        if (btrfs_transaction_blocked(fs_info))
2908                goto done;
2909
2910        trans = btrfs_join_transaction(async->root);
2911        if (IS_ERR(trans)) {
2912                async->error = PTR_ERR(trans);
2913                goto done;
2914        }
2915
2916        /*
2917         * trans->sync means that when we call end_transaction, we won't
2918         * wait on delayed refs
2919         */
2920        trans->sync = true;
2921
2922        /* Don't bother flushing if we got into a different transaction */
2923        if (trans->transid > async->transid)
2924                goto end;
2925
2926        ret = btrfs_run_delayed_refs(trans, async->count);
2927        if (ret)
2928                async->error = ret;
2929end:
2930        ret = btrfs_end_transaction(trans);
2931        if (ret && !async->error)
2932                async->error = ret;
2933done:
2934        if (async->sync)
2935                complete(&async->wait);
2936        else
2937                kfree(async);
2938}
2939
2940int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
2941                                 unsigned long count, u64 transid, int wait)
2942{
2943        struct async_delayed_refs *async;
2944        int ret;
2945
2946        async = kmalloc(sizeof(*async), GFP_NOFS);
2947        if (!async)
2948                return -ENOMEM;
2949
2950        async->root = fs_info->tree_root;
2951        async->count = count;
2952        async->error = 0;
2953        async->transid = transid;
2954        if (wait)
2955                async->sync = 1;
2956        else
2957                async->sync = 0;
2958        init_completion(&async->wait);
2959
2960        btrfs_init_work(&async->work, btrfs_extent_refs_helper,
2961                        delayed_ref_async_start, NULL, NULL);
2962
2963        btrfs_queue_work(fs_info->extent_workers, &async->work);
2964
2965        if (wait) {
2966                wait_for_completion(&async->wait);
2967                ret = async->error;
2968                kfree(async);
2969                return ret;
2970        }
2971        return 0;
2972}
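
/*
 * A minimal usage sketch for the async helper above (hypothetical caller,
 * values for illustration only):
 *
 *	ret = btrfs_async_run_delayed_refs(fs_info, 32, trans->transid, 1);
 *
 * With wait != 0 the caller sleeps on the completion and gets the worker's
 * error code back; with wait == 0 the request is fire and forget and the
 * async struct is freed by delayed_ref_async_start() itself.
 */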
2973
2974/*
2975 * this starts processing the delayed reference count updates and
2976 * extent insertions we have queued up so far.  count can be
2977 * 0, which means to process everything in the tree at the start
2978 * of the run (but not newly added entries), or it can be some target
2979 * number you'd like to process.
2980 *
2981 * Returns 0 on success or if called with an aborted transaction
2982 * Returns <0 on error and aborts the transaction
2983 */
2984int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2985                           unsigned long count)
2986{
2987        struct btrfs_fs_info *fs_info = trans->fs_info;
2988        struct rb_node *node;
2989        struct btrfs_delayed_ref_root *delayed_refs;
2990        struct btrfs_delayed_ref_head *head;
2991        int ret;
2992        int run_all = count == (unsigned long)-1;
2993
2994        /* We'll clean this up in btrfs_cleanup_transaction */
2995        if (trans->aborted)
2996                return 0;
2997
2998        if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2999                return 0;
3000
3001        delayed_refs = &trans->transaction->delayed_refs;
3002        if (count == 0)
3003                count = atomic_read(&delayed_refs->num_entries) * 2;
3004
3005again:
3006#ifdef SCRAMBLE_DELAYED_REFS
3007        delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
3008#endif
3009        ret = __btrfs_run_delayed_refs(trans, count);
3010        if (ret < 0) {
3011                btrfs_abort_transaction(trans, ret);
3012                return ret;
3013        }
3014
3015        if (run_all) {
3016                if (!list_empty(&trans->new_bgs))
3017                        btrfs_create_pending_block_groups(trans);
3018
3019                spin_lock(&delayed_refs->lock);
3020                node = rb_first_cached(&delayed_refs->href_root);
3021                if (!node) {
3022                        spin_unlock(&delayed_refs->lock);
3023                        goto out;
3024                }
3025                head = rb_entry(node, struct btrfs_delayed_ref_head,
3026                                href_node);
3027                refcount_inc(&head->refs);
3028                spin_unlock(&delayed_refs->lock);
3029
3030                /* Mutex was contended, block until it's released and retry. */
3031                mutex_lock(&head->mutex);
3032                mutex_unlock(&head->mutex);
3033
3034                btrfs_put_delayed_ref_head(head);
3035                cond_resched();
3036                goto again;
3037        }
3038out:
3039        return 0;
3040}
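
/*
 * A minimal usage sketch (hypothetical caller): run a bounded batch of
 * delayed refs from a transaction context and treat any error as fatal,
 * since the helper has already aborted the transaction by then:
 *
 *	ret = btrfs_run_delayed_refs(trans, 32);
 *	if (ret)
 *		return ret;
 *
 * Passing 0 processes everything queued when the call starts, and
 * (unsigned long)-1 keeps looping until no ref heads remain.
 */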
3041
3042int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
3043                                struct btrfs_fs_info *fs_info,
3044                                u64 bytenr, u64 num_bytes, u64 flags,
3045                                int level, int is_data)
3046{
3047        struct btrfs_delayed_extent_op *extent_op;
3048        int ret;
3049
3050        extent_op = btrfs_alloc_delayed_extent_op();
3051        if (!extent_op)
3052                return -ENOMEM;
3053
3054        extent_op->flags_to_set = flags;
3055        extent_op->update_flags = true;
3056        extent_op->update_key = false;
3057        extent_op->is_data = is_data ? true : false;
3058        extent_op->level = level;
3059
3060        ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
3061                                          num_bytes, extent_op);
3062        if (ret)
3063                btrfs_free_delayed_extent_op(extent_op);
3064        return ret;
3065}
3066
3067static noinline int check_delayed_ref(struct btrfs_root *root,
3068                                      struct btrfs_path *path,
3069                                      u64 objectid, u64 offset, u64 bytenr)
3070{
3071        struct btrfs_delayed_ref_head *head;
3072        struct btrfs_delayed_ref_node *ref;
3073        struct btrfs_delayed_data_ref *data_ref;
3074        struct btrfs_delayed_ref_root *delayed_refs;
3075        struct btrfs_transaction *cur_trans;
3076        struct rb_node *node;
3077        int ret = 0;
3078
3079        spin_lock(&root->fs_info->trans_lock);
3080        cur_trans = root->fs_info->running_transaction;
3081        if (cur_trans)
3082                refcount_inc(&cur_trans->use_count);
3083        spin_unlock(&root->fs_info->trans_lock);
3084        if (!cur_trans)
3085                return 0;
3086
3087        delayed_refs = &cur_trans->delayed_refs;
3088        spin_lock(&delayed_refs->lock);
3089        head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3090        if (!head) {
3091                spin_unlock(&delayed_refs->lock);
3092                btrfs_put_transaction(cur_trans);
3093                return 0;
3094        }
3095
3096        if (!mutex_trylock(&head->mutex)) {
3097                refcount_inc(&head->refs);
3098                spin_unlock(&delayed_refs->lock);
3099
3100                btrfs_release_path(path);
3101
3102                /*
3103                 * Mutex was contended, block until it's released and let
3104                 * caller try again
3105                 */
3106                mutex_lock(&head->mutex);
3107                mutex_unlock(&head->mutex);
3108                btrfs_put_delayed_ref_head(head);
3109                btrfs_put_transaction(cur_trans);
3110                return -EAGAIN;
3111        }
3112        spin_unlock(&delayed_refs->lock);
3113
3114        spin_lock(&head->lock);
3115        /*
3116         * XXX: We should replace this with a proper search function in the
3117         * future.
3118         */
3119        for (node = rb_first_cached(&head->ref_tree); node;
3120             node = rb_next(node)) {
3121                ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
3122                /* If it's a shared ref we know a cross reference exists */
3123                if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
3124                        ret = 1;
3125                        break;
3126                }
3127
3128                data_ref = btrfs_delayed_node_to_data_ref(ref);
3129
3130                /*
3131                 * If our ref doesn't match the one we're currently looking at
3132                 * then we have a cross reference.
3133                 */
3134                if (data_ref->root != root->root_key.objectid ||
3135                    data_ref->objectid != objectid ||
3136                    data_ref->offset != offset) {
3137                        ret = 1;
3138                        break;
3139                }
3140        }
3141        spin_unlock(&head->lock);
3142        mutex_unlock(&head->mutex);
3143        btrfs_put_transaction(cur_trans);
3144        return ret;
3145}
3146
3147static noinline int check_committed_ref(struct btrfs_root *root,
3148                                        struct btrfs_path *path,
3149                                        u64 objectid, u64 offset, u64 bytenr)
3150{
3151        struct btrfs_fs_info *fs_info = root->fs_info;
3152        struct btrfs_root *extent_root = fs_info->extent_root;
3153        struct extent_buffer *leaf;
3154        struct btrfs_extent_data_ref *ref;
3155        struct btrfs_extent_inline_ref *iref;
3156        struct btrfs_extent_item *ei;
3157        struct btrfs_key key;
3158        u32 item_size;
3159        int type;
3160        int ret;
3161
3162        key.objectid = bytenr;
3163        key.offset = (u64)-1;
3164        key.type = BTRFS_EXTENT_ITEM_KEY;
3165
3166        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
3167        if (ret < 0)
3168                goto out;
3169        BUG_ON(ret == 0); /* Corruption */
3170
3171        ret = -ENOENT;
3172        if (path->slots[0] == 0)
3173                goto out;
3174
3175        path->slots[0]--;
3176        leaf = path->nodes[0];
3177        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3178
3179        if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
3180                goto out;
3181
3182        ret = 1;
3183        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
3184        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
3185
3186        if (item_size != sizeof(*ei) +
3187            btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
3188                goto out;
3189
3190        if (btrfs_extent_generation(leaf, ei) <=
3191            btrfs_root_last_snapshot(&root->root_item))
3192                goto out;
3193
3194        iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3195
3196        type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
3197        if (type != BTRFS_EXTENT_DATA_REF_KEY)
3198                goto out;
3199
3200        ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3201        if (btrfs_extent_refs(leaf, ei) !=
3202            btrfs_extent_data_ref_count(leaf, ref) ||
3203            btrfs_extent_data_ref_root(leaf, ref) !=
3204            root->root_key.objectid ||
3205            btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3206            btrfs_extent_data_ref_offset(leaf, ref) != offset)
3207                goto out;
3208
3209        ret = 0;
3210out:
3211        return ret;
3212}
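
/*
 * Rough sketch of the only on-disk layout the size check in
 * check_committed_ref() accepts, i.e. an extent item carrying exactly one
 * inline EXTENT_DATA_REF and nothing else:
 *
 *	struct btrfs_extent_item        refs / generation / flags
 *	u8 inline ref type              BTRFS_EXTENT_DATA_REF_KEY
 *	struct btrfs_extent_data_ref    root / objectid / offset / count
 *
 * An item of any other size, or any mismatch in the fields above, makes the
 * function conservatively report a possible cross reference (ret == 1).
 */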
3213
3214int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
3215                          u64 bytenr)
3216{
3217        struct btrfs_path *path;
3218        int ret;
3219
3220        path = btrfs_alloc_path();
3221        if (!path)
3222                return -ENOMEM;
3223
3224        do {
3225                ret = check_committed_ref(root, path, objectid,
3226                                          offset, bytenr);
3227                if (ret && ret != -ENOENT)
3228                        goto out;
3229
3230                ret = check_delayed_ref(root, path, objectid, offset, bytenr);
3231        } while (ret == -EAGAIN);
3232
3233out:
3234        btrfs_free_path(path);
3235        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3236                WARN_ON(ret > 0);
3237        return ret;
3238}
3239
3240static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3241                           struct btrfs_root *root,
3242                           struct extent_buffer *buf,
3243                           int full_backref, int inc)
3244{
3245        struct btrfs_fs_info *fs_info = root->fs_info;
3246        u64 bytenr;
3247        u64 num_bytes;
3248        u64 parent;
3249        u64 ref_root;
3250        u32 nritems;
3251        struct btrfs_key key;
3252        struct btrfs_file_extent_item *fi;
3253        int i;
3254        int level;
3255        int ret = 0;
3256        int (*process_func)(struct btrfs_trans_handle *,
3257                            struct btrfs_root *,
3258                            u64, u64, u64, u64, u64, u64);
3259
3260
3261        if (btrfs_is_testing(fs_info))
3262                return 0;
3263
3264        ref_root = btrfs_header_owner(buf);
3265        nritems = btrfs_header_nritems(buf);
3266        level = btrfs_header_level(buf);
3267
3268        if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3269                return 0;
3270
3271        if (inc)
3272                process_func = btrfs_inc_extent_ref;
3273        else
3274                process_func = btrfs_free_extent;
3275
3276        if (full_backref)
3277                parent = buf->start;
3278        else
3279                parent = 0;
3280
3281        for (i = 0; i < nritems; i++) {
3282                if (level == 0) {
3283                        btrfs_item_key_to_cpu(buf, &key, i);
3284                        if (key.type != BTRFS_EXTENT_DATA_KEY)
3285                                continue;
3286                        fi = btrfs_item_ptr(buf, i,
3287                                            struct btrfs_file_extent_item);
3288                        if (btrfs_file_extent_type(buf, fi) ==
3289                            BTRFS_FILE_EXTENT_INLINE)
3290                                continue;
3291                        bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3292                        if (bytenr == 0)
3293                                continue;
3294
3295                        num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3296                        key.offset -= btrfs_file_extent_offset(buf, fi);
3297                        ret = process_func(trans, root, bytenr, num_bytes,
3298                                           parent, ref_root, key.objectid,
3299                                           key.offset);
3300                        if (ret)
3301                                goto fail;
3302                } else {
3303                        bytenr = btrfs_node_blockptr(buf, i);
3304                        num_bytes = fs_info->nodesize;
3305                        ret = process_func(trans, root, bytenr, num_bytes,
3306                                           parent, ref_root, level - 1, 0);
3307                        if (ret)
3308                                goto fail;
3309                }
3310        }
3311        return 0;
3312fail:
3313        return ret;
3314}
3315
3316int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3317                  struct extent_buffer *buf, int full_backref)
3318{
3319        return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
3320}
3321
3322int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3323                  struct extent_buffer *buf, int full_backref)
3324{
3325        return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
3326}
3327
3328static int write_one_cache_group(struct btrfs_trans_handle *trans,
3329                                 struct btrfs_fs_info *fs_info,
3330                                 struct btrfs_path *path,
3331                                 struct btrfs_block_group_cache *cache)
3332{
3333        int ret;
3334        struct btrfs_root *extent_root = fs_info->extent_root;
3335        unsigned long bi;
3336        struct extent_buffer *leaf;
3337
3338        ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3339        if (ret) {
3340                if (ret > 0)
3341                        ret = -ENOENT;
3342                goto fail;
3343        }
3344
3345        leaf = path->nodes[0];
3346        bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3347        write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3348        btrfs_mark_buffer_dirty(leaf);
3349fail:
3350        btrfs_release_path(path);
3351        return ret;
3353}
3354
3355static struct btrfs_block_group_cache *
3356next_block_group(struct btrfs_fs_info *fs_info,
3357                 struct btrfs_block_group_cache *cache)
3358{
3359        struct rb_node *node;
3360
3361        spin_lock(&fs_info->block_group_cache_lock);
3362
3363        /* If our block group was removed, we need a full search. */
3364        if (RB_EMPTY_NODE(&cache->cache_node)) {
3365                const u64 next_bytenr = cache->key.objectid + cache->key.offset;
3366
3367                spin_unlock(&fs_info->block_group_cache_lock);
3368                btrfs_put_block_group(cache);
3369                return btrfs_lookup_first_block_group(fs_info, next_bytenr);
3370        }
3371        node = rb_next(&cache->cache_node);
3372        btrfs_put_block_group(cache);
3373        if (node) {
3374                cache = rb_entry(node, struct btrfs_block_group_cache,
3375                                 cache_node);
3376                btrfs_get_block_group(cache);
3377        } else
3378                cache = NULL;
3379        spin_unlock(&fs_info->block_group_cache_lock);
3380        return cache;
3381}
3382
3383static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3384                            struct btrfs_trans_handle *trans,
3385                            struct btrfs_path *path)
3386{
3387        struct btrfs_fs_info *fs_info = block_group->fs_info;
3388        struct btrfs_root *root = fs_info->tree_root;
3389        struct inode *inode = NULL;
3390        struct extent_changeset *data_reserved = NULL;
3391        u64 alloc_hint = 0;
3392        int dcs = BTRFS_DC_ERROR;
3393        u64 num_pages = 0;
3394        int retries = 0;
3395        int ret = 0;
3396
3397        /*
3398         * If this block group is smaller than 100 megs, don't bother caching
3399         * the block group.
3400         */
3401        if (block_group->key.offset < (100 * SZ_1M)) {
3402                spin_lock(&block_group->lock);
3403                block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3404                spin_unlock(&block_group->lock);
3405                return 0;
3406        }
3407
3408        if (trans->aborted)
3409                return 0;
3410again:
3411        inode = lookup_free_space_inode(fs_info, block_group, path);
3412        if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3413                ret = PTR_ERR(inode);
3414                btrfs_release_path(path);
3415                goto out;
3416        }
3417
3418        if (IS_ERR(inode)) {
3419                BUG_ON(retries);
3420                retries++;
3421
3422                if (block_group->ro)
3423                        goto out_free;
3424
3425                ret = create_free_space_inode(fs_info, trans, block_group,
3426                                              path);
3427                if (ret)
3428                        goto out_free;
3429                goto again;
3430        }
3431
3432        /*
3433         * We want to set the generation to 0, that way if anything goes wrong
3434         * from here on out we know not to trust this cache when we load up next
3435         * time.
3436         */
3437        BTRFS_I(inode)->generation = 0;
3438        ret = btrfs_update_inode(trans, root, inode);
3439        if (ret) {
3440                /*
3441                 * Theoretically we could recover from this by simply setting the
3442                 * super cache generation to 0 so we know to invalidate the
3443                 * cache, but then we'd have to keep track of the block groups
3444                 * that fail this way so we know we _have_ to reset this cache
3445                 * before the next commit or risk reading stale cache.  To limit
3446                 * our exposure to horrible edge cases, let's just abort the
3447                 * transaction; this only happens in really bad situations
3448                 * anyway.
3449                 */
3450                btrfs_abort_transaction(trans, ret);
3451                goto out_put;
3452        }
3453        WARN_ON(ret);
3454
3455        /* We've already set up this transaction, go ahead and exit */
3456        if (block_group->cache_generation == trans->transid &&
3457            i_size_read(inode)) {
3458                dcs = BTRFS_DC_SETUP;
3459                goto out_put;
3460        }
3461
3462        if (i_size_read(inode) > 0) {
3463                ret = btrfs_check_trunc_cache_free_space(fs_info,
3464                                        &fs_info->global_block_rsv);
3465                if (ret)
3466                        goto out_put;
3467
3468                ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3469                if (ret)
3470                        goto out_put;
3471        }
3472
3473        spin_lock(&block_group->lock);
3474        if (block_group->cached != BTRFS_CACHE_FINISHED ||
3475            !btrfs_test_opt(fs_info, SPACE_CACHE)) {
3476                /*
3477                 * don't bother trying to write stuff out _if_
3478                 * a) we're not cached,
3479                 * b) we're using the nospace_cache mount option,
3480                 * c) we're using the v2 space cache (FREE_SPACE_TREE).
3481                 */
3482                dcs = BTRFS_DC_WRITTEN;
3483                spin_unlock(&block_group->lock);
3484                goto out_put;
3485        }
3486        spin_unlock(&block_group->lock);
3487
3488        /*
3489         * We hit an ENOSPC when setting up the cache in this transaction, so
3490         * just skip doing the setup; we've already cleared the cache so we're safe.
3491         */
3492        if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3493                ret = -ENOSPC;
3494                goto out_put;
3495        }
3496
3497        /*
3498         * Try to preallocate enough space based on how big the block group is.
3499         * Keep in mind this has to include any pinned space which could end up
3500         * taking up quite a bit since it's not folded into the other space
3501         * cache.
3502         */
3503        num_pages = div_u64(block_group->key.offset, SZ_256M);
3504        if (!num_pages)
3505                num_pages = 1;
3506
3507        num_pages *= 16;
3508        num_pages *= PAGE_SIZE;
3509
3510        ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
3511        if (ret)
3512                goto out_put;
3513
3514        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3515                                              num_pages, num_pages,
3516                                              &alloc_hint);
3517        /*
3518         * Our cache requires contiguous chunks so that we don't modify a bunch
3519         * of metadata or split extents when writing the cache out, which means
3520         * we can hit ENOSPC if we are heavily fragmented, in addition to just
3521         * normal out of space conditions.  So if we hit this, just skip setting
3522         * up any other block groups for this transaction; maybe we'll unpin
3523         * enough space the next time around.
3524         */
3525        if (!ret)
3526                dcs = BTRFS_DC_SETUP;
3527        else if (ret == -ENOSPC)
3528                set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3529
3530out_put:
3531        iput(inode);
3532out_free:
3533        btrfs_release_path(path);
3534out:
3535        spin_lock(&block_group->lock);
3536        if (!ret && dcs == BTRFS_DC_SETUP)
3537                block_group->cache_generation = trans->transid;
3538        block_group->disk_cache_state = dcs;
3539        spin_unlock(&block_group->lock);
3540
3541        extent_changeset_free(data_reserved);
3542        return ret;
3543}
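
/*
 * Worked example of the preallocation sizing in cache_save_setup()
 * (illustrative, assuming 4KiB pages): the cache gets 16 pages per 256MiB of
 * block group, i.e. 64KiB of cache space per 256MiB.  A 1GiB block group
 * therefore preallocates 4 * 16 = 64 pages, or 256KiB, for its free space
 * cache inode.
 */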
3544
3545int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3546                            struct btrfs_fs_info *fs_info)
3547{
3548        struct btrfs_block_group_cache *cache, *tmp;
3549        struct btrfs_transaction *cur_trans = trans->transaction;
3550        struct btrfs_path *path;
3551
3552        if (list_empty(&cur_trans->dirty_bgs) ||
3553            !btrfs_test_opt(fs_info, SPACE_CACHE))
3554                return 0;
3555
3556        path = btrfs_alloc_path();
3557        if (!path)
3558                return -ENOMEM;
3559
3560        /* Could add new block groups, use _safe just in case */
3561        list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3562                                 dirty_list) {
3563                if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3564                        cache_save_setup(cache, trans, path);
3565        }
3566
3567        btrfs_free_path(path);
3568        return 0;
3569}
3570
3571/*
3572 * transaction commit does final block group cache writeback during a
3573 * critical section where nothing is allowed to change the FS.  This is
3574 * required in order for the cache to actually match the block group,
3575 * but can introduce a lot of latency into the commit.
3576 *
3577 * So, btrfs_start_dirty_block_groups is here to kick off block group
3578 * cache IO.  There's a chance we'll have to redo some of it if the
3579 * block group changes again during the commit, but it greatly reduces
3580 * the commit latency by getting rid of the easy block groups while
3581 * we're still allowing others to join the commit.
3582 */
3583int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
3584{
3585        struct btrfs_fs_info *fs_info = trans->fs_info;
3586        struct btrfs_block_group_cache *cache;
3587        struct btrfs_transaction *cur_trans = trans->transaction;
3588        int ret = 0;
3589        int should_put;
3590        struct btrfs_path *path = NULL;
3591        LIST_HEAD(dirty);
3592        struct list_head *io = &cur_trans->io_bgs;
3593        int num_started = 0;
3594        int loops = 0;
3595
3596        spin_lock(&cur_trans->dirty_bgs_lock);
3597        if (list_empty(&cur_trans->dirty_bgs)) {
3598                spin_unlock(&cur_trans->dirty_bgs_lock);
3599                return 0;
3600        }
3601        list_splice_init(&cur_trans->dirty_bgs, &dirty);
3602        spin_unlock(&cur_trans->dirty_bgs_lock);
3603
3604again:
3605        /*
3606         * make sure all the block groups on our dirty list actually
3607         * exist
3608         */
3609        btrfs_create_pending_block_groups(trans);
3610
3611        if (!path) {
3612                path = btrfs_alloc_path();
3613                if (!path)
3614                        return -ENOMEM;
3615        }
3616
3617        /*
3618         * cache_write_mutex is here only to save us from balance or automatic
3619         * removal of empty block groups deleting this block group while we are
3620         * writing out the cache
3621         */
3622        mutex_lock(&trans->transaction->cache_write_mutex);
3623        while (!list_empty(&dirty)) {
3624                bool drop_reserve = true;
3625
3626                cache = list_first_entry(&dirty,
3627                                         struct btrfs_block_group_cache,
3628                                         dirty_list);
3629                /*
3630                 * this can happen if something re-dirties a block
3631                 * group that is already under IO.  Just wait for it to
3632                 * finish and then do it all again
3633                 */
3634                if (!list_empty(&cache->io_list)) {
3635                        list_del_init(&cache->io_list);
3636                        btrfs_wait_cache_io(trans, cache, path);
3637                        btrfs_put_block_group(cache);
3638                }
3639
3641                /*
3642                 * btrfs_wait_cache_io uses the cache->dirty_list to decide
3643                 * if it should update the cache_state.  Don't delete
3644                 * until after we wait.
3645                 *
3646                 * Since we're not running in the commit critical section
3647                 * we need the dirty_bgs_lock to protect from update_block_group
3648                 */
3649                spin_lock(&cur_trans->dirty_bgs_lock);
3650                list_del_init(&cache->dirty_list);
3651                spin_unlock(&cur_trans->dirty_bgs_lock);
3652
3653                should_put = 1;
3654
3655                cache_save_setup(cache, trans, path);
3656
3657                if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3658                        cache->io_ctl.inode = NULL;
3659                        ret = btrfs_write_out_cache(fs_info, trans,
3660                                                    cache, path);
3661                        if (ret == 0 && cache->io_ctl.inode) {
3662                                num_started++;
3663                                should_put = 0;
3664
3665                                /*
3666                                 * The cache_write_mutex is protecting the
3667                                 * io_list; also refer to the definition of
3668                                 * btrfs_transaction::io_bgs for more details.
3669                                 */
3670                                list_add_tail(&cache->io_list, io);
3671                        } else {
3672                                /*
3673                                 * if we failed to write the cache, the
3674                                 * generation will be bad and life goes on
3675                                 */
3676                                ret = 0;
3677                        }
3678                }
3679                if (!ret) {
3680                        ret = write_one_cache_group(trans, fs_info,
3681                                                    path, cache);
3682                        /*
3683                         * Our block group might still be attached to the list
3684                         * of new block groups in the transaction handle of some
3685                         * other task (struct btrfs_trans_handle->new_bgs). This
3686                         * means its block group item isn't yet in the extent
3687                         * tree. If this happens ignore the error, as we will
3688                         * try again later in the critical section of the
3689                         * transaction commit.
3690                         */
3691                        if (ret == -ENOENT) {
3692                                ret = 0;
3693                                spin_lock(&cur_trans->dirty_bgs_lock);
3694                                if (list_empty(&cache->dirty_list)) {
3695                                        list_add_tail(&cache->dirty_list,
3696                                                      &cur_trans->dirty_bgs);
3697                                        btrfs_get_block_group(cache);
3698                                        drop_reserve = false;
3699                                }
3700                                spin_unlock(&cur_trans->dirty_bgs_lock);
3701                        } else if (ret) {
3702                                btrfs_abort_transaction(trans, ret);
3703                        }
3704                }
3705
3706                /* if it's not on the io list, we need to put the block group */
3707                if (should_put)
3708                        btrfs_put_block_group(cache);
3709                if (drop_reserve)
3710                        btrfs_delayed_refs_rsv_release(fs_info, 1);
3711
3712                if (ret)
3713                        break;
3714
3715                /*
3716                 * Avoid blocking other tasks for too long. It might even save
3717                 * us from writing caches for block groups that are going to be
3718                 * removed.
3719                 */
3720                mutex_unlock(&trans->transaction->cache_write_mutex);
3721                mutex_lock(&trans->transaction->cache_write_mutex);
3722        }
3723        mutex_unlock(&trans->transaction->cache_write_mutex);
3724
3725        /*
3726         * go through delayed refs for all the stuff we've just kicked off
3727         * and then loop back (just once)
3728         */
3729        ret = btrfs_run_delayed_refs(trans, 0);
3730        if (!ret && loops == 0) {
3731                loops++;
3732                spin_lock(&cur_trans->dirty_bgs_lock);
3733                list_splice_init(&cur_trans->dirty_bgs, &dirty);
3734                /*
3735                 * dirty_bgs_lock protects us from concurrent block group
3736                 * deletes too (not just cache_write_mutex).
3737                 */
3738                if (!list_empty(&dirty)) {
3739                        spin_unlock(&cur_trans->dirty_bgs_lock);
3740                        goto again;
3741                }
3742                spin_unlock(&cur_trans->dirty_bgs_lock);
3743        } else if (ret < 0) {
3744                btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3745        }
3746
3747        btrfs_free_path(path);
3748        return ret;
3749}
3750
3751int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3752                                   struct btrfs_fs_info *fs_info)
3753{
3754        struct btrfs_block_group_cache *cache;
3755        struct btrfs_transaction *cur_trans = trans->transaction;
3756        int ret = 0;
3757        int should_put;
3758        struct btrfs_path *path;
3759        struct list_head *io = &cur_trans->io_bgs;
3760        int num_started = 0;
3761
3762        path = btrfs_alloc_path();
3763        if (!path)
3764                return -ENOMEM;
3765
3766        /*
3767         * Even though we are in the critical section of the transaction commit,
3768         * we can still have concurrent tasks adding elements to this
3769         * transaction's list of dirty block groups. These tasks correspond to
3770         * endio free space workers started when writeback finishes for a
3771         * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3772         * allocate new block groups as a result of COWing nodes of the root
3773         * tree when updating the free space inode. The writeback for the space
3774         * caches is triggered by an earlier call to
3775         * btrfs_start_dirty_block_groups() and iterations of the following
3776         * loop.
3777         * Also we want to do the cache_save_setup first and then run the
3778         * delayed refs to make sure we have the best chance at doing this all
3779         * in one shot.
3780         */
3781        spin_lock(&cur_trans->dirty_bgs_lock);
3782        while (!list_empty(&cur_trans->dirty_bgs)) {
3783                cache = list_first_entry(&cur_trans->dirty_bgs,
3784                                         struct btrfs_block_group_cache,
3785                                         dirty_list);
3786
3787                /*
3788                 * this can happen if cache_save_setup re-dirties a block
3789                 * group that is already under IO.  Just wait for it to
3790                 * finish and then do it all again
3791                 */
3792                if (!list_empty(&cache->io_list)) {
3793                        spin_unlock(&cur_trans->dirty_bgs_lock);
3794                        list_del_init(&cache->io_list);
3795                        btrfs_wait_cache_io(trans, cache, path);
3796                        btrfs_put_block_group(cache);
3797                        spin_lock(&cur_trans->dirty_bgs_lock);
3798                }
3799
3800                /*
3801                 * don't remove from the dirty list until after we've waited
3802                 * on any pending IO
3803                 */
3804                list_del_init(&cache->dirty_list);
3805                spin_unlock(&cur_trans->dirty_bgs_lock);
3806                should_put = 1;
3807
3808                cache_save_setup(cache, trans, path);
3809
3810                if (!ret)
3811                        ret = btrfs_run_delayed_refs(trans,
3812                                                     (unsigned long) -1);
3813
3814                if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3815                        cache->io_ctl.inode = NULL;
3816                        ret = btrfs_write_out_cache(fs_info, trans,
3817                                                    cache, path);
3818                        if (ret == 0 && cache->io_ctl.inode) {
3819                                num_started++;
3820                                should_put = 0;
3821                                list_add_tail(&cache->io_list, io);
3822                        } else {
3823                                /*
3824                                 * if we failed to write the cache, the
3825                                 * generation will be bad and life goes on
3826                                 */
3827                                ret = 0;
3828                        }
3829                }
3830                if (!ret) {
3831                        ret = write_one_cache_group(trans, fs_info,
3832                                                    path, cache);
3833                        /*
3834                         * One of the free space endio workers might have
3835                         * created a new block group while updating a free space
3836                         * cache's inode (at inode.c:btrfs_finish_ordered_io())
3837                         * and hasn't released its transaction handle yet, in
3838                         * which case the new block group is still attached to
3839                         * its transaction handle and its creation has not
3840                         * finished yet (no block group item in the extent tree
3841                         * yet, etc). If this is the case, wait for all free
3842                         * space endio workers to finish and retry.  This is a very
3843                         * rare case, so there is no need for a more efficient and
3844                         * complex approach.
3845                         */
3846                        if (ret == -ENOENT) {
3847                                wait_event(cur_trans->writer_wait,
3848                                   atomic_read(&cur_trans->num_writers) == 1);
3849                                ret = write_one_cache_group(trans, fs_info,
3850                                                            path, cache);
3851                        }
3852                        if (ret)
3853                                btrfs_abort_transaction(trans, ret);
3854                }
3855
3856                /* if it's not on the io list, we need to put the block group */
3857                if (should_put)
3858                        btrfs_put_block_group(cache);
3859                btrfs_delayed_refs_rsv_release(fs_info, 1);
3860                spin_lock(&cur_trans->dirty_bgs_lock);
3861        }
3862        spin_unlock(&cur_trans->dirty_bgs_lock);
3863
3864        /*
3865         * Refer to the definition of the io_bgs member for details on why it's
3866         * safe to use it without any locking.
3867         */
3868        while (!list_empty(io)) {
3869                cache = list_first_entry(io, struct btrfs_block_group_cache,
3870                                         io_list);
3871                list_del_init(&cache->io_list);
3872                btrfs_wait_cache_io(trans, cache, path);
3873                btrfs_put_block_group(cache);
3874        }
3875
3876        btrfs_free_path(path);
3877        return ret;
3878}
3879
3880int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
3881{
3882        struct btrfs_block_group_cache *block_group;
3883        int readonly = 0;
3884
3885        block_group = btrfs_lookup_block_group(fs_info, bytenr);
3886        if (!block_group || block_group->ro)
3887                readonly = 1;
3888        if (block_group)
3889                btrfs_put_block_group(block_group);
3890        return readonly;
3891}
3892
3893bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3894{
3895        struct btrfs_block_group_cache *bg;
3896        bool ret = true;
3897
3898        bg = btrfs_lookup_block_group(fs_info, bytenr);
3899        if (!bg)
3900                return false;
3901
3902        spin_lock(&bg->lock);
3903        if (bg->ro)
3904                ret = false;
3905        else
3906                atomic_inc(&bg->nocow_writers);
3907        spin_unlock(&bg->lock);
3908
3909        /* no put on block group, done by btrfs_dec_nocow_writers */
3910        if (!ret)
3911                btrfs_put_block_group(bg);
3912
3913        return ret;
3915}
3916
3917void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3918{
3919        struct btrfs_block_group_cache *bg;
3920
3921        bg = btrfs_lookup_block_group(fs_info, bytenr);
3922        ASSERT(bg);
3923        if (atomic_dec_and_test(&bg->nocow_writers))
3924                wake_up_var(&bg->nocow_writers);
3925        /*
3926         * Once for our lookup and once for the lookup done by a previous call
3927         * to btrfs_inc_nocow_writers()
3928         */
3929        btrfs_put_block_group(bg);
3930        btrfs_put_block_group(bg);
3931}
3932
3933void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3934{
3935        wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
3936}
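
/*
 * A rough usage sketch for the nocow writer accounting above (hypothetical
 * caller, error handling omitted): a writer that wants to overwrite an
 * extent in place first pins the block group,
 *
 *	if (!btrfs_inc_nocow_writers(fs_info, bytenr))
 *		fall back to COW;
 *	... do the NOCOW write ...
 *	btrfs_dec_nocow_writers(fs_info, bytenr);
 *
 * while a task that has just marked the block group read-only can call
 * btrfs_wait_nocow_writers() to wait for in-flight NOCOW writes to drain.
 */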
3937
3938static const char *alloc_name(u64 flags)
3939{
3940        switch (flags) {
3941        case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3942                return "mixed";
3943        case BTRFS_BLOCK_GROUP_METADATA:
3944                return "metadata";
3945        case BTRFS_BLOCK_GROUP_DATA:
3946                return "data";
3947        case BTRFS_BLOCK_GROUP_SYSTEM:
3948                return "system";
3949        default:
3950                WARN_ON(1);
3951                return "invalid-combination";
3952        }
3953}
3954
3955static int create_space_info(struct btrfs_fs_info *info, u64 flags)
3956{
3958        struct btrfs_space_info *space_info;
3959        int i;
3960        int ret;
3961
3962        space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
3963        if (!space_info)
3964                return -ENOMEM;
3965
3966        ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
3967                                 GFP_KERNEL);
3968        if (ret) {
3969                kfree(space_info);
3970                return ret;
3971        }
3972
3973        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3974                INIT_LIST_HEAD(&space_info->block_groups[i]);
3975        init_rwsem(&space_info->groups_sem);
3976        spin_lock_init(&space_info->lock);
3977        space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3978        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3979        init_waitqueue_head(&space_info->wait);
3980        INIT_LIST_HEAD(&space_info->ro_bgs);
3981        INIT_LIST_HEAD(&space_info->tickets);
3982        INIT_LIST_HEAD(&space_info->priority_tickets);
3983
3984        ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
3985                                    info->space_info_kobj, "%s",
3986                                    alloc_name(space_info->flags));
3987        if (ret) {
3988                percpu_counter_destroy(&space_info->total_bytes_pinned);
3989                kfree(space_info);
3990                return ret;
3991        }
3992
3993        list_add_rcu(&space_info->list, &info->space_info);
3994        if (flags & BTRFS_BLOCK_GROUP_DATA)
3995                info->data_sinfo = space_info;
3996
3997        return ret;
3998}
3999
4000static void update_space_info(struct btrfs_fs_info *info, u64 flags,
4001                             u64 total_bytes, u64 bytes_used,
4002                             u64 bytes_readonly,
4003                             struct btrfs_space_info **space_info)
4004{
4005        struct btrfs_space_info *found;
4006        int factor;
4007
4008        factor = btrfs_bg_type_to_factor(flags);
4009
4010        found = __find_space_info(info, flags);
4011        ASSERT(found);
4012        spin_lock(&found->lock);
4013        found->total_bytes += total_bytes;
4014        found->disk_total += total_bytes * factor;
4015        found->bytes_used += bytes_used;
4016        found->disk_used += bytes_used * factor;
4017        found->bytes_readonly += bytes_readonly;
4018        if (total_bytes > 0)
4019                found->full = 0;
4020        space_info_add_new_bytes(info, found, total_bytes -
4021                                 bytes_used - bytes_readonly);
4022        spin_unlock(&found->lock);
4023        *space_info = found;
4024}
4025
4026static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
4027{
4028        u64 extra_flags = chunk_to_extended(flags) &
4029                                BTRFS_EXTENDED_PROFILE_MASK;
4030
4031        write_seqlock(&fs_info->profiles_lock);
4032        if (flags & BTRFS_BLOCK_GROUP_DATA)
4033                fs_info->avail_data_alloc_bits |= extra_flags;
4034        if (flags & BTRFS_BLOCK_GROUP_METADATA)
4035                fs_info->avail_metadata_alloc_bits |= extra_flags;
4036        if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4037                fs_info->avail_system_alloc_bits |= extra_flags;
4038        write_sequnlock(&fs_info->profiles_lock);
4039}
4040
4041/*
4042 * returns target flags in extended format or 0 if restripe for this
4043 * chunk_type is not in progress
4044 *
4045 * should be called with balance_lock held
4046 */
4047static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
4048{
4049        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4050        u64 target = 0;
4051
4052        if (!bctl)
4053                return 0;
4054
4055        if (flags & BTRFS_BLOCK_GROUP_DATA &&
4056            bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4057                target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
4058        } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
4059                   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4060                target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
4061        } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
4062                   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
4063                target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
4064        }
4065
4066        return target;
4067}
4068
4069/*
4070 * @flags: available profiles in extended format (see ctree.h)
4071 *
4072 * Returns reduced profile in chunk format.  If profile changing is in
4073 * progress (either running or paused) picks the target profile (if it's
4074 * already available), otherwise falls back to plain reducing.
4075 */
4076static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
4077{
4078        u64 num_devices = fs_info->fs_devices->rw_devices;
4079        u64 target;
4080        u64 raid_type;
4081        u64 allowed = 0;
4082
4083        /*
4084         * see if restripe for this chunk_type is in progress, if so
4085         * try to reduce to the target profile
4086         */
4087        spin_lock(&fs_info->balance_lock);
4088        target = get_restripe_target(fs_info, flags);
4089        if (target) {
4090                /* pick target profile only if it's already available */
4091                if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
4092                        spin_unlock(&fs_info->balance_lock);
4093                        return extended_to_chunk(target);
4094                }
4095        }
4096        spin_unlock(&fs_info->balance_lock);
4097
4098        /* First, mask out the RAID levels which aren't possible */
4099        for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
4100                if (num_devices >= btrfs_raid_array[raid_type].devs_min)
4101                        allowed |= btrfs_raid_array[raid_type].bg_flag;
4102        }
4103        allowed &= flags;
4104
4105        if (allowed & BTRFS_BLOCK_GROUP_RAID6)
4106                allowed = BTRFS_BLOCK_GROUP_RAID6;
4107        else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
4108                allowed = BTRFS_BLOCK_GROUP_RAID5;
4109        else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
4110                allowed = BTRFS_BLOCK_GROUP_RAID10;
4111        else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
4112                allowed = BTRFS_BLOCK_GROUP_RAID1;
4113        else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
4114                allowed = BTRFS_BLOCK_GROUP_RAID0;
4115
4116        flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
4117
4118        return extended_to_chunk(flags | allowed);
4119}
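
/*
 * A worked example of the reduction above (illustrative only, numbers made up
 * for the example): with 6 rw devices and @flags describing a data space info
 * that currently has both RAID1 and RAID6 block groups,
 *
 *	flags   = BTRFS_BLOCK_GROUP_DATA | RAID1 | RAID6
 *	allowed = (profiles whose devs_min fit in 6 devices) & flags
 *	        = RAID1 | RAID6
 *
 * the RAID6 branch wins, so the function returns DATA | RAID6 in chunk
 * format.  If a running or paused balance is converting data chunks to a
 * profile that is already present in @flags, get_restripe_target() short
 * circuits all of this and that target is returned instead.
 */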
4120
4121static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
4122{
4123        unsigned seq;
4124        u64 flags;
4125
4126        do {
4127                flags = orig_flags;
4128                seq = read_seqbegin(&fs_info->profiles_lock);
4129
4130                if (flags & BTRFS_BLOCK_GROUP_DATA)
4131                        flags |= fs_info->avail_data_alloc_bits;
4132                else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4133                        flags |= fs_info->avail_system_alloc_bits;
4134                else if (flags & BTRFS_BLOCK_GROUP_METADATA)
4135                        flags |= fs_info->avail_metadata_alloc_bits;
4136        } while (read_seqretry(&fs_info->profiles_lock, seq));
4137
4138        return btrfs_reduce_alloc_profile(fs_info, flags);
4139}
4140
4141static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
4142{
4143        struct btrfs_fs_info *fs_info = root->fs_info;
4144        u64 flags;
4145        u64 ret;
4146
4147        if (data)
4148                flags = BTRFS_BLOCK_GROUP_DATA;
4149        else if (root == fs_info->chunk_root)
4150                flags = BTRFS_BLOCK_GROUP_SYSTEM;
4151        else
4152                flags = BTRFS_BLOCK_GROUP_METADATA;
4153
4154        ret = get_alloc_profile(fs_info, flags);
4155        return ret;
4156}
4157
4158u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
4159{
4160        return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
4161}
4162
4163u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
4164{
4165        return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4166}
4167
4168u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
4169{
4170        return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4171}
4172
4173static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
4174                                 bool may_use_included)
4175{
4176        ASSERT(s_info);
4177        return s_info->bytes_used + s_info->bytes_reserved +
4178                s_info->bytes_pinned + s_info->bytes_readonly +
4179                (may_use_included ? s_info->bytes_may_use : 0);
4180}
4181
4182int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
4183{
4184        struct btrfs_root *root = inode->root;
4185        struct btrfs_fs_info *fs_info = root->fs_info;
4186        struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
4187        u64 used;
4188        int ret = 0;
4189        int need_commit = 2;
4190        int have_pinned_space;
4191
4192        /* make sure bytes are sectorsize aligned */
4193        bytes = ALIGN(bytes, fs_info->sectorsize);
4194
4195        if (btrfs_is_free_space_inode(inode)) {
4196                need_commit = 0;
4197                ASSERT(current->journal_info);
4198        }
4199
4200again:
4201        /* make sure we have enough space to handle the data first */
4202        spin_lock(&data_sinfo->lock);
4203        used = btrfs_space_info_used(data_sinfo, true);
4204
4205        if (used + bytes > data_sinfo->total_bytes) {
4206                struct btrfs_trans_handle *trans;
4207
4208                /*
4209                 * if we don't have enough free bytes in this space then we need
4210                 * to alloc a new chunk.
4211                 */
4212                if (!data_sinfo->full) {
4213                        u64 alloc_target;
4214
4215                        data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
4216                        spin_unlock(&data_sinfo->lock);
4217
4218                        alloc_target = btrfs_data_alloc_profile(fs_info);
4219                        /*
4220                         * It is ugly that we don't call a nolock join
4221                         * transaction for the free space inode case here,
4222                         * but it is safe: we only reserve data space for
4223                         * the free space cache from within a transaction
4224                         * context, and the common join transaction merely
4225                         * increases the use count of the current transaction
4226                         * handle without trying to acquire the fs
4227                         * trans_lock.
4228                         */
4229                        trans = btrfs_join_transaction(root);
4230                        if (IS_ERR(trans))
4231                                return PTR_ERR(trans);
4232
4233                        ret = do_chunk_alloc(trans, alloc_target,
4234                                             CHUNK_ALLOC_NO_FORCE);
4235                        btrfs_end_transaction(trans);
4236                        if (ret < 0) {
4237                                if (ret != -ENOSPC)
4238                                        return ret;
4239                                else {
4240                                        have_pinned_space = 1;
4241                                        goto commit_trans;
4242                                }
4243                        }
4244
4245                        goto again;
4246                }
4247
4248                /*
4249                 * If we don't have enough pinned space to deal with this
4250                 * allocation and no chunk was removed in the current
4251                 * transaction, don't bother committing the transaction.
4252                 */
4253                have_pinned_space = __percpu_counter_compare(
4254                        &data_sinfo->total_bytes_pinned,
4255                        used + bytes - data_sinfo->total_bytes,
4256                        BTRFS_TOTAL_BYTES_PINNED_BATCH);
4257                spin_unlock(&data_sinfo->lock);
4258
4259                /* commit the current transaction and try again */
4260commit_trans:
4261                if (need_commit) {
4262                        need_commit--;
4263
4264                        if (need_commit > 0) {
4265                                btrfs_start_delalloc_roots(fs_info, -1);
4266                                btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
4267                                                         (u64)-1);
4268                        }
4269
4270                        trans = btrfs_join_transaction(root);
4271                        if (IS_ERR(trans))
4272                                return PTR_ERR(trans);
4273                        if (have_pinned_space >= 0 ||
4274                            test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4275                                     &trans->transaction->flags) ||
4276                            need_commit > 0) {
4277                                ret = btrfs_commit_transaction(trans);
4278                                if (ret)
4279                                        return ret;
4280                                /*
4281                                 * The cleaner kthread might still be doing iput
4282                                 * operations. Wait for it to finish so that
4283                                 * more space is released.
4284                                 */
4285                                mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
4286                                mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
4287                                goto again;
4288                        } else {
4289                                btrfs_end_transaction(trans);
4290                        }
4291                }
4292
4293                trace_btrfs_space_reservation(fs_info,
4294                                              "space_info:enospc",
4295                                              data_sinfo->flags, bytes, 1);
4296                return -ENOSPC;
4297        }
4298        update_bytes_may_use(data_sinfo, bytes);
4299        trace_btrfs_space_reservation(fs_info, "space_info",
4300                                      data_sinfo->flags, bytes, 1);
4301        spin_unlock(&data_sinfo->lock);
4302
4303        return 0;
4304}
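
/*
 * The retry logic above, as a walk-through (descriptive only): need_commit
 * starts at 2 for regular inodes.  The first time we reach commit_trans it is
 * decremented to 1, so delalloc is flushed and ordered extents are waited on
 * before the transaction is committed and we jump back to "again".  On later
 * passes need_commit is 0 or less, delalloc is no longer flushed, and the
 * transaction is only committed when it looks worthwhile, i.e. there is
 * enough pinned space (have_pinned_space >= 0) or a block group was freed in
 * this transaction (BTRFS_TRANS_HAVE_FREE_BGS); otherwise we fall through to
 * the -ENOSPC return.
 */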
4305
4306int btrfs_check_data_free_space(struct inode *inode,
4307                        struct extent_changeset **reserved, u64 start, u64 len)
4308{
4309        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4310        int ret;
4311
4312        /* align the range */
4313        len = round_up(start + len, fs_info->sectorsize) -
4314              round_down(start, fs_info->sectorsize);
4315        start = round_down(start, fs_info->sectorsize);
4316
4317        ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
4318        if (ret < 0)
4319                return ret;
4320
4321        /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
4322        ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
4323        if (ret < 0)
4324                btrfs_free_reserved_data_space_noquota(inode, start, len);
4325        else
4326                ret = 0;
4327        return ret;
4328}
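
/*
 * A minimal usage sketch for the data reservation helpers (hypothetical
 * caller, not taken from btrfs itself; error handling trimmed):
 *
 *	struct extent_changeset *reserved = NULL;
 *	int ret;
 *
 *	ret = btrfs_check_data_free_space(inode, &reserved, start, len);
 *	if (ret < 0)
 *		return ret;
 *	... dirty the range; on failure undo the reservation with ...
 *	btrfs_free_reserved_data_space(inode, reserved, start, len);
 *
 * The free side, defined below, undoes both the space_info accounting and the
 * qgroup reservation taken here.
 */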
4329
4330/*
4331 * Called if we need to clear a data reservation for this inode,
4332 * normally in an error case.
4333 *
4334 * This one will *NOT* use the accurate qgroup reserved space API, it is only
4335 * for cases where we can't sleep and are sure it won't affect qgroup
4336 * reserved space, like clear_bit_hook().
4337 */
4338void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4339                                            u64 len)
4340{
4341        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4342        struct btrfs_space_info *data_sinfo;
4343
4344        /* Make sure the range is aligned to sectorsize */
4345        len = round_up(start + len, fs_info->sectorsize) -
4346              round_down(start, fs_info->sectorsize);
4347        start = round_down(start, fs_info->sectorsize);
4348
4349        data_sinfo = fs_info->data_sinfo;
4350        spin_lock(&data_sinfo->lock);
4351        update_bytes_may_use(data_sinfo, -len);
4352        trace_btrfs_space_reservation(fs_info, "space_info",
4353                                      data_sinfo->flags, len, 0);
4354        spin_unlock(&data_sinfo->lock);
4355}
4356
4357/*
4358 * Called if we need to clear a data reservation for this inode,
4359 * normally in an error case.
4360 *
4361 * This one will handle the per-inode data rsv map for the accurate reserved
4362 * space framework.
4363 */
4364void btrfs_free_reserved_data_space(struct inode *inode,
4365                        struct extent_changeset *reserved, u64 start, u64 len)
4366{
4367        struct btrfs_root *root = BTRFS_I(inode)->root;
4368
4369        /* Make sure the range is aligned to sectorsize */
4370        len = round_up(start + len, root->fs_info->sectorsize) -
4371              round_down(start, root->fs_info->sectorsize);
4372        start = round_down(start, root->fs_info->sectorsize);
4373
4374        btrfs_free_reserved_data_space_noquota(inode, start, len);
4375        btrfs_qgroup_free_data(inode, reserved, start, len);
4376}
4377
4378static void force_metadata_allocation(struct btrfs_fs_info *info)
4379{
4380        struct list_head *head = &info->space_info;
4381        struct btrfs_space_info *found;
4382
4383        rcu_read_lock();
4384        list_for_each_entry_rcu(found, head, list) {
4385                if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4386                        found->force_alloc = CHUNK_ALLOC_FORCE;
4387        }
4388        rcu_read_unlock();
4389}
4390
4391static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
4392{
4393        return (global->size << 1);
4394}
4395
4396static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
4397                              struct btrfs_space_info *sinfo, int force)
4398{
4399        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4400        u64 bytes_used = btrfs_space_info_used(sinfo, false);
4401        u64 thresh;
4402
4403        if (force == CHUNK_ALLOC_FORCE)
4404                return 1;
4405
4406        /*
4407         * We need to take into account the global rsv because for all intents
4408         * and purposes it's used space.  Don't worry about locking the
4409         * global_rsv, it doesn't change except when the transaction commits.
4410         */
4411        if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
4412                bytes_used += calc_global_rsv_need_space(global_rsv);
4413
4414        /*
4415         * in limited mode, we want to have some free space up to
4416         * about 1% of the FS size.
4417         */
4418        if (force == CHUNK_ALLOC_LIMITED) {
4419                thresh = btrfs_super_total_bytes(fs_info->super_copy);
4420                thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
4421
4422                if (sinfo->total_bytes - bytes_used < thresh)
4423                        return 1;
4424        }
4425
4426        if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
4427                return 0;
4428        return 1;
4429}
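
/*
 * Rough numbers for the checks above (illustrative): on a 1 TiB filesystem a
 * CHUNK_ALLOC_LIMITED request allocates as soon as the space info's unused
 * room (total_bytes - bytes_used) drops below max(SZ_64M, 1% of the whole
 * filesystem), roughly 10 GiB here.  Otherwise a new chunk is only considered
 * once bytes_used (plus twice the global reserve size for metadata) pushes
 * past about 80% of the space info's total_bytes, with a 2M slop.
 */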
4430
4431static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
4432{
4433        u64 num_dev;
4434
4435        if (type & (BTRFS_BLOCK_GROUP_RAID10 |
4436                    BTRFS_BLOCK_GROUP_RAID0 |
4437                    BTRFS_BLOCK_GROUP_RAID5 |
4438                    BTRFS_BLOCK_GROUP_RAID6))
4439                num_dev = fs_info->fs_devices->rw_devices;
4440        else if (type & BTRFS_BLOCK_GROUP_RAID1)
4441                num_dev = 2;
4442        else
4443                num_dev = 1;    /* DUP or single */
4444
4445        return num_dev;
4446}
4447
4448/*
4449 * Reserve space in the system space info necessary for allocating or
4450 * removing a chunk of the given @type: enough to update the device items
4451 * and to add or remove the chunk item itself.
4452 */
4453void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4454{
4455        struct btrfs_fs_info *fs_info = trans->fs_info;
4456        struct btrfs_space_info *info;
4457        u64 left;
4458        u64 thresh;
4459        int ret = 0;
4460        u64 num_devs;
4461
4462        /*
4463         * Needed because we can end up allocating a system chunk here and to make
4464         * the space reservation in the chunk block reserve atomic and race free.
4465         */
4466        lockdep_assert_held(&fs_info->chunk_mutex);
4467
4468        info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4469        spin_lock(&info->lock);
4470        left = info->total_bytes - btrfs_space_info_used(info, true);
4471        spin_unlock(&info->lock);
4472
4473        num_devs = get_profile_num_devs(fs_info, type);
4474
4475        /* num_devs device items to update and 1 chunk item to add or remove */
4476        thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
4477                btrfs_calc_trans_metadata_size(fs_info, 1);
4478
4479        if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4480                btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4481                           left, thresh, type);
4482                dump_space_info(fs_info, info, 0, 0);
4483        }
4484
4485        if (left < thresh) {
4486                u64 flags = btrfs_system_alloc_profile(fs_info);
4487
4488                /*
4489                 * Ignore failure to create system chunk. We might end up not
4490                 * needing it, as we might not need to COW all nodes/leafs from
4491                 * the paths we visit in the chunk tree (they were already COWed
4492                 * or created in the current transaction for example).
4493                 */
4494                ret = btrfs_alloc_chunk(trans, flags);
4495        }
4496
4497        if (!ret) {
4498                ret = btrfs_block_rsv_add(fs_info->chunk_root,
4499                                          &fs_info->chunk_block_rsv,
4500                                          thresh, BTRFS_RESERVE_NO_FLUSH);
4501                if (!ret)
4502                        trans->chunk_bytes_reserved += thresh;
4503        }
4504}
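
/*
 * Sizing example for the reservation above (illustrative): allocating a RAID1
 * chunk makes get_profile_num_devs() report 2, so the reservation must cover
 * updating 2 device items plus adding 1 chunk item.  If the SYSTEM space info
 * doesn't have that much room left, we first try to allocate a new system
 * chunk with the current system allocation profile, and then the needed bytes
 * are reserved in fs_info->chunk_block_rsv for this transaction.
 */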
4505
4506/*
4507 * If force is CHUNK_ALLOC_FORCE:
4508 *    - return 1 if it successfully allocates a chunk,
4509 *    - return errors including -ENOSPC otherwise.
4510 * If force is NOT CHUNK_ALLOC_FORCE:
4511 *    - return 0 if it doesn't need to allocate a new chunk,
4512 *    - return 1 if it successfully allocates a chunk,
4513 *    - return errors including -ENOSPC otherwise.
4514 */
4515static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
4516                          int force)
4517{
4518        struct btrfs_fs_info *fs_info = trans->fs_info;
4519        struct btrfs_space_info *space_info;
4520        bool wait_for_alloc = false;
4521        bool should_alloc = false;
4522        int ret = 0;
4523
4524        /* Don't re-enter if we're already allocating a chunk */
4525        if (trans->allocating_chunk)
4526                return -ENOSPC;
4527
4528        space_info = __find_space_info(fs_info, flags);
4529        ASSERT(space_info);
4530
4531        do {
4532                spin_lock(&space_info->lock);
4533                if (force < space_info->force_alloc)
4534                        force = space_info->force_alloc;
4535                should_alloc = should_alloc_chunk(fs_info, space_info, force);
4536                if (space_info->full) {
4537                        /* No more free physical space */
4538                        if (should_alloc)
4539                                ret = -ENOSPC;
4540                        else
4541                                ret = 0;
4542                        spin_unlock(&space_info->lock);
4543                        return ret;
4544                } else if (!should_alloc) {
4545                        spin_unlock(&space_info->lock);
4546                        return 0;
4547                } else if (space_info->chunk_alloc) {
4548                        /*
4549                         * Someone is already allocating, so we need to block
4550                         * until this someone is finished and then loop to
4551                         * recheck if we should continue with our allocation
4552                         * attempt.
4553                         */
4554                        wait_for_alloc = true;
4555                        spin_unlock(&space_info->lock);
4556                        mutex_lock(&fs_info->chunk_mutex);
4557                        mutex_unlock(&fs_info->chunk_mutex);
4558                } else {
4559                        /* Proceed with allocation */
4560                        space_info->chunk_alloc = 1;
4561                        wait_for_alloc = false;
4562                        spin_unlock(&space_info->lock);
4563                }
4564
4565                cond_resched();
4566        } while (wait_for_alloc);
4567
4568        mutex_lock(&fs_info->chunk_mutex);
4569        trans->allocating_chunk = true;
4570
4571        /*
4572         * If we have mixed data/metadata chunks we want to make sure we keep
4573         * allocating mixed chunks instead of individual chunks.
4574         */
4575        if (btrfs_mixed_space_info(space_info))
4576                flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4577
4578        /*
4579         * if we're doing a data chunk, go ahead and make sure that
4580         * we keep a reasonable number of metadata chunks allocated in the
4581         * FS as well.
4582         */
4583        if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4584                fs_info->data_chunk_allocations++;
4585                if (!(fs_info->data_chunk_allocations %
4586                      fs_info->metadata_ratio))
4587                        force_metadata_allocation(fs_info);
4588        }
4589
4590        /*
4591         * Check if we have enough space in SYSTEM chunk because we may need
4592         * to update devices.
4593         */
4594        check_system_chunk(trans, flags);
4595
4596        ret = btrfs_alloc_chunk(trans, flags);
4597        trans->allocating_chunk = false;
4598
4599        spin_lock(&space_info->lock);
4600        if (ret < 0) {
4601                if (ret == -ENOSPC)
4602                        space_info->full = 1;
4603                else
4604                        goto out;
4605        } else {
4606                ret = 1;
4607                space_info->max_extent_size = 0;
4608        }
4609
4610        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4611out:
4612        space_info->chunk_alloc = 0;
4613        spin_unlock(&space_info->lock);
4614        mutex_unlock(&fs_info->chunk_mutex);
4615        /*
4616         * When we allocate a new chunk we reserve space in the chunk block
4617         * reserve to make sure we can COW nodes/leafs in the chunk tree or
4618         * add new nodes/leafs to it if we end up needing to do it when
4619         * inserting the chunk item and updating device items as part of the
4620         * second phase of chunk allocation, performed by
4621         * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
4622         * large number of new block groups to create in our transaction
4623         * handle's new_bgs list to avoid exhausting the chunk block reserve
4624         * in extreme cases - like having a single transaction create many new
4625         * block groups when starting to write out the free space caches of all
4626         * the block groups that were made dirty during the lifetime of the
4627         * transaction.
4628         */
4629        if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
4630                btrfs_create_pending_block_groups(trans);
4631
4632        return ret;
4633}
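
/*
 * How callers in this file consume do_chunk_alloc()'s return convention (this
 * mirrors the ALLOC_CHUNK case in flush_space() below):
 *
 *	ret = do_chunk_alloc(trans, btrfs_metadata_alloc_profile(fs_info),
 *			     CHUNK_ALLOC_NO_FORCE);
 *	if (ret > 0 || ret == -ENOSPC)
 *		ret = 0;
 *
 * 1 means a chunk was allocated, 0 means none was needed, and -ENOSPC from a
 * non-forced allocation usually just means the caller has to reclaim space
 * some other way.
 */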
4634
4635static int can_overcommit(struct btrfs_fs_info *fs_info,
4636                          struct btrfs_space_info *space_info, u64 bytes,
4637                          enum btrfs_reserve_flush_enum flush,
4638                          bool system_chunk)
4639{
4640        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4641        u64 profile;
4642        u64 space_size;
4643        u64 avail;
4644        u64 used;
4645        int factor;
4646
4647        /* Don't overcommit when in mixed mode. */
4648        if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
4649                return 0;
4650
4651        if (system_chunk)
4652                profile = btrfs_system_alloc_profile(fs_info);
4653        else
4654                profile = btrfs_metadata_alloc_profile(fs_info);
4655
4656        used = btrfs_space_info_used(space_info, false);
4657
4658        /*
4659         * We only want to allow over committing if we have lots of actual space
4660         * free, but if we don't have enough space to handle the global reserve
4661         * space then we could end up having a real enospc problem when trying
4662         * to allocate a chunk or some other such important allocation.
4663         */
4664        spin_lock(&global_rsv->lock);
4665        space_size = calc_global_rsv_need_space(global_rsv);
4666        spin_unlock(&global_rsv->lock);
4667        if (used + space_size >= space_info->total_bytes)
4668                return 0;
4669
4670        used += space_info->bytes_may_use;
4671
4672        avail = atomic64_read(&fs_info->free_chunk_space);
4673
4674        /*
4675         * If we have dup, raid1 or raid10 then only half of the free
4676         * space is actually usable.  For raid56, the space info used
4677         * doesn't include the parity drive, so we don't have to
4678         * change the math
4679         */
4680        factor = btrfs_bg_type_to_factor(profile);
4681        avail = div_u64(avail, factor);
4682
4683        /*
4684         * If we aren't flushing all things, let us overcommit up to
4685         * 1/2 of the free space. If we can flush, don't let us overcommit
4686         * too much, only let it overcommit up to 1/8 of the free space.
4687         */
4688        if (flush == BTRFS_RESERVE_FLUSH_ALL)
4689                avail >>= 3;
4690        else
4691                avail >>= 1;
4692
4693        if (used + bytes < space_info->total_bytes + avail)
4694                return 1;
4695        return 0;
4696}
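
/*
 * Numeric example for the overcommit math above (illustrative): with a RAID1
 * metadata profile (factor 2) and 100 GiB of unallocated device space, avail
 * starts at 50 GiB.  A BTRFS_RESERVE_FLUSH_ALL reservation may then overcommit
 * by at most 50 GiB >> 3 = 6.25 GiB beyond total_bytes, while the flush modes
 * that can't reclaim as aggressively are allowed up to 50 GiB >> 1 = 25 GiB.
 */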
4697
4698static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
4699                                         unsigned long nr_pages, int nr_items)
4700{
4701        struct super_block *sb = fs_info->sb;
4702
4703        if (down_read_trylock(&sb->s_umount)) {
4704                writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4705                up_read(&sb->s_umount);
4706        } else {
4707                /*
4708                 * We needn't worry about the filesystem going from r/w to
4709                 * r/o even though we don't acquire the ->s_umount mutex,
4710                 * because the filesystem guarantees that the delalloc inode
4711                 * list is empty once the filesystem is read-only (all dirty
4712                 * pages have been written to disk).
4713                 */
4714                btrfs_start_delalloc_roots(fs_info, nr_items);
4715                if (!current->journal_info)
4716                        btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
4717        }
4718}
4719
4720static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
4721                                        u64 to_reclaim)
4722{
4723        u64 bytes;
4724        u64 nr;
4725
4726        bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
4727        nr = div64_u64(to_reclaim, bytes);
4728        if (!nr)
4729                nr = 1;
4730        return nr;
4731}
4732
4733#define EXTENT_SIZE_PER_ITEM    SZ_256K
4734
4735/*
4736 * shrink metadata reservation for delalloc
4737 */
4738static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
4739                            u64 orig, bool wait_ordered)
4740{
4741        struct btrfs_space_info *space_info;
4742        struct btrfs_trans_handle *trans;
4743        u64 delalloc_bytes;
4744        u64 max_reclaim;
4745        u64 items;
4746        long time_left;
4747        unsigned long nr_pages;
4748        int loops;
4749
4750        /* Calc the number of items we need to flush for this space reservation */
4751        items = calc_reclaim_items_nr(fs_info, to_reclaim);
4752        to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4753
4754        trans = (struct btrfs_trans_handle *)current->journal_info;
4755        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4756
4757        delalloc_bytes = percpu_counter_sum_positive(
4758                                                &fs_info->delalloc_bytes);
4759        if (delalloc_bytes == 0) {
4760                if (trans)
4761                        return;
4762                if (wait_ordered)
4763                        btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4764                return;
4765        }
4766
4767        loops = 0;
4768        while (delalloc_bytes && loops < 3) {
4769                max_reclaim = min(delalloc_bytes, to_reclaim);
4770                nr_pages = max_reclaim >> PAGE_SHIFT;
4771                btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
4772                /*
4773                 * We need to wait for the async pages to actually start before
4774                 * we do anything.
4775                 */
4776                max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
4777                if (!max_reclaim)
4778                        goto skip_async;
4779
4780                if (max_reclaim <= nr_pages)
4781                        max_reclaim = 0;
4782                else
4783                        max_reclaim -= nr_pages;
4784
4785                wait_event(fs_info->async_submit_wait,
4786                           atomic_read(&fs_info->async_delalloc_pages) <=
4787                           (int)max_reclaim);
4788skip_async:
4789                spin_lock(&space_info->lock);
4790                if (list_empty(&space_info->tickets) &&
4791                    list_empty(&space_info->priority_tickets)) {
4792                        spin_unlock(&space_info->lock);
4793                        break;
4794                }
4795                spin_unlock(&space_info->lock);
4796
4797                loops++;
4798                if (wait_ordered && !trans) {
4799                        btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
4800                } else {
4801                        time_left = schedule_timeout_killable(1);
4802                        if (time_left)
4803                                break;
4804                }
4805                delalloc_bytes = percpu_counter_sum_positive(
4806                                                &fs_info->delalloc_bytes);
4807        }
4808}
4809
4810struct reserve_ticket {
4811        u64 bytes;
4812        int error;
4813        struct list_head list;
4814        wait_queue_head_t wait;
4815};
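
/*
 * Lifecycle of a reserve_ticket (a summary of the code below, no new
 * behaviour): __reserve_metadata_bytes() fills one in on its stack and queues
 * it on space_info->tickets (or ->priority_tickets), the async and priority
 * flushers satisfy the oldest tickets as space is returned to the space_info,
 * wait_reserve_ticket() sleeps until ticket->bytes hits zero or ticket->error
 * is set, and wake_all_tickets() fails every remaining waiter with -ENOSPC
 * once flushing gives up.
 */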
4816
4817/**
4818 * may_commit_transaction - possibly commit the transaction if it's ok to
4819 * @fs_info - the fs_info for our fs
4820 * @space_info - the space_info we are trying to satisfy a reservation
4821 *               for
4822 *
4823 * This will check to make sure that committing the transaction will actually
4824 * get us somewhere and then commit the transaction if it does.  Otherwise it
4825 * will return -ENOSPC.
4826 */
4827static int may_commit_transaction(struct btrfs_fs_info *fs_info,
4828                                  struct btrfs_space_info *space_info)
4829{
4830        struct reserve_ticket *ticket = NULL;
4831        struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
4832        struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
4833        struct btrfs_trans_handle *trans;
4834        u64 bytes_needed;
4835        u64 reclaim_bytes = 0;
4836
4837        trans = (struct btrfs_trans_handle *)current->journal_info;
4838        if (trans)
4839                return -EAGAIN;
4840
4841        spin_lock(&space_info->lock);
4842        if (!list_empty(&space_info->priority_tickets))
4843                ticket = list_first_entry(&space_info->priority_tickets,
4844                                          struct reserve_ticket, list);
4845        else if (!list_empty(&space_info->tickets))
4846                ticket = list_first_entry(&space_info->tickets,
4847                                          struct reserve_ticket, list);
4848        bytes_needed = (ticket) ? ticket->bytes : 0;
4849        spin_unlock(&space_info->lock);
4850
4851        if (!bytes_needed)
4852                return 0;
4853
4854        /* See if there is enough pinned space to make this reservation */
4855        if (__percpu_counter_compare(&space_info->total_bytes_pinned,
4856                                   bytes_needed,
4857                                   BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
4858                goto commit;
4859
4860        /*
4861         * See if there is some space in the delayed insertion reservation for
4862         * this reservation.
4863         */
4864        if (space_info != delayed_rsv->space_info)
4865                return -ENOSPC;
4866
4867        spin_lock(&delayed_rsv->lock);
4868        reclaim_bytes += delayed_rsv->reserved;
4869        spin_unlock(&delayed_rsv->lock);
4870
4871        spin_lock(&delayed_refs_rsv->lock);
4872        reclaim_bytes += delayed_refs_rsv->reserved;
4873        spin_unlock(&delayed_refs_rsv->lock);
4874        if (reclaim_bytes >= bytes_needed)
4875                goto commit;
4876        bytes_needed -= reclaim_bytes;
4877
4878        if (__percpu_counter_compare(&space_info->total_bytes_pinned,
4879                                   bytes_needed,
4880                                   BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
4881                return -ENOSPC;
4882        }
4883
4884commit:
4885        trans = btrfs_join_transaction(fs_info->extent_root);
4886        if (IS_ERR(trans))
4887                return -ENOSPC;
4888
4889        return btrfs_commit_transaction(trans);
4890}
4891
4892/*
4893 * Try to flush some data based on policy set by @state. This is only advisory
4894 * and may fail for various reasons. The caller is supposed to examine the
4895 * state of @space_info to detect the outcome.
4896 */
4897static void flush_space(struct btrfs_fs_info *fs_info,
4898                       struct btrfs_space_info *space_info, u64 num_bytes,
4899                       int state)
4900{
4901        struct btrfs_root *root = fs_info->extent_root;
4902        struct btrfs_trans_handle *trans;
4903        int nr;
4904        int ret = 0;
4905
4906        switch (state) {
4907        case FLUSH_DELAYED_ITEMS_NR:
4908        case FLUSH_DELAYED_ITEMS:
4909                if (state == FLUSH_DELAYED_ITEMS_NR)
4910                        nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
4911                else
4912                        nr = -1;
4913
4914                trans = btrfs_join_transaction(root);
4915                if (IS_ERR(trans)) {
4916                        ret = PTR_ERR(trans);
4917                        break;
4918                }
4919                ret = btrfs_run_delayed_items_nr(trans, nr);
4920                btrfs_end_transaction(trans);
4921                break;
4922        case FLUSH_DELALLOC:
4923        case FLUSH_DELALLOC_WAIT:
4924                shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
4925                                state == FLUSH_DELALLOC_WAIT);
4926                break;
4927        case FLUSH_DELAYED_REFS_NR:
4928        case FLUSH_DELAYED_REFS:
4929                trans = btrfs_join_transaction(root);
4930                if (IS_ERR(trans)) {
4931                        ret = PTR_ERR(trans);
4932                        break;
4933                }
4934                if (state == FLUSH_DELAYED_REFS_NR)
4935                        nr = calc_reclaim_items_nr(fs_info, num_bytes);
4936                else
4937                        nr = 0;
4938                btrfs_run_delayed_refs(trans, nr);
4939                btrfs_end_transaction(trans);
4940                break;
4941        case ALLOC_CHUNK:
4942                trans = btrfs_join_transaction(root);
4943                if (IS_ERR(trans)) {
4944                        ret = PTR_ERR(trans);
4945                        break;
4946                }
4947                ret = do_chunk_alloc(trans,
4948                                     btrfs_metadata_alloc_profile(fs_info),
4949                                     CHUNK_ALLOC_NO_FORCE);
4950                btrfs_end_transaction(trans);
4951                if (ret > 0 || ret == -ENOSPC)
4952                        ret = 0;
4953                break;
4954        case COMMIT_TRANS:
4955                /*
4956                 * If we have pending delayed iputs then we could free up a
4957                 * bunch of pinned space, so make sure we run the iputs before
4958                 * we do our pinned bytes check below.
4959                 */
4960                mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
4961                btrfs_run_delayed_iputs(fs_info);
4962                mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
4963
4964                ret = may_commit_transaction(fs_info, space_info);
4965                break;
4966        default:
4967                ret = -ENOSPC;
4968                break;
4969        }
4970
4971        trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
4972                                ret);
4973        return;
4974}
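
/*
 * The states above are escalated in the order they appear in the switch: the
 * async reclaim worker below starts at FLUSH_DELAYED_ITEMS_NR and only moves
 * on to the next, more expensive state (delalloc writeback, delayed refs,
 * chunk allocation, finally a transaction commit) when a pass made no
 * progress on the ticket queue.  Priority flushers walk the same ladder but
 * skip the delalloc states to avoid deadlocking.
 */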
4975
4976static inline u64
4977btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
4978                                 struct btrfs_space_info *space_info,
4979                                 bool system_chunk)
4980{
4981        struct reserve_ticket *ticket;
4982        u64 used;
4983        u64 expected;
4984        u64 to_reclaim = 0;
4985
4986        list_for_each_entry(ticket, &space_info->tickets, list)
4987                to_reclaim += ticket->bytes;
4988        list_for_each_entry(ticket, &space_info->priority_tickets, list)
4989                to_reclaim += ticket->bytes;
4990        if (to_reclaim)
4991                return to_reclaim;
4992
4993        to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
4994        if (can_overcommit(fs_info, space_info, to_reclaim,
4995                           BTRFS_RESERVE_FLUSH_ALL, system_chunk))
4996                return 0;
4997
4998        used = btrfs_space_info_used(space_info, true);
4999
5000        if (can_overcommit(fs_info, space_info, SZ_1M,
5001                           BTRFS_RESERVE_FLUSH_ALL, system_chunk))
5002                expected = div_factor_fine(space_info->total_bytes, 95);
5003        else
5004                expected = div_factor_fine(space_info->total_bytes, 90);
5005
5006        if (used > expected)
5007                to_reclaim = used - expected;
5008        else
5009                to_reclaim = 0;
5010        to_reclaim = min(to_reclaim, space_info->bytes_may_use +
5011                                     space_info->bytes_reserved);
5012        return to_reclaim;
5013}
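
/*
 * Worked example for the fallback path above (illustrative numbers): with no
 * tickets queued, total_bytes = 10 GiB, used = 9.8 GiB and no room left to
 * overcommit, expected is 90% of total (9 GiB), so to_reclaim comes out to
 * 0.8 GiB, clamped to what is actually reclaimable (bytes_may_use +
 * bytes_reserved).
 */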
5014
5015static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
5016                                        struct btrfs_space_info *space_info,
5017                                        u64 used, bool system_chunk)
5018{
5019        u64 thresh = div_factor_fine(space_info->total_bytes, 98);
5020
5021        /* If we're just plain full then async reclaim just slows us down. */
5022        if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
5023                return 0;
5024
5025        if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5026                                              system_chunk))
5027                return 0;
5028
5029        return (used >= thresh && !btrfs_fs_closing(fs_info) &&
5030                !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
5031}
5032
5033static void wake_all_tickets(struct list_head *head)
5034{
5035        struct reserve_ticket *ticket;
5036
5037        while (!list_empty(head)) {
5038                ticket = list_first_entry(head, struct reserve_ticket, list);
5039                list_del_init(&ticket->list);
5040                ticket->error = -ENOSPC;
5041                wake_up(&ticket->wait);
5042        }
5043}
5044
5045/*
5046 * This is for normal flushers, we can wait all goddamned day if we want to.  We
5047 * will loop and continuously try to flush as long as we are making progress.
5048 * We count progress as clearing off tickets each time we have to loop.
5049 */
5050static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
5051{
5052        struct btrfs_fs_info *fs_info;
5053        struct btrfs_space_info *space_info;
5054        u64 to_reclaim;
5055        int flush_state;
5056        int commit_cycles = 0;
5057        u64 last_tickets_id;
5058
5059        fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
5060        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5061
5062        spin_lock(&space_info->lock);
5063        to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5064                                                      false);
5065        if (!to_reclaim) {
5066                space_info->flush = 0;
5067                spin_unlock(&space_info->lock);
5068                return;
5069        }
5070        last_tickets_id = space_info->tickets_id;
5071        spin_unlock(&space_info->lock);
5072
5073        flush_state = FLUSH_DELAYED_ITEMS_NR;
5074        do {
5075                flush_space(fs_info, space_info, to_reclaim, flush_state);
5076                spin_lock(&space_info->lock);
5077                if (list_empty(&space_info->tickets)) {
5078                        space_info->flush = 0;
5079                        spin_unlock(&space_info->lock);
5080                        return;
5081                }
5082                to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
5083                                                              space_info,
5084                                                              false);
5085                if (last_tickets_id == space_info->tickets_id) {
5086                        flush_state++;
5087                } else {
5088                        last_tickets_id = space_info->tickets_id;
5089                        flush_state = FLUSH_DELAYED_ITEMS_NR;
5090                        if (commit_cycles)
5091                                commit_cycles--;
5092                }
5093
5094                if (flush_state > COMMIT_TRANS) {
5095                        commit_cycles++;
5096                        if (commit_cycles > 2) {
5097                                wake_all_tickets(&space_info->tickets);
5098                                space_info->flush = 0;
5099                        } else {
5100                                flush_state = FLUSH_DELAYED_ITEMS_NR;
5101                        }
5102                }
5103                spin_unlock(&space_info->lock);
5104        } while (flush_state <= COMMIT_TRANS);
5105}
5106
5107void btrfs_init_async_reclaim_work(struct work_struct *work)
5108{
5109        INIT_WORK(work, btrfs_async_reclaim_metadata_space);
5110}
5111
5112static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
5113                                            struct btrfs_space_info *space_info,
5114                                            struct reserve_ticket *ticket)
5115{
5116        u64 to_reclaim;
5117        int flush_state = FLUSH_DELAYED_ITEMS_NR;
5118
5119        spin_lock(&space_info->lock);
5120        to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
5121                                                      false);
5122        if (!to_reclaim) {
5123                spin_unlock(&space_info->lock);
5124                return;
5125        }
5126        spin_unlock(&space_info->lock);
5127
5128        do {
5129                flush_space(fs_info, space_info, to_reclaim, flush_state);
5130                flush_state++;
5131                spin_lock(&space_info->lock);
5132                if (ticket->bytes == 0) {
5133                        spin_unlock(&space_info->lock);
5134                        return;
5135                }
5136                spin_unlock(&space_info->lock);
5137
5138                /*
5139                 * Priority flushers can't wait on delalloc without
5140                 * deadlocking.
5141                 */
5142                if (flush_state == FLUSH_DELALLOC ||
5143                    flush_state == FLUSH_DELALLOC_WAIT)
5144                        flush_state = ALLOC_CHUNK;
5145        } while (flush_state < COMMIT_TRANS);
5146}
5147
5148static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
5149                               struct btrfs_space_info *space_info,
5150                               struct reserve_ticket *ticket, u64 orig_bytes)
5152{
5153        DEFINE_WAIT(wait);
5154        int ret = 0;
5155
5156        spin_lock(&space_info->lock);
5157        while (ticket->bytes > 0 && ticket->error == 0) {
5158                ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
5159                if (ret) {
5160                        ret = -EINTR;
5161                        break;
5162                }
5163                spin_unlock(&space_info->lock);
5164
5165                schedule();
5166
5167                finish_wait(&ticket->wait, &wait);
5168                spin_lock(&space_info->lock);
5169        }
5170        if (!ret)
5171                ret = ticket->error;
5172        if (!list_empty(&ticket->list))
5173                list_del_init(&ticket->list);
5174        if (ticket->bytes && ticket->bytes < orig_bytes) {
5175                u64 num_bytes = orig_bytes - ticket->bytes;
5176                update_bytes_may_use(space_info, -num_bytes);
5177                trace_btrfs_space_reservation(fs_info, "space_info",
5178                                              space_info->flags, num_bytes, 0);
5179        }
5180        spin_unlock(&space_info->lock);
5181
5182        return ret;
5183}
5184
5185/**
5186 * __reserve_metadata_bytes - try to reserve bytes from a space_info
5187 * @fs_info - the fs_info for our fs
5188 * @space_info - the space info we want to allocate from
5189 * @orig_bytes - the number of bytes we want
5190 * @flush - whether or not we can flush to make our reservation
5191 * @system_chunk - whether this reservation is for the chunk root (system space)
5192 *
5193 * This will reserve orig_bytes number of bytes from @space_info.  If there is
5194 * not enough space it will make an attempt to flush out space to make room,
5195 * by flushing delalloc if possible or committing the transaction.  If @flush
5196 * is BTRFS_RESERVE_NO_FLUSH then no attempts to regain reservations will be
5197 * made and this will fail if there is not enough space already.
5198 */
5199static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
5200                                    struct btrfs_space_info *space_info,
5201                                    u64 orig_bytes,
5202                                    enum btrfs_reserve_flush_enum flush,
5203                                    bool system_chunk)
5204{
5205        struct reserve_ticket ticket;
5206        u64 used;
5207        int ret = 0;
5208
5209        ASSERT(orig_bytes);
5210        ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
5211
5212        spin_lock(&space_info->lock);
5213        ret = -ENOSPC;
5214        used = btrfs_space_info_used(space_info, true);
5215
5216        /*
5217         * If we have enough space then hooray, make our reservation and carry
5218         * on.  If not see if we can overcommit, and if we can, hooray carry on.
5219         * If not things get more complicated.
5220         */
5221        if (used + orig_bytes <= space_info->total_bytes) {
5222                update_bytes_may_use(space_info, orig_bytes);
5223                trace_btrfs_space_reservation(fs_info, "space_info",
5224                                              space_info->flags, orig_bytes, 1);
5225                ret = 0;
5226        } else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
5227                                  system_chunk)) {
5228                update_bytes_may_use(space_info, orig_bytes);
5229                trace_btrfs_space_reservation(fs_info, "space_info",
5230                                              space_info->flags, orig_bytes, 1);
5231                ret = 0;
5232        }
5233
5234        /*
5235         * If we couldn't make a reservation then setup our reservation ticket
5236         * and kick the async worker if it's not already running.
5237         *
5238         * If we are a priority flusher then we just need to add our ticket to
5239         * the list and we will do our own flushing further down.
5240         */
5241        if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
5242                ticket.bytes = orig_bytes;
5243                ticket.error = 0;
5244                init_waitqueue_head(&ticket.wait);
5245                if (flush == BTRFS_RESERVE_FLUSH_ALL) {
5246                        list_add_tail(&ticket.list, &space_info->tickets);
5247                        if (!space_info->flush) {
5248                                space_info->flush = 1;
5249                                trace_btrfs_trigger_flush(fs_info,
5250                                                          space_info->flags,
5251                                                          orig_bytes, flush,
5252                                                          "enospc");
5253                                queue_work(system_unbound_wq,
5254                                           &fs_info->async_reclaim_work);
5255                        }
5256                } else {
5257                        list_add_tail(&ticket.list,
5258                                      &space_info->priority_tickets);
5259                }
5260        } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
5261                used += orig_bytes;
5262                /*
5263                 * We will do the space reservation dance during log replay,
5264                 * which means we won't have fs_info->fs_root set, so don't do
5265                 * the async reclaim as we will panic.
5266                 */
5267                if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
5268                    need_do_async_reclaim(fs_info, space_info,
5269                                          used, system_chunk) &&
5270                    !work_busy(&fs_info->async_reclaim_work)) {
5271                        trace_btrfs_trigger_flush(fs_info, space_info->flags,
5272                                                  orig_bytes, flush, "preempt");
5273                        queue_work(system_unbound_wq,
5274                                   &fs_info->async_reclaim_work);
5275                }
5276        }
5277        spin_unlock(&space_info->lock);
5278        if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
5279                return ret;
5280
5281        if (flush == BTRFS_RESERVE_FLUSH_ALL)
5282                return wait_reserve_ticket(fs_info, space_info, &ticket,
5283                                           orig_bytes);
5284
5285        ret = 0;
5286        priority_reclaim_metadata_space(fs_info, space_info, &ticket);
5287        spin_lock(&space_info->lock);
5288        if (ticket.bytes) {
5289                if (ticket.bytes < orig_bytes) {
5290                        u64 num_bytes = orig_bytes - ticket.bytes;
5291                        update_bytes_may_use(space_info, -num_bytes);
5292                        trace_btrfs_space_reservation(fs_info, "space_info",
5293                                                      space_info->flags,
5294                                                      num_bytes, 0);
5295
5296                }
5297                list_del_init(&ticket.list);
5298                ret = -ENOSPC;
5299        }
5300        spin_unlock(&space_info->lock);
5301        ASSERT(list_empty(&ticket.list));
5302        return ret;
5303}
5304
5305/**
5306 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
5307 * @root - the root we're allocating for
5308 * @block_rsv - the block_rsv we're allocating for
5309 * @orig_bytes - the number of bytes we want
5310 * @flush - whether or not we can flush to make our reservation
5311 *
5312 * This will reserve orig_bytes number of bytes from the space info associated
5313 * with the block_rsv.  If there is not enough space it will make an attempt to
5314 * flush out space to make room.  It will do this by flushing delalloc if
5315 * possible or committing the transaction.  If @flush is BTRFS_RESERVE_NO_FLUSH
5316 * then no attempts to regain reservations will be made and this will fail if
5317 * there is not enough space already.
5318 */
5319static int reserve_metadata_bytes(struct btrfs_root *root,
5320                                  struct btrfs_block_rsv *block_rsv,
5321                                  u64 orig_bytes,
5322                                  enum btrfs_reserve_flush_enum flush)
5323{
5324        struct btrfs_fs_info *fs_info = root->fs_info;
5325        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5326        int ret;
5327        bool system_chunk = (root == fs_info->chunk_root);
5328
5329        ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
5330                                       orig_bytes, flush, system_chunk);
5331        if (ret == -ENOSPC &&
5332            unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
5333                if (block_rsv != global_rsv &&
5334                    !block_rsv_use_bytes(global_rsv, orig_bytes))
5335                        ret = 0;
5336        }
5337        if (ret == -ENOSPC) {
5338                trace_btrfs_space_reservation(fs_info, "space_info:enospc",
5339                                              block_rsv->space_info->flags,
5340                                              orig_bytes, 1);
5341
5342                if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
5343                        dump_space_info(fs_info, block_rsv->space_info,
5344                                        orig_bytes, 0);
5345        }
5346        return ret;
5347}
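
/*
 * Typical use of reserve_metadata_bytes() in this file (compare
 * btrfs_delayed_refs_rsv_refill() further down): reserve from the rsv's
 * space_info first and only then credit the block rsv itself:
 *
 *	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 *	if (ret)
 *		return ret;
 *	block_rsv_add_bytes(block_rsv, num_bytes, false);
 *
 * keeping space_info->bytes_may_use in step with the bytes the rsv holds.
 */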
5348
5349static struct btrfs_block_rsv *get_block_rsv(
5350                                        const struct btrfs_trans_handle *trans,
5351                                        const struct btrfs_root *root)
5352{
5353        struct btrfs_fs_info *fs_info = root->fs_info;
5354        struct btrfs_block_rsv *block_rsv = NULL;
5355
5356        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
5357            (root == fs_info->csum_root && trans->adding_csums) ||
5358            (root == fs_info->uuid_root))
5359                block_rsv = trans->block_rsv;
5360
5361        if (!block_rsv)
5362                block_rsv = root->block_rsv;
5363
5364        if (!block_rsv)
5365                block_rsv = &fs_info->empty_block_rsv;
5366
5367        return block_rsv;
5368}
5369
5370static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
5371                               u64 num_bytes)
5372{
5373        int ret = -ENOSPC;
5374        spin_lock(&block_rsv->lock);
5375        if (block_rsv->reserved >= num_bytes) {
5376                block_rsv->reserved -= num_bytes;
5377                if (block_rsv->reserved < block_rsv->size)
5378                        block_rsv->full = 0;
5379                ret = 0;
5380        }
5381        spin_unlock(&block_rsv->lock);
5382        return ret;
5383}
5384
5385static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
5386                                u64 num_bytes, bool update_size)
5387{
5388        spin_lock(&block_rsv->lock);
5389        block_rsv->reserved += num_bytes;
5390        if (update_size)
5391                block_rsv->size += num_bytes;
5392        else if (block_rsv->reserved >= block_rsv->size)
5393                block_rsv->full = 1;
5394        spin_unlock(&block_rsv->lock);
5395}
5396
5397int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
5398                             struct btrfs_block_rsv *dest, u64 num_bytes,
5399                             int min_factor)
5400{
5401        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5402        u64 min_bytes;
5403
5404        if (global_rsv->space_info != dest->space_info)
5405                return -ENOSPC;
5406
5407        spin_lock(&global_rsv->lock);
5408        min_bytes = div_factor(global_rsv->size, min_factor);
5409        if (global_rsv->reserved < min_bytes + num_bytes) {
5410                spin_unlock(&global_rsv->lock);
5411                return -ENOSPC;
5412        }
5413        global_rsv->reserved -= num_bytes;
5414        if (global_rsv->reserved < global_rsv->size)
5415                global_rsv->full = 0;
5416        spin_unlock(&global_rsv->lock);
5417
5418        block_rsv_add_bytes(dest, num_bytes, true);
5419        return 0;
5420}
5421
5422/**
5423 * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
5424 * @fs_info - the fs info for our fs.
5425 * @src - the source block rsv to transfer from.
5426 * @num_bytes - the number of bytes to transfer.
5427 *
5428 * This transfers up to the num_bytes amount from the src rsv to the
5429 * delayed_refs_rsv.  Any extra bytes are returned to the space info.
5430 */
5431void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
5432                                       struct btrfs_block_rsv *src,
5433                                       u64 num_bytes)
5434{
5435        struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
5436        u64 to_free = 0;
5437
5438        spin_lock(&src->lock);
5439        src->reserved -= num_bytes;
5440        src->size -= num_bytes;
5441        spin_unlock(&src->lock);
5442
5443        spin_lock(&delayed_refs_rsv->lock);
5444        if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
5445                u64 delta = delayed_refs_rsv->size -
5446                        delayed_refs_rsv->reserved;
5447                if (num_bytes > delta) {
5448                        to_free = num_bytes - delta;
5449                        num_bytes = delta;
5450                }
5451        } else {
5452                to_free = num_bytes;
5453                num_bytes = 0;
5454        }
5455
5456        if (num_bytes)
5457                delayed_refs_rsv->reserved += num_bytes;
5458        if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
5459                delayed_refs_rsv->full = 1;
5460        spin_unlock(&delayed_refs_rsv->lock);
5461
5462        if (num_bytes)
5463                trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5464                                              0, num_bytes, 1);
5465        if (to_free)
5466                space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info,
5467                                         to_free);
5468}
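
/*
 * Illustrative walk-through (not part of the original source), assuming a
 * delayed_refs_rsv with size = 1M and reserved = 768K, i.e. delta = 256K:
 * migrating num_bytes = 512K from @src moves 256K into
 * delayed_refs_rsv->reserved (marking it full) and hands the remaining
 * to_free = 256K back to the space info via space_info_add_old_bytes().
 */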
5469
5470/**
5471 * btrfs_delayed_refs_rsv_refill - refill based on our delayed refs usage.
5472 * @fs_info - the fs_info for our fs.
5473 * @flush - control how we can flush for this reservation.
5474 *
5475 * This will refill the delayed refs block_rsv up to one item's worth of space and
5476 * will return -ENOSPC if we can't make the reservation.
5477 */
5478int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
5479                                  enum btrfs_reserve_flush_enum flush)
5480{
5481        struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
5482        u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1);
5483        u64 num_bytes = 0;
5484        int ret = -ENOSPC;
5485
5486        spin_lock(&block_rsv->lock);
5487        if (block_rsv->reserved < block_rsv->size) {
5488                num_bytes = block_rsv->size - block_rsv->reserved;
5489                num_bytes = min(num_bytes, limit);
5490        }
5491        spin_unlock(&block_rsv->lock);
5492
5493        if (!num_bytes)
5494                return 0;
5495
5496        ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv,
5497                                     num_bytes, flush);
5498        if (ret)
5499                return ret;
5500        block_rsv_add_bytes(block_rsv, num_bytes, false);
5501        trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5502                                      0, num_bytes, 1);
5503        return 0;
5504}
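
/*
 * Sketch of a typical call site (hypothetical, for illustration only): top up
 * the delayed refs rsv before queuing more delayed refs, allowing only
 * limited flushing so we cannot deadlock against ourselves.
 *
 *	ret = btrfs_delayed_refs_rsv_refill(fs_info, BTRFS_RESERVE_FLUSH_LIMIT);
 *	if (ret)
 *		return ret;	/* -ENOSPC: not even one item's worth free */
 */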
5505
5506/*
5507 * This is for space we already have accounted in space_info->bytes_may_use, so
5508 * basically when we're returning space from block_rsv's.
5509 */
5510static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
5511                                     struct btrfs_space_info *space_info,
5512                                     u64 num_bytes)
5513{
5514        struct reserve_ticket *ticket;
5515        struct list_head *head;
5516        u64 used;
5517        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
5518        bool check_overcommit = false;
5519
5520        spin_lock(&space_info->lock);
5521        head = &space_info->priority_tickets;
5522
5523        /*
5524         * If we are over our limit then we need to check and see if we can
5525         * overcommit, and if we can't then we just need to free up our space
5526         * and not satisfy any requests.
5527         */
5528        used = btrfs_space_info_used(space_info, true);
5529        if (used - num_bytes >= space_info->total_bytes)
5530                check_overcommit = true;
5531again:
5532        while (!list_empty(head) && num_bytes) {
5533                ticket = list_first_entry(head, struct reserve_ticket,
5534                                          list);
5535                /*
5536                 * We use 0 bytes because this space is already reserved, so
5537                 * adding the ticket space would be a double count.
5538                 */
5539                if (check_overcommit &&
5540                    !can_overcommit(fs_info, space_info, 0, flush, false))
5541                        break;
5542                if (num_bytes >= ticket->bytes) {
5543                        list_del_init(&ticket->list);
5544                        num_bytes -= ticket->bytes;
5545                        ticket->bytes = 0;
5546                        space_info->tickets_id++;
5547                        wake_up(&ticket->wait);
5548                } else {
5549                        ticket->bytes -= num_bytes;
5550                        num_bytes = 0;
5551                }
5552        }
5553
5554        if (num_bytes && head == &space_info->priority_tickets) {
5555                head = &space_info->tickets;
5556                flush = BTRFS_RESERVE_FLUSH_ALL;
5557                goto again;
5558        }
5559        update_bytes_may_use(space_info, -num_bytes);
5560        trace_btrfs_space_reservation(fs_info, "space_info",
5561                                      space_info->flags, num_bytes, 0);
5562        spin_unlock(&space_info->lock);
5563}
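
/*
 * Worked example (illustrative, not from the original source): returning
 * num_bytes = 1M while a 640K priority ticket and a 512K regular ticket are
 * waiting.  The priority ticket is satisfied in full and woken (384K left),
 * then we move to the regular ticket list and knock the 512K ticket down to
 * 128K.  num_bytes ends at 0, so bytes_may_use is left untouched and the
 * remaining ticket keeps waiting for its last 128K.
 */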
5564
5565/*
5566 * This is for newly allocated space that isn't accounted in
5567 * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
5568 * we use this helper.
5569 */
5570static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
5571                                     struct btrfs_space_info *space_info,
5572                                     u64 num_bytes)
5573{
5574        struct reserve_ticket *ticket;
5575        struct list_head *head = &space_info->priority_tickets;
5576
5577again:
5578        while (!list_empty(head) && num_bytes) {
5579                ticket = list_first_entry(head, struct reserve_ticket,
5580                                          list);
5581                if (num_bytes >= ticket->bytes) {
5582                        trace_btrfs_space_reservation(fs_info, "space_info",
5583                                                      space_info->flags,
5584                                                      ticket->bytes, 1);
5585                        list_del_init(&ticket->list);
5586                        num_bytes -= ticket->bytes;
5587                        update_bytes_may_use(space_info, ticket->bytes);
5588                        ticket->bytes = 0;
5589                        space_info->tickets_id++;
5590                        wake_up(&ticket->wait);
5591                } else {
5592                        trace_btrfs_space_reservation(fs_info, "space_info",
5593                                                      space_info->flags,
5594                                                      num_bytes, 1);
5595                        update_bytes_may_use(space_info, num_bytes);
5596                        ticket->bytes -= num_bytes;
5597                        num_bytes = 0;
5598                }
5599        }
5600
5601        if (num_bytes && head == &space_info->priority_tickets) {
5602                head = &space_info->tickets;
5603                goto again;
5604        }
5605}
5606
5607static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
5608                                    struct btrfs_block_rsv *block_rsv,
5609                                    struct btrfs_block_rsv *dest, u64 num_bytes,
5610                                    u64 *qgroup_to_release_ret)
5611{
5612        struct btrfs_space_info *space_info = block_rsv->space_info;
5613        u64 qgroup_to_release = 0;
5614        u64 ret;
5615
5616        spin_lock(&block_rsv->lock);
5617        if (num_bytes == (u64)-1) {
5618                num_bytes = block_rsv->size;
5619                qgroup_to_release = block_rsv->qgroup_rsv_size;
5620        }
5621        block_rsv->size -= num_bytes;
5622        if (block_rsv->reserved >= block_rsv->size) {
5623                num_bytes = block_rsv->reserved - block_rsv->size;
5624                block_rsv->reserved = block_rsv->size;
5625                block_rsv->full = 1;
5626        } else {
5627                num_bytes = 0;
5628        }
5629        if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
5630                qgroup_to_release = block_rsv->qgroup_rsv_reserved -
5631                                    block_rsv->qgroup_rsv_size;
5632                block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
5633        } else {
5634                qgroup_to_release = 0;
5635        }
5636        spin_unlock(&block_rsv->lock);
5637
5638        ret = num_bytes;
5639        if (num_bytes > 0) {
5640                if (dest) {
5641                        spin_lock(&dest->lock);
5642                        if (!dest->full) {
5643                                u64 bytes_to_add;
5644
5645                                bytes_to_add = dest->size - dest->reserved;
5646                                bytes_to_add = min(num_bytes, bytes_to_add);
5647                                dest->reserved += bytes_to_add;
5648                                if (dest->reserved >= dest->size)
5649                                        dest->full = 1;
5650                                num_bytes -= bytes_to_add;
5651                        }
5652                        spin_unlock(&dest->lock);
5653                }
5654                if (num_bytes)
5655                        space_info_add_old_bytes(fs_info, space_info,
5656                                                 num_bytes);
5657        }
5658        if (qgroup_to_release_ret)
5659                *qgroup_to_release_ret = qgroup_to_release;
5660        return ret;
5661}
5662
5663int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
5664                            struct btrfs_block_rsv *dst, u64 num_bytes,
5665                            bool update_size)
5666{
5667        int ret;
5668
5669        ret = block_rsv_use_bytes(src, num_bytes);
5670        if (ret)
5671                return ret;
5672
5673        block_rsv_add_bytes(dst, num_bytes, update_size);
5674        return 0;
5675}
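
/*
 * Usage sketch (hypothetical): move nbytes from the global rsv into a
 * private rsv, growing the destination's size as well.  Fails with -ENOSPC
 * if the source does not have nbytes reserved.
 *
 *	ret = btrfs_block_rsv_migrate(&fs_info->global_block_rsv, my_rsv,
 *				      nbytes, true);
 */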
5676
5677void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
5678{
5679        memset(rsv, 0, sizeof(*rsv));
5680        spin_lock_init(&rsv->lock);
5681        rsv->type = type;
5682}
5683
5684void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
5685                                   struct btrfs_block_rsv *rsv,
5686                                   unsigned short type)
5687{
5688        btrfs_init_block_rsv(rsv, type);
5689        rsv->space_info = __find_space_info(fs_info,
5690                                            BTRFS_BLOCK_GROUP_METADATA);
5691}
5692
5693struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
5694                                              unsigned short type)
5695{
5696        struct btrfs_block_rsv *block_rsv;
5697
5698        block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
5699        if (!block_rsv)
5700                return NULL;
5701
5702        btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
5703        return block_rsv;
5704}
5705
5706void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
5707                          struct btrfs_block_rsv *rsv)
5708{
5709        if (!rsv)
5710                return;
5711        btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
5712        kfree(rsv);
5713}
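
/*
 * Lifecycle sketch for a temporary rsv (illustrative only; error handling
 * trimmed).  btrfs_free_block_rsv() releases whatever is still reserved
 * before freeing the structure itself.
 *
 *	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
 *	if (!rsv)
 *		return -ENOMEM;
 *	ret = btrfs_block_rsv_add(root, rsv, nbytes, BTRFS_RESERVE_FLUSH_ALL);
 *	...
 *	btrfs_free_block_rsv(fs_info, rsv);
 */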
5714
5715int btrfs_block_rsv_add(struct btrfs_root *root,
5716                        struct btrfs_block_rsv *block_rsv, u64 num_bytes,
5717                        enum btrfs_reserve_flush_enum flush)
5718{
5719        int ret;
5720
5721        if (num_bytes == 0)
5722                return 0;
5723
5724        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5725        if (!ret)
5726                block_rsv_add_bytes(block_rsv, num_bytes, true);
5727
5728        return ret;
5729}
5730
5731int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
5732{
5733        u64 num_bytes = 0;
5734        int ret = -ENOSPC;
5735
5736        if (!block_rsv)
5737                return 0;
5738
5739        spin_lock(&block_rsv->lock);
5740        num_bytes = div_factor(block_rsv->size, min_factor);
5741        if (block_rsv->reserved >= num_bytes)
5742                ret = 0;
5743        spin_unlock(&block_rsv->lock);
5744
5745        return ret;
5746}
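
/*
 * Worked example (illustrative): div_factor() scales by min_factor/10, so
 * btrfs_block_rsv_check(rsv, 8) succeeds only while at least 80% of the
 * rsv's size is still reserved.  With size = 10M that means reserved must
 * be >= 8M, otherwise -ENOSPC is returned and the caller should refill.
 */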
5747
5748int btrfs_block_rsv_refill(struct btrfs_root *root,
5749                           struct btrfs_block_rsv *block_rsv, u64 min_reserved,
5750                           enum btrfs_reserve_flush_enum flush)
5751{
5752        u64 num_bytes = 0;
5753        int ret = -ENOSPC;
5754
5755        if (!block_rsv)
5756                return 0;
5757
5758        spin_lock(&block_rsv->lock);
5759        num_bytes = min_reserved;
5760        if (block_rsv->reserved >= num_bytes)
5761                ret = 0;
5762        else
5763                num_bytes -= block_rsv->reserved;
5764        spin_unlock(&block_rsv->lock);
5765
5766        if (!ret)
5767                return 0;
5768
5769        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5770        if (!ret) {
5771                block_rsv_add_bytes(block_rsv, num_bytes, false);
5772                return 0;
5773        }
5774
5775        return ret;
5776}
5777
5778/**
5779 * btrfs_inode_rsv_refill - refill the inode block rsv.
5780 * @inode - the inode we are refilling.
5781 * @flush - the flushing restriction.
5782 *
5783 * Essentially the same as btrfs_block_rsv_refill, except it uses the
5784 * block_rsv->size as the minimum size.  We'll either refill the missing amount
5785 * or return if we already have enough space.  This will also handle the reserve
5786 * tracepoint for the reserved amount.
5787 */
5788static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
5789                                  enum btrfs_reserve_flush_enum flush)
5790{
5791        struct btrfs_root *root = inode->root;
5792        struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5793        u64 num_bytes = 0;
5794        u64 qgroup_num_bytes = 0;
5795        int ret = -ENOSPC;
5796
5797        spin_lock(&block_rsv->lock);
5798        if (block_rsv->reserved < block_rsv->size)
5799                num_bytes = block_rsv->size - block_rsv->reserved;
5800        if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
5801                qgroup_num_bytes = block_rsv->qgroup_rsv_size -
5802                                   block_rsv->qgroup_rsv_reserved;
5803        spin_unlock(&block_rsv->lock);
5804
5805        if (num_bytes == 0)
5806                return 0;
5807
5808        ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
5809        if (ret)
5810                return ret;
5811        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
5812        if (!ret) {
5813                block_rsv_add_bytes(block_rsv, num_bytes, false);
5814                trace_btrfs_space_reservation(root->fs_info, "delalloc",
5815                                              btrfs_ino(inode), num_bytes, 1);
5816
5817                /* Don't forget to increase qgroup_rsv_reserved */
5818                spin_lock(&block_rsv->lock);
5819                block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
5820                spin_unlock(&block_rsv->lock);
5821        } else {
5822                btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
            }
5823        return ret;
5824}
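
/*
 * Worked example (illustrative): with block_rsv->size = 768K and
 * block_rsv->reserved = 512K we try to reserve the missing 256K, and with
 * qgroup_rsv_size = 64K against qgroup_rsv_reserved = 48K we first prealloc
 * 16K of qgroup metadata.  If reserve_metadata_bytes() then fails, that 16K
 * qgroup prealloc is freed again so the two reservations stay in sync.
 */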
5825
5826static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5827                                     struct btrfs_block_rsv *block_rsv,
5828                                     u64 num_bytes, u64 *qgroup_to_release)
5829{
5830        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5831        struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
5832        struct btrfs_block_rsv *target = delayed_rsv;
5833
5834        if (target->full || target == block_rsv)
5835                target = global_rsv;
5836
5837        if (block_rsv->space_info != target->space_info)
5838                target = NULL;
5839
5840        return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
5841                                       qgroup_to_release);
5842}
5843
5844void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
5845                             struct btrfs_block_rsv *block_rsv,
5846                             u64 num_bytes)
5847{
5848        __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
5849}
5850
5851/**
5852 * btrfs_inode_rsv_release - release any excessive reservation.
5853 * @inode - the inode we need to release from.
5854 * @qgroup_free - free or convert qgroup meta.
5855 *   Unlike normal operation, qgroup meta reservation needs to know if we are
5856 *   freeing qgroup reservation or just converting it into per-trans.  Normally
5857 *   @qgroup_free is true for error handling, and false for normal release.
5858 *
5859 * This is the same as btrfs_block_rsv_release, except that it handles the
5860 * tracepoint for the reservation.
5861 */
5862static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
5863{
5864        struct btrfs_fs_info *fs_info = inode->root->fs_info;
5865        struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
5866        u64 released = 0;
5867        u64 qgroup_to_release = 0;
5868
5869        /*
5870         * Since we statically set the block_rsv->size we just want to say we
5871         * are releasing 0 bytes, and then we'll just get the reservation over
5872         * the size freed.
5873         */
5874        released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
5875                                             &qgroup_to_release);
5876        if (released > 0)
5877                trace_btrfs_space_reservation(fs_info, "delalloc",
5878                                              btrfs_ino(inode), released, 0);
5879        if (qgroup_free)
5880                btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
5881        else
5882                btrfs_qgroup_convert_reserved_meta(inode->root,
5883                                                   qgroup_to_release);
5884}
5885
5886/**
5887 * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
5888 * @fs_info - the fs_info for our fs.
5889 * @nr - the number of items to drop.
5890 *
5891 * This drops the delayed ref head's count from the delayed refs rsv and frees
5892 * any excess reservation we had.
5893 */
5894void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
5895{
5896        struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
5897        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5898        u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr);
5899        u64 released = 0;
5900
5901        released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv,
5902                                           num_bytes, NULL);
5903        if (released)
5904                trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
5905                                              0, released, 0);
5906}
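
/*
 * Rough sizing note (illustrative, assuming the usual definition of
 * btrfs_calc_trans_metadata_size() of nodesize * BTRFS_MAX_LEVEL * 2 per
 * item): on a 16K-nodesize filesystem one item accounts for 256K, so
 * releasing nr = 4 ref heads hands back up to 1M, first to the global rsv
 * and then to the space info once the global rsv is full.
 */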
5907
5908static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
5909{
5910        struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
5911        struct btrfs_space_info *sinfo = block_rsv->space_info;
5912        u64 num_bytes;
5913
5914        /*
5915         * The global block rsv is based on the size of the extent tree, the
5916         * checksum tree and the root tree.  If the fs is empty we want to set
5917         * it to a minimal amount for safety.
5918         */
5919        num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
5920                btrfs_root_used(&fs_info->csum_root->root_item) +
5921                btrfs_root_used(&fs_info->tree_root->root_item);
5922        num_bytes = max_t(u64, num_bytes, SZ_16M);
5923
5924        spin_lock(&sinfo->lock);
5925        spin_lock(&block_rsv->lock);
5926
5927        block_rsv->size = min_t(u64, num_bytes, SZ_512M);
5928
5929        if (block_rsv->reserved < block_rsv->size) {
5930                num_bytes = btrfs_space_info_used(sinfo, true);
5931                if (sinfo->total_bytes > num_bytes) {
5932                        num_bytes = sinfo->total_bytes - num_bytes;
5933                        num_bytes = min(num_bytes,
5934                                        block_rsv->size - block_rsv->reserved);
5935                        block_rsv->reserved += num_bytes;
5936                        update_bytes_may_use(sinfo, num_bytes);
5937                        trace_btrfs_space_reservation(fs_info, "space_info",
5938                                                      sinfo->flags, num_bytes,
5939                                                      1);
5940                }
5941        } else if (block_rsv->reserved > block_rsv->size) {
5942                num_bytes = block_rsv->reserved - block_rsv->size;
5943                update_bytes_may_use(sinfo, -num_bytes);
5944                trace_btrfs_space_reservation(fs_info, "space_info",
5945                                      sinfo->flags, num_bytes, 0);
5946                block_rsv->reserved = block_rsv->size;
5947        }
5948
5949        if (block_rsv->reserved == block_rsv->size)
5950                block_rsv->full = 1;
5951        else
5952                block_rsv->full = 0;
5953
5954        spin_unlock(&block_rsv->lock);
5955        spin_unlock(&sinfo->lock);
5956}
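
/*
 * Worked example (illustrative): if the extent, csum and root trees together
 * use 40M of metadata, the global rsv size becomes
 * min(max(40M, 16M), 512M) = 40M.  A freshly created filesystem is clamped
 * up to the 16M floor, while a very large one is capped at 512M.
 */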
5957
5958static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
5959{
5960        struct btrfs_space_info *space_info;
5961
5962        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
5963        fs_info->chunk_block_rsv.space_info = space_info;
5964
5965        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
5966        fs_info->global_block_rsv.space_info = space_info;
5967        fs_info->trans_block_rsv.space_info = space_info;
5968        fs_info->empty_block_rsv.space_info = space_info;
5969        fs_info->delayed_block_rsv.space_info = space_info;
5970        fs_info->delayed_refs_rsv.space_info = space_info;
5971
5972        fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
5973        fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
5974        fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
5975        fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
5976        if (fs_info->quota_root)
5977                fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
5978        fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
5979
5980        update_global_block_rsv(fs_info);
5981}
5982
5983static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
5984{
5985        block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
5986                                (u64)-1, NULL);
5987        WARN_ON(fs_info->trans_block_rsv.size > 0);
5988        WARN_ON(fs_info->trans_block_rsv.reserved > 0);
5989        WARN_ON(fs_info->chunk_block_rsv.size > 0);
5990        WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
5991        WARN_ON(fs_info->delayed_block_rsv.size > 0);
5992        WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
5993        WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
5994        WARN_ON(fs_info->delayed_refs_rsv.size > 0);
5995}
5996
5997/*
5998 * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
5999 * @trans - the trans that may have generated delayed refs
6000 *
6001 * This is to be called any time we may have adjusted trans->delayed_ref_updates;
6002 * it will calculate the additional size and add it to the delayed_refs_rsv.
6003 */
6004void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
6005{
6006        struct btrfs_fs_info *fs_info = trans->fs_info;
6007        struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
6008        u64 num_bytes;
6009
6010        if (!trans->delayed_ref_updates)
6011                return;
6012
6013        num_bytes = btrfs_calc_trans_metadata_size(fs_info,
6014                                                   trans->delayed_ref_updates);
6015        spin_lock(&delayed_rsv->lock);
6016        delayed_rsv->size += num_bytes;
6017        delayed_rsv->full = 0;
6018        spin_unlock(&delayed_rsv->lock);
6019        trans->delayed_ref_updates = 0;
6020}
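
/*
 * Sketch of the expected calling pattern (hypothetical): any path that bumps
 * trans->delayed_ref_updates should follow up with this helper so the rsv
 * size grows to match, e.g.
 *
 *	trans->delayed_ref_updates++;
 *	btrfs_update_delayed_refs_rsv(trans);
 */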
6021
6022/*
6023 * To be called after all the new block groups attached to the transaction
6024 * handle have been created (btrfs_create_pending_block_groups()).
6025 */
6026void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
6027{
6028        struct btrfs_fs_info *fs_info = trans->fs_info;
6029
6030        if (!trans->chunk_bytes_reserved)
6031                return;
6032
6033        WARN_ON_ONCE(!list_empty(&trans->new_bgs));
6034
6035        block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
6036                                trans->chunk_bytes_reserved, NULL);
6037        trans->chunk_bytes_reserved = 0;
6038}
6039
6040/*
6041 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
6042 * root: the root of the parent directory
6043 * rsv: block reservation
6044 * items: the number of items we need to reserve
6045 * use_global_rsv: allow fallback to the global block reservation
6046 *
6047 * This function is used to reserve the space for snapshot/subvolume
6048 * creation and deletion. Those operations differ from the common
6049 * file/directory operations: they change two fs/file trees and the
6050 * root tree, and the number of items the qgroup reserves differs
6051 * from the free space reservation. So we cannot use the space
6052 * reservation mechanism in start_transaction().
6053 */
6054int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
6055                                     struct btrfs_block_rsv *rsv, int items,
6056                                     bool use_global_rsv)
6057{
6058        u64 qgroup_num_bytes = 0;
6059        u64 num_bytes;
6060        int ret;
6061        struct btrfs_fs_info *fs_info = root->fs_info;
6062        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6063
6064        if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
6065                /* One for parent inode, two for dir entries */
6066                qgroup_num_bytes = 3 * fs_info->nodesize;
6067                ret = btrfs_qgroup_reserve_meta_prealloc(root,
6068                                qgroup_num_bytes, true);
6069                if (ret)
6070                        return ret;
6071        }
6072
6073        num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
6074        rsv->space_info = __find_space_info(fs_info,
6075                                            BTRFS_BLOCK_GROUP_METADATA);
6076        ret = btrfs_block_rsv_add(root, rsv, num_bytes,
6077                                  BTRFS_RESERVE_FLUSH_ALL);
6078
6079        if (ret == -ENOSPC && use_global_rsv)
6080                ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
6081
6082        if (ret && qgroup_num_bytes)
6083                btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
6084
6085        return ret;
6086}
6087
6088void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
6089                                      struct btrfs_block_rsv *rsv)
6090{
6091        btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
6092}
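
/*
 * Usage sketch (hypothetical item count, for illustration): snapshot
 * creation reserves a handful of items up front, optionally falling back to
 * the global rsv, and releases whatever is left once the operation ends.
 *
 *	ret = btrfs_subvolume_reserve_metadata(root, &rsv, 8, true);
 *	if (ret)
 *		return ret;
 *	...
 *	btrfs_subvolume_release_metadata(fs_info, &rsv);
 */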
6093
6094static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
6095                                                 struct btrfs_inode *inode)
6096{
6097        struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
6098        u64 reserve_size = 0;
6099        u64 qgroup_rsv_size = 0;
6100        u64 csum_leaves;
6101        unsigned outstanding_extents;
6102
6103        lockdep_assert_held(&inode->lock);
6104        outstanding_extents = inode->outstanding_extents;
6105        if (outstanding_extents)
6106                reserve_size = btrfs_calc_trans_metadata_size(fs_info,
6107                                                outstanding_extents + 1);
6108        csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
6109                                                 inode->csum_bytes);
6110        reserve_size += btrfs_calc_trans_metadata_size(fs_info,
6111                                                       csum_leaves);
6112        /*
6113         * For qgroup rsv, the calculation is very simple:
6114         * account one nodesize for each outstanding extent
6115         *
6116         * This overestimates in most cases.
6117         */
6118        qgroup_rsv_size = outstanding_extents * fs_info->nodesize;
6119
6120        spin_lock(&block_rsv->lock);
6121        block_rsv->size = reserve_size;
6122        block_rsv->qgroup_rsv_size = qgroup_rsv_size;
6123        spin_unlock(&block_rsv->lock);
6124}
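
/*
 * Worked example (illustrative, 16K nodesize, assuming one metadata item
 * accounts for nodesize * BTRFS_MAX_LEVEL * 2 = 256K): an inode with 2
 * outstanding extents and enough csum_bytes for a single csum leaf gets
 * reserve_size = (2 + 1) * 256K + 1 * 256K = 1M, while the qgroup rsv is
 * simply 2 * 16K = 32K.
 */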
6125
6126int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
6127{
6128        struct btrfs_fs_info *fs_info = inode->root->fs_info;
6129        unsigned nr_extents;
6130        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
6131        int ret = 0;
6132        bool delalloc_lock = true;
6133
6134        /*
             * If we are a free space inode we must not flush, since we will be in
6135         * the middle of a transaction commit.  We also don't need the delalloc
6136         * mutex since we won't race with anybody.  We need this mostly to make
6137         * lockdep shut its filthy mouth.
6138         *
6139         * If we have a transaction open (can happen if we call truncate_block
6140         * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
6141         */
6142        if (btrfs_is_free_space_inode(inode)) {
6143                flush = BTRFS_RESERVE_NO_FLUSH;
6144                delalloc_lock = false;
6145        } else {
6146                if (current->journal_info)
6147                        flush = BTRFS_RESERVE_FLUSH_LIMIT;
6148
6149                if (btrfs_transaction_in_commit(fs_info))
6150                        schedule_timeout(1);
6151        }
6152
6153        if (delalloc_lock)
6154                mutex_lock(&inode->delalloc_mutex);
6155
6156        num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6157
6158        /* Add our new extents and calculate the new rsv size. */
6159        spin_lock(&inode->lock);
6160        nr_extents = count_max_extents(num_bytes);
6161        btrfs_mod_outstanding_extents(inode, nr_extents);
6162        inode->csum_bytes += num_bytes;
6163        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6164        spin_unlock(&inode->lock);
6165
6166        ret = btrfs_inode_rsv_refill(inode, flush);
6167        if (unlikely(ret))
6168                goto out_fail;
6169
6170        if (delalloc_lock)
6171                mutex_unlock(&inode->delalloc_mutex);
6172        return 0;
6173
6174out_fail:
6175        spin_lock(&inode->lock);
6176        nr_extents = count_max_extents(num_bytes);
6177        btrfs_mod_outstanding_extents(inode, -nr_extents);
6178        inode->csum_bytes -= num_bytes;
6179        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6180        spin_unlock(&inode->lock);
6181
6182        btrfs_inode_rsv_release(inode, true);
6183        if (delalloc_lock)
6184                mutex_unlock(&inode->delalloc_mutex);
6185        return ret;
6186}
6187
6188/**
6189 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
6190 * @inode: the inode to release the reservation for.
6191 * @num_bytes: the number of bytes we are releasing.
6192 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
6193 *
6194 * This will release the metadata reservation for an inode.  This can be called
6195 * once we complete IO for a given set of bytes to release their metadata
6196 * reservations, or on error for the same reason.
6197 */
6198void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
6199                                     bool qgroup_free)
6200{
6201        struct btrfs_fs_info *fs_info = inode->root->fs_info;
6202
6203        num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
6204        spin_lock(&inode->lock);
6205        inode->csum_bytes -= num_bytes;
6206        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6207        spin_unlock(&inode->lock);
6208
6209        if (btrfs_is_testing(fs_info))
6210                return;
6211
6212        btrfs_inode_rsv_release(inode, qgroup_free);
6213}
6214
6215/**
6216 * btrfs_delalloc_release_extents - release our outstanding_extents
6217 * @inode: the inode to balance the reservation for.
6218 * @num_bytes: the number of bytes we originally reserved
6219 * @qgroup_free: do we need to free qgroup meta reservation or convert them.
6220 *
6221 * When we reserve space we increase outstanding_extents for the extents we may
6222 * add.  Once we've set the range as delalloc or created our ordered extents we
6223 * have outstanding_extents to track the real usage, so we use this to free our
6224 * temporarily tracked outstanding_extents.  This _must_ be used in conjunction
6225 * with btrfs_delalloc_reserve_metadata.
6226 */
6227void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
6228                                    bool qgroup_free)
6229{
6230        struct btrfs_fs_info *fs_info = inode->root->fs_info;
6231        unsigned num_extents;
6232
6233        spin_lock(&inode->lock);
6234        num_extents = count_max_extents(num_bytes);
6235        btrfs_mod_outstanding_extents(inode, -num_extents);
6236        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
6237        spin_unlock(&inode->lock);
6238
6239        if (btrfs_is_testing(fs_info))
6240                return;
6241
6242        btrfs_inode_rsv_release(inode, qgroup_free);
6243}
6244
6245/**
6246 * btrfs_delalloc_reserve_space - reserve data and metadata space for
6247 * delalloc
6248 * @inode: inode we're writing to
6249 * @start: start of the range we are writing to
6250 * @len: length of the range we are writing to
6251 * @reserved: mandatory parameter, records the qgroup ranges actually reserved
6252 *            by the current reservation.
6253 *
6254 * This will do the following things
6255 *
6256 * o reserve space in the data space info for num bytes
6257 *   and reserve the corresponding qgroup space
6258 *   (Done in check_data_free_space)
6259 *
6260 * o reserve metadata space, based on the number of outstanding
6261 *   extents and how many csums will be needed
6262 *   also reserve metadata space in a per root over-reserve method.
6263 * o add to the inode's delalloc_bytes
6264 * o add it to the fs_info's delalloc inodes list.
6265 *   (Above 3 all done in delalloc_reserve_metadata)
6266 *
6267 * Return 0 for success
6268 * Return <0 for error (-ENOSPC or -EDQUOT)
6269 */
6270int btrfs_delalloc_reserve_space(struct inode *inode,
6271                        struct extent_changeset **reserved, u64 start, u64 len)
6272{
6273        int ret;
6274
6275        ret = btrfs_check_data_free_space(inode, reserved, start, len);
6276        if (ret < 0)
6277                return ret;
6278        ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
6279        if (ret < 0)
6280                btrfs_free_reserved_data_space(inode, *reserved, start, len);
6281        return ret;
6282}
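
/*
 * Sketch of how a buffered write path pairs these helpers (illustrative
 * only; names and error handling are simplified):
 *
 *	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, pos, count);
 *	if (ret)
 *		goto out;
 *	... copy data, set the range delalloc ...
 *	btrfs_delalloc_release_extents(BTRFS_I(inode), count, false);
 *	if (write_failed)
 *		btrfs_delalloc_release_space(inode, data_reserved, pos, count,
 *					     true);
 *	extent_changeset_free(data_reserved);
 */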
6283
6284/**
6285 * btrfs_delalloc_release_space - release data and metadata space for delalloc
6286 * @inode: inode we're releasing space for
6287 * @start: start position of the space already reserved
6288 * @len: the length of the space already reserved
6289 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
6290 *
6291 * This function will release the metadata space that was not used and will
6292 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
6293 * list if there are no delalloc bytes left.
6294 * Also it will handle the qgroup reserved space.
6295 */
6296void btrfs_delalloc_release_space(struct inode *inode,
6297                                  struct extent_changeset *reserved,
6298                                  u64 start, u64 len, bool qgroup_free)
6299{
6300        btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
6301        btrfs_free_reserved_data_space(inode, reserved, start, len);
6302}
6303
6304static int update_block_group(struct btrfs_trans_handle *trans,
6305                              struct btrfs_fs_info *info, u64 bytenr,
6306                              u64 num_bytes, int alloc)
6307{
6308        struct btrfs_block_group_cache *cache = NULL;
6309        u64 total = num_bytes;
6310        u64 old_val;
6311        u64 byte_in_group;
6312        int factor;
6313        int ret = 0;
6314
6315        /* block accounting for super block */
6316        spin_lock(&info->delalloc_root_lock);
6317        old_val = btrfs_super_bytes_used(info->super_copy);
6318        if (alloc)
6319                old_val += num_bytes;
6320        else
6321                old_val -= num_bytes;
6322        btrfs_set_super_bytes_used(info->super_copy, old_val);
6323        spin_unlock(&info->delalloc_root_lock);
6324
6325        while (total) {
6326                cache = btrfs_lookup_block_group(info, bytenr);
6327                if (!cache) {
6328                        ret = -ENOENT;
6329                        break;
6330                }
6331                factor = btrfs_bg_type_to_factor(cache->flags);
6332
6333                /*
6334                 * If this block group has free space cache written out, we
6335                 * need to make sure to load it if we are removing space.  This
6336                 * is because we need the unpinning stage to actually add the
6337                 * space back to the block group, otherwise we will leak space.
6338                 */
6339                if (!alloc && cache->cached == BTRFS_CACHE_NO)
6340                        cache_block_group(cache, 1);
6341
6342                byte_in_group = bytenr - cache->key.objectid;
6343                WARN_ON(byte_in_group > cache->key.offset);
6344
6345                spin_lock(&cache->space_info->lock);
6346                spin_lock(&cache->lock);
6347
6348                if (btrfs_test_opt(info, SPACE_CACHE) &&
6349                    cache->disk_cache_state < BTRFS_DC_CLEAR)
6350                        cache->disk_cache_state = BTRFS_DC_CLEAR;
6351
6352                old_val = btrfs_block_group_used(&cache->item);
6353                num_bytes = min(total, cache->key.offset - byte_in_group);
6354                if (alloc) {
6355                        old_val += num_bytes;
6356                        btrfs_set_block_group_used(&cache->item, old_val);
6357                        cache->reserved -= num_bytes;
6358                        cache->space_info->bytes_reserved -= num_bytes;
6359                        cache->space_info->bytes_used += num_bytes;
6360                        cache->space_info->disk_used += num_bytes * factor;
6361                        spin_unlock(&cache->lock);
6362                        spin_unlock(&cache->space_info->lock);
6363                } else {
6364                        old_val -= num_bytes;
6365                        btrfs_set_block_group_used(&cache->item, old_val);
6366                        cache->pinned += num_bytes;
6367                        update_bytes_pinned(cache->space_info, num_bytes);
6368                        cache->space_info->bytes_used -= num_bytes;
6369                        cache->space_info->disk_used -= num_bytes * factor;
6370                        spin_unlock(&cache->lock);
6371                        spin_unlock(&cache->space_info->lock);
6372
6373                        trace_btrfs_space_reservation(info, "pinned",
6374                                                      cache->space_info->flags,
6375                                                      num_bytes, 1);
6376                        percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
6377                                           num_bytes,
6378                                           BTRFS_TOTAL_BYTES_PINNED_BATCH);
6379                        set_extent_dirty(info->pinned_extents,
6380                                         bytenr, bytenr + num_bytes - 1,
6381                                         GFP_NOFS | __GFP_NOFAIL);
6382                }
6383
6384                spin_lock(&trans->transaction->dirty_bgs_lock);
6385                if (list_empty(&cache->dirty_list)) {
6386                        list_add_tail(&cache->dirty_list,
6387                                      &trans->transaction->dirty_bgs);
6388                        trans->transaction->num_dirty_bgs++;
6389                        trans->delayed_ref_updates++;
6390                        btrfs_get_block_group(cache);
6391                }
6392                spin_unlock(&trans->transaction->dirty_bgs_lock);
6393
6394                /*
6395                 * If this block group no longer has any used bytes, queue it
6396                 * for deletion. We do this after adding the block group to the
6397                 * dirty list to avoid races between cleaner kthread and space
6398                 * cache writeout.
6399                 */
6400                if (!alloc && old_val == 0)
6401                        btrfs_mark_bg_unused(cache);
6402
6403                btrfs_put_block_group(cache);
6404                total -= num_bytes;
6405                bytenr += num_bytes;
6406        }
6407
6408        /* Modified block groups are accounted for in the delayed_refs_rsv. */
6409        btrfs_update_delayed_refs_rsv(trans);
6410        return ret;
6411}
6412
6413static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
6414{
6415        struct btrfs_block_group_cache *cache;
6416        u64 bytenr;
6417
6418        spin_lock(&fs_info->block_group_cache_lock);
6419        bytenr = fs_info->first_logical_byte;
6420        spin_unlock(&fs_info->block_group_cache_lock);
6421
6422        if (bytenr < (u64)-1)
6423                return bytenr;
6424
6425        cache = btrfs_lookup_first_block_group(fs_info, search_start);
6426        if (!cache)
6427                return 0;
6428
6429        bytenr = cache->key.objectid;
6430        btrfs_put_block_group(cache);
6431
6432        return bytenr;
6433}
6434
6435static int pin_down_extent(struct btrfs_fs_info *fs_info,
6436                           struct btrfs_block_group_cache *cache,
6437                           u64 bytenr, u64 num_bytes, int reserved)
6438{
6439        spin_lock(&cache->space_info->lock);
6440        spin_lock(&cache->lock);
6441        cache->pinned += num_bytes;
6442        update_bytes_pinned(cache->space_info, num_bytes);
6443        if (reserved) {
6444                cache->reserved -= num_bytes;
6445                cache->space_info->bytes_reserved -= num_bytes;
6446        }
6447        spin_unlock(&cache->lock);
6448        spin_unlock(&cache->space_info->lock);
6449
6450        trace_btrfs_space_reservation(fs_info, "pinned",
6451                                      cache->space_info->flags, num_bytes, 1);
6452        percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
6453                    num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
6454        set_extent_dirty(fs_info->pinned_extents, bytenr,
6455                         bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
6456        return 0;
6457}
6458
6459/*
6460 * this function must be called within transaction
6461 */
6462int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
6463                     u64 bytenr, u64 num_bytes, int reserved)
6464{
6465        struct btrfs_block_group_cache *cache;
6466
6467        cache = btrfs_lookup_block_group(fs_info, bytenr);
6468        BUG_ON(!cache); /* Logic error */
6469
6470        pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved);
6471
6472        btrfs_put_block_group(cache);
6473        return 0;
6474}
6475
6476/*
6477 * this function must be called within transaction
6478 */
6479int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
6480                                    u64 bytenr, u64 num_bytes)
6481{
6482        struct btrfs_block_group_cache *cache;
6483        int ret;
6484
6485        cache = btrfs_lookup_block_group(fs_info, bytenr);
6486        if (!cache)
6487                return -EINVAL;
6488
6489        /*
6490         * pull in the free space cache (if any) so that our pin
6491         * removes the free space from the cache.  We have load_only set
6492         * to one because the slow code to read in the free extents does check
6493         * the pinned extents.
6494         */
6495        cache_block_group(cache, 1);
6496
6497        pin_down_extent(fs_info, cache, bytenr, num_bytes, 0);
6498
6499        /* remove us from the free space cache (if we're there at all) */
6500        ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
6501        btrfs_put_block_group(cache);
6502        return ret;
6503}
6504
6505static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
6506                                   u64 start, u64 num_bytes)
6507{
6508        int ret;
6509        struct btrfs_block_group_cache *block_group;
6510        struct btrfs_caching_control *caching_ctl;
6511
6512        block_group = btrfs_lookup_block_group(fs_info, start);
6513        if (!block_group)
6514                return -EINVAL;
6515
6516        cache_block_group(block_group, 0);
6517        caching_ctl = get_caching_control(block_group);
6518
6519        if (!caching_ctl) {
6520                /* Logic error */
6521                BUG_ON(!block_group_cache_done(block_group));
6522                ret = btrfs_remove_free_space(block_group, start, num_bytes);
6523        } else {
6524                mutex_lock(&caching_ctl->mutex);
6525
6526                if (start >= caching_ctl->progress) {
6527                        ret = add_excluded_extent(fs_info, start, num_bytes);
6528                } else if (start + num_bytes <= caching_ctl->progress) {
6529                        ret = btrfs_remove_free_space(block_group,
6530                                                      start, num_bytes);
6531                } else {
6532                        num_bytes = caching_ctl->progress - start;
6533                        ret = btrfs_remove_free_space(block_group,
6534                                                      start, num_bytes);
6535                        if (ret)
6536                                goto out_lock;
6537
6538                        num_bytes = (start + num_bytes) -
6539                                caching_ctl->progress;
6540                        start = caching_ctl->progress;
6541                        ret = add_excluded_extent(fs_info, start, num_bytes);
6542                }
6543out_lock:
6544                mutex_unlock(&caching_ctl->mutex);
6545                put_caching_control(caching_ctl);
6546        }
6547        btrfs_put_block_group(block_group);
6548        return ret;
6549}
6550
6551int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
6552                                 struct extent_buffer *eb)
6553{
6554        struct btrfs_file_extent_item *item;
6555        struct btrfs_key key;
6556        int found_type;
6557        int i;
6558        int ret = 0;
6559
6560        if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
6561                return 0;
6562
6563        for (i = 0; i < btrfs_header_nritems(eb); i++) {
6564                btrfs_item_key_to_cpu(eb, &key, i);
6565                if (key.type != BTRFS_EXTENT_DATA_KEY)
6566                        continue;
6567                item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
6568                found_type = btrfs_file_extent_type(eb, item);
6569                if (found_type == BTRFS_FILE_EXTENT_INLINE)
6570                        continue;
6571                if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
6572                        continue;
6573                key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
6574                key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
6575                ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
6576                if (ret)
6577                        break;
6578        }
6579
6580        return ret;
6581}
6582
6583static void
6584btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
6585{
6586        atomic_inc(&bg->reservations);
6587}
6588
6589void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6590                                        const u64 start)
6591{
6592        struct btrfs_block_group_cache *bg;
6593
6594        bg = btrfs_lookup_block_group(fs_info, start);
6595        ASSERT(bg);
6596        if (atomic_dec_and_test(&bg->reservations))
6597                wake_up_var(&bg->reservations);
6598        btrfs_put_block_group(bg);
6599}
6600
6601void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6602{
6603        struct btrfs_space_info *space_info = bg->space_info;
6604
6605        ASSERT(bg->ro);
6606
6607        if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6608                return;
6609
6610        /*
6611         * some task might have allocated an extent from it already, but it
6612         * some task might have had allocated an extent from it already, but it
6613         * has not yet created a respective ordered extent (and added it to a
6614         * root's list of ordered extents).
6615         * Therefore wait for any task currently allocating extents, since the
6616         * block group's reservations counter is incremented while a read lock
6617         * on the groups' semaphore is held and decremented after releasing
6618         * the read access on that semaphore and creating the ordered extent.
6619         */
6620        down_write(&space_info->groups_sem);
6621        up_write(&space_info->groups_sem);
6622
6623        wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
6624}
6625
6626/**
6627 * btrfs_add_reserved_bytes - update the block_group and space info counters
6628 * @cache:      The cache we are manipulating
6629 * @ram_bytes:  The number of bytes of file content; this will be the same as
6630 *              @num_bytes except on the compression path.
6631 * @num_bytes:  The number of bytes in question
6632 * @delalloc:   The blocks are allocated for the delalloc write
6633 *
6634 * This is called by the allocator when it reserves space. If this is a
6635 * reservation and the block group has become read only we cannot make the
6636 * reservation and return -EAGAIN, otherwise this function always succeeds.
6637 */
6638static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
6639                                    u64 ram_bytes, u64 num_bytes, int delalloc)
6640{
6641        struct btrfs_space_info *space_info = cache->space_info;
6642        int ret = 0;
6643
6644        spin_lock(&space_info->lock);
6645        spin_lock(&cache->lock);
6646        if (cache->ro) {
6647                ret = -EAGAIN;
6648        } else {
6649                cache->reserved += num_bytes;
6650                space_info->bytes_reserved += num_bytes;
6651                update_bytes_may_use(space_info, -ram_bytes);
6652                if (delalloc)
6653                        cache->delalloc_bytes += num_bytes;
6654        }
6655        spin_unlock(&cache->lock);
6656        spin_unlock(&space_info->lock);
6657        return ret;
6658}
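
/*
 * Worked example (illustrative): a compressed write may have ram_bytes = 128K
 * of file content that ends up in a num_bytes = 16K extent on disk.  In that
 * case 16K moves into cache->reserved and space_info->bytes_reserved, while
 * the full 128K originally accounted in bytes_may_use is dropped.
 */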
6659
6660/**
6661 * btrfs_free_reserved_bytes - update the block_group and space info counters
6662 * @cache:      The cache we are manipulating
6663 * @num_bytes:  The number of bytes in question
6664 * @delalloc:   The blocks are allocated for the delalloc write
6665 *
6666 * This is called by somebody who is freeing space that was never actually used
6667 * on disk.  For example if you reserve some space for a new leaf in transaction
6668 * A and before transaction A commits you free that leaf, you call this to
6669 * clear the reservation.
6670 */
6672static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
6673                                      u64 num_bytes, int delalloc)
6674{
6675        struct btrfs_space_info *space_info = cache->space_info;
6676
6677        spin_lock(&space_info->lock);
6678        spin_lock(&cache->lock);
6679        if (cache->ro)
6680                space_info->bytes_readonly += num_bytes;
6681        cache->reserved -= num_bytes;
6682        space_info->bytes_reserved -= num_bytes;
6683        space_info->max_extent_size = 0;
6684
6685        if (delalloc)
6686                cache->delalloc_bytes -= num_bytes;
6687        spin_unlock(&cache->lock);
6688        spin_unlock(&space_info->lock);
6689}

6690void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
6691{
6692        struct btrfs_caching_control *next;
6693        struct btrfs_caching_control *caching_ctl;
6694        struct btrfs_block_group_cache *cache;
6695
6696        down_write(&fs_info->commit_root_sem);
6697
6698        list_for_each_entry_safe(caching_ctl, next,
6699                                 &fs_info->caching_block_groups, list) {
6700                cache = caching_ctl->block_group;
6701                if (block_group_cache_done(cache)) {
6702                        cache->last_byte_to_unpin = (u64)-1;
6703                        list_del_init(&caching_ctl->list);
6704                        put_caching_control(caching_ctl);
6705                } else {
6706                        cache->last_byte_to_unpin = caching_ctl->progress;
6707                }
6708        }
6709
6710        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6711                fs_info->pinned_extents = &fs_info->freed_extents[1];
6712        else
6713                fs_info->pinned_extents = &fs_info->freed_extents[0];
6714
6715        up_write(&fs_info->commit_root_sem);
6716
6717        update_global_block_rsv(fs_info);
6718}
6719
6720/*
6721 * Returns the free cluster for the given space info and sets empty_cluster to
6722 * what it should be based on the mount options.
6723 */
6724static struct btrfs_free_cluster *
6725fetch_cluster_info(struct btrfs_fs_info *fs_info,
6726                   struct btrfs_space_info *space_info, u64 *empty_cluster)
6727{
6728        struct btrfs_free_cluster *ret = NULL;
6729
6730        *empty_cluster = 0;
6731        if (btrfs_mixed_space_info(space_info))
6732                return ret;
6733
6734        if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
6735                ret = &fs_info->meta_alloc_cluster;
6736                if (btrfs_test_opt(fs_info, SSD))
6737                        *empty_cluster = SZ_2M;
6738                else
6739                        *empty_cluster = SZ_64K;
6740        } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
6741                   btrfs_test_opt(fs_info, SSD_SPREAD)) {
6742                *empty_cluster = SZ_2M;
6743                ret = &fs_info->data_alloc_cluster;
6744        }
6745
6746        return ret;
6747}
6748
6749static int unpin_extent_range(struct btrfs_fs_info *fs_info,
6750                              u64 start, u64 end,
6751                              const bool return_free_space)
6752{
6753        struct btrfs_block_group_cache *cache = NULL;
6754        struct btrfs_space_info *space_info;
6755        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6756        struct btrfs_free_cluster *cluster = NULL;
6757        u64 len;
6758        u64 total_unpinned = 0;
6759        u64 empty_cluster = 0;
6760        bool readonly;
6761
6762        while (start <= end) {
6763                readonly = false;
6764                if (!cache ||
6765                    start >= cache->key.objectid + cache->key.offset) {
6766                        if (cache)
6767                                btrfs_put_block_group(cache);
6768                        total_unpinned = 0;
6769                        cache = btrfs_lookup_block_group(fs_info, start);
6770                        BUG_ON(!cache); /* Logic error */
6771
6772                        cluster = fetch_cluster_info(fs_info,
6773                                                     cache->space_info,
6774                                                     &empty_cluster);
6775                        empty_cluster <<= 1;
6776                }
6777
6778                len = cache->key.objectid + cache->key.offset - start;
6779                len = min(len, end + 1 - start);
6780
6781                if (start < cache->last_byte_to_unpin) {
6782                        len = min(len, cache->last_byte_to_unpin - start);
6783                        if (return_free_space)
6784                                btrfs_add_free_space(cache, start, len);
6785                }
6786
6787                start += len;
6788                total_unpinned += len;
6789                space_info = cache->space_info;
6790
6791                /*
6792                 * If this space cluster has been marked as fragmented and we've
6793                 * unpinned enough in this block group to potentially allow a
6794                 * cluster to be created inside of it, go ahead and clear the
6795                 * fragmented flag.
6796                 */
6797                if (cluster && cluster->fragmented &&
6798                    total_unpinned > empty_cluster) {
6799                        spin_lock(&cluster->lock);
6800                        cluster->fragmented = 0;
6801                        spin_unlock(&cluster->lock);
6802                }
6803
6804                spin_lock(&space_info->lock);
6805                spin_lock(&cache->lock);
6806                cache->pinned -= len;
6807                update_bytes_pinned(space_info, -len);
6808
6809                trace_btrfs_space_reservation(fs_info, "pinned",
6810                                              space_info->flags, len, 0);
6811                space_info->max_extent_size = 0;
6812                percpu_counter_add_batch(&space_info->total_bytes_pinned,
6813                            -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
6814                if (cache->ro) {
6815                        space_info->bytes_readonly += len;
6816                        readonly = true;
6817                }
6818                spin_unlock(&cache->lock);
6819                if (!readonly && return_free_space &&
6820                    global_rsv->space_info == space_info) {
6821                        u64 to_add = len;
6822
6823                        spin_lock(&global_rsv->lock);
6824                        if (!global_rsv->full) {
6825                                to_add = min(len, global_rsv->size -
6826                                             global_rsv->reserved);
6827                                global_rsv->reserved += to_add;
6828                                update_bytes_may_use(space_info, to_add);
6829                                if (global_rsv->reserved >= global_rsv->size)
6830                                        global_rsv->full = 1;
6831                                trace_btrfs_space_reservation(fs_info,
6832                                                              "space_info",
6833                                                              space_info->flags,
6834                                                              to_add, 1);
6835                                len -= to_add;
6836                        }
6837                        spin_unlock(&global_rsv->lock);
6838                        /* Add to any tickets we may have */
6839                        if (len)
6840                                space_info_add_new_bytes(fs_info, space_info,
6841                                                         len);
6842                }
6843                spin_unlock(&space_info->lock);
6844        }
6845
6846        if (cache)
6847                btrfs_put_block_group(cache);
6848        return 0;
6849}
6850
6851int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
6852{
6853        struct btrfs_fs_info *fs_info = trans->fs_info;
6854        struct btrfs_block_group_cache *block_group, *tmp;
6855        struct list_head *deleted_bgs;
6856        struct extent_io_tree *unpin;
6857        u64 start;
6858        u64 end;
6859        int ret;
6860
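            /*
             * Unpin from the freed_extents tree that is *not* currently used
             * as pinned_extents: it holds the extents that were pinned during
             * the transaction being finished here.
             */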
6861        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
6862                unpin = &fs_info->freed_extents[1];
6863        else
6864                unpin = &fs_info->freed_extents[0];
6865
6866        while (!trans->aborted) {
6867                struct extent_state *cached_state = NULL;
6868
6869                mutex_lock(&fs_info->unused_bg_unpin_mutex);
6870                ret = find_first_extent_bit(unpin, 0, &start, &end,
6871                                            EXTENT_DIRTY, &cached_state);
6872                if (ret) {
6873                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6874                        break;
6875                }
6876
6877                if (btrfs_test_opt(fs_info, DISCARD))
6878                        ret = btrfs_discard_extent(fs_info, start,
6879                                                   end + 1 - start, NULL);
6880
6881                clear_extent_dirty(unpin, start, end, &cached_state);
6882                unpin_extent_range(fs_info, start, end, true);
6883                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
6884                free_extent_state(cached_state);
6885                cond_resched();
6886        }
6887
6888        /*
6889         * Transaction is finished.  We don't need the lock anymore.  We
6890         * do need to clean up the block groups in case of a transaction
6891         * abort.
6892         */
6893        deleted_bgs = &trans->transaction->deleted_bgs;
6894        list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
6895                u64 trimmed = 0;
6896
6897                ret = -EROFS;
6898                if (!trans->aborted)
6899                        ret = btrfs_discard_extent(fs_info,
6900                                                   block_group->key.objectid,
6901                                                   block_group->key.offset,
6902                                                   &trimmed);
6903
6904                list_del_init(&block_group->bg_list);
6905                btrfs_put_block_group_trimming(block_group);
6906                btrfs_put_block_group(block_group);
6907
6908                if (ret) {
6909                        const char *errstr = btrfs_decode_error(ret);
6910                        btrfs_warn(fs_info,
6911                           "discard failed while removing blockgroup: errno=%d %s",
6912                                   ret, errstr);
6913                }
6914        }
6915
6916        return 0;
6917}
6918
6919static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6920                               struct btrfs_delayed_ref_node *node, u64 parent,
6921                               u64 root_objectid, u64 owner_objectid,
6922                               u64 owner_offset, int refs_to_drop,
6923                               struct btrfs_delayed_extent_op *extent_op)
6924{
6925        struct btrfs_fs_info *info = trans->fs_info;
6926        struct btrfs_key key;
6927        struct btrfs_path *path;
6928        struct btrfs_root *extent_root = info->extent_root;
6929        struct extent_buffer *leaf;
6930        struct btrfs_extent_item *ei;
6931        struct btrfs_extent_inline_ref *iref;
6932        int ret;
6933        int is_data;
6934        int extent_slot = 0;
6935        int found_extent = 0;
6936        int num_to_del = 1;
6937        u32 item_size;
6938        u64 refs;
6939        u64 bytenr = node->bytenr;
6940        u64 num_bytes = node->num_bytes;
6941        int last_ref = 0;
6942        bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
6943
6944        path = btrfs_alloc_path();
6945        if (!path)
6946                return -ENOMEM;
6947
6948        path->reada = READA_FORWARD;
6949        path->leave_spinning = 1;
6950
6951        is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
6952        BUG_ON(!is_data && refs_to_drop != 1);
6953
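            /* Skinny metadata extent items are only used for tree blocks. */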
6954        if (is_data)
6955                skinny_metadata = false;
6956
6957        ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
6958                                    parent, root_objectid, owner_objectid,
6959                                    owner_offset);
6960        if (ret == 0) {
6961                extent_slot = path->slots[0];
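                    /*
                     * lookup_extent_backref() left us at the backref item;
                     * the extent item itself should sit a few slots earlier
                     * in the same leaf, so walk backwards (at most 5 slots)
                     * looking for it.
                     */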
6962                while (extent_slot >= 0) {
6963                        btrfs_item_key_to_cpu(path->nodes[0], &key,
6964                                              extent_slot);
6965                        if (key.objectid != bytenr)
6966                                break;
6967                        if (key.type == BTRFS_EXTENT_ITEM_KEY &&
6968                            key.offset == num_bytes) {
6969                                found_extent = 1;
6970                                break;
6971                        }
6972                        if (key.type == BTRFS_METADATA_ITEM_KEY &&
6973                            key.offset == owner_objectid) {
6974                                found_extent = 1;
6975                                break;
6976                        }
6977                        if (path->slots[0] - extent_slot > 5)
6978                                break;
6979                        extent_slot--;
6980                }
6981
6982                if (!found_extent) {
6983                        BUG_ON(iref);
6984                        ret = remove_extent_backref(trans, path, NULL,
6985                                                    refs_to_drop,
6986                                                    is_data, &last_ref);
6987                        if (ret) {
6988                                btrfs_abort_transaction(trans, ret);
6989                                goto out;
6990                        }
6991                        btrfs_release_path(path);
6992                        path->leave_spinning = 1;
6993
6994                        key.objectid = bytenr;
6995                        key.type = BTRFS_EXTENT_ITEM_KEY;
6996                        key.offset = num_bytes;
6997
6998                        if (!is_data && skinny_metadata) {
6999                                key.type = BTRFS_METADATA_ITEM_KEY;
7000                                key.offset = owner_objectid;
7001                        }
7002
7003                        ret = btrfs_search_slot(trans, extent_root,
7004                                                &key, path, -1, 1);
7005                        if (ret > 0 && skinny_metadata && path->slots[0]) {
7006                                /*
7007                                 * Couldn't find our skinny metadata item,
7008                                 * see if we have ye olde extent item.
7009                                 */
7010                                path->slots[0]--;
7011                                btrfs_item_key_to_cpu(path->nodes[0], &key,
7012                                                      path->slots[0]);
7013                                if (key.objectid == bytenr &&
7014                                    key.type == BTRFS_EXTENT_ITEM_KEY &&
7015                                    key.offset == num_bytes)
7016                                        ret = 0;
7017                        }
7018
7019                        if (ret > 0 && skinny_metadata) {
7020                                skinny_metadata = false;
7021                                key.objectid = bytenr;
7022                                key.type = BTRFS_EXTENT_ITEM_KEY;
7023                                key.offset = num_bytes;
7024                                btrfs_release_path(path);
7025                                ret = btrfs_search_slot(trans, extent_root,
7026                                                        &key, path, -1, 1);
7027                        }
7028
7029                        if (ret) {
7030                                btrfs_err(info,
7031                                          "umm, got %d back from search, was looking for %llu",
7032                                          ret, bytenr);
7033                                if (ret > 0)
7034                                        btrfs_print_leaf(path->nodes[0]);
7035                        }
7036                        if (ret < 0) {
7037                                btrfs_abort_transaction(trans, ret);
7038                                goto out;
7039                        }
7040                        extent_slot = path->slots[0];
7041                }
7042        } else if (WARN_ON(ret == -ENOENT)) {
7043                btrfs_print_leaf(path->nodes[0]);
7044                btrfs_err(info,
7045                        "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
7046                        bytenr, parent, root_objectid, owner_objectid,
7047                        owner_offset);
7048                btrfs_abort_transaction(trans, ret);
7049                goto out;
7050        } else {
7051                btrfs_abort_transaction(trans, ret);
7052                goto out;
7053        }
7054
7055        leaf = path->nodes[0];
7056        item_size = btrfs_item_size_nr(leaf, extent_slot);
7057        if (unlikely(item_size < sizeof(*ei))) {
7058                ret = -EINVAL;
7059                btrfs_print_v0_err(info);
7060                btrfs_abort_transaction(trans, ret);
7061                goto out;
7062        }
7063        ei = btrfs_item_ptr(leaf, extent_slot,
7064                            struct btrfs_extent_item);
7065        if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
7066            key.type == BTRFS_EXTENT_ITEM_KEY) {
7067                struct btrfs_tree_block_info *bi;
7068                BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
7069                bi = (struct btrfs_tree_block_info *)(ei + 1);
7070                WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
7071        }
7072
7073        refs = btrfs_extent_refs(leaf, ei);
7074        if (refs < refs_to_drop) {
7075                btrfs_err(info,
7076                          "trying to drop %d refs but we only have %Lu for bytenr %Lu",
7077                          refs_to_drop, refs, bytenr);
7078                ret = -EINVAL;
7079                btrfs_abort_transaction(trans, ret);
7080                goto out;
7081        }
7082        refs -= refs_to_drop;
7083
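            /*
             * If other references remain, just update the reference count;
             * otherwise delete the extent item (and its backref item, if
             * any), drop data csums, and update the free space tree and the
             * block group accounting.
             */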
7084        if (refs > 0) {
7085                if (extent_op)
7086                        __run_delayed_extent_op(extent_op, leaf, ei);
7087                /*
7088                 * In the case of inline back ref, reference count will
7089                 * be updated by remove_extent_backref
7090                 */
7091                if (iref) {
7092                        BUG_ON(!found_extent);
7093                } else {
7094                        btrfs_set_extent_refs(leaf, ei, refs);
7095                        btrfs_mark_buffer_dirty(leaf);
7096                }
7097                if (found_extent) {
7098                        ret = remove_extent_backref(trans, path, iref,
7099                                                    refs_to_drop, is_data,
7100                                                    &last_ref);
7101                        if (ret) {
7102                                btrfs_abort_transaction(trans, ret);
7103                                goto out;
7104                        }
7105                }
7106        } else {
7107                if (found_extent) {
7108                        BUG_ON(is_data && refs_to_drop !=
7109                               extent_data_ref_count(path, iref));
7110                        if (iref) {
7111                                BUG_ON(path->slots[0] != extent_slot);
7112                        } else {
7113                                BUG_ON(path->slots[0] != extent_slot + 1);
7114                                path->slots[0] = extent_slot;
7115                                num_to_del = 2;
7116                        }
7117                }
7118
7119                last_ref = 1;
7120                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
7121                                      num_to_del);
7122                if (ret) {
7123                        btrfs_abort_transaction(trans, ret);
7124                        goto out;
7125                }
7126                btrfs_release_path(path);
7127
7128                if (is_data) {
7129                        ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
7130                        if (ret) {
7131                                btrfs_abort_transaction(trans, ret);
7132                                goto out;
7133                        }
7134                }
7135
7136                ret = add_to_free_space_tree(trans, bytenr, num_bytes);
7137                if (ret) {
7138                        btrfs_abort_transaction(trans, ret);
7139                        goto out;
7140                }
7141
7142                ret = update_block_group(trans, info, bytenr, num_bytes, 0);
7143                if (ret) {
7144                        btrfs_abort_transaction(trans, ret);
7145                        goto out;
7146                }
7147        }
7148        btrfs_release_path(path);
7149
7150out:
7151        btrfs_free_path(path);
7152        return ret;
7153}
7154
7155/*
7156 * when we free a block, it is possible (and likely) that we free the last
7157 * delayed ref for that extent as well.  This searches the delayed ref tree for
7158 * a given extent, and if there are no other delayed refs to be processed, it
7159 * removes it from the tree.
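     *
     * Returns 1 if the delayed ref head was removed and had
     * must_insert_reserved set, meaning no extent item was ever inserted for
     * this block and the caller deals with the space itself; returns 0
     * otherwise.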
7160 */
7161static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
7162                                      u64 bytenr)
7163{
7164        struct btrfs_delayed_ref_head *head;
7165        struct btrfs_delayed_ref_root *delayed_refs;
7166        int ret = 0;
7167
7168        delayed_refs = &trans->transaction->delayed_refs;
7169        spin_lock(&delayed_refs->lock);
7170        head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
7171        if (!head)
7172                goto out_delayed_unlock;
7173
7174        spin_lock(&head->lock);
7175        if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
7176                goto out;
7177
7178        if (cleanup_extent_op(head) != NULL)
7179                goto out;
7180
7181        /*
7182         * Waiting for the lock here would deadlock.  If someone else has it
7183         * locked, they are already in the process of dropping it anyway.
7184         */
7185        if (!mutex_trylock(&head->mutex))
7186                goto out;
7187
7188        btrfs_delete_ref_head(delayed_refs, head);
7189        head->processing = 0;
7190
7191        spin_unlock(&head->lock);
7192        spin_unlock(&delayed_refs->lock);
7193
7194        BUG_ON(head->extent_op);
7195        if (head->must_insert_reserved)
7196                ret = 1;
7197
7198        btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
7199        mutex_unlock(&head->mutex);
7200        btrfs_put_delayed_ref_head(head);
7201        return ret;
7202out:
7203        spin_unlock(&head->lock);
7204
7205out_delayed_unlock:
7206        spin_unlock(&delayed_refs->lock);
7207        return 0;
7208}
7209
7210void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
7211                           struct btrfs_root *root,
7212                           struct extent_buffer *buf,
7213                           u64 parent, int last_ref)
7214{
7215        struct btrfs_fs_info *fs_info = root->fs_info;
7216        int pin = 1;
7217        int ret;
7218
7219        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7220                int old_ref_mod, new_ref_mod;
7221
7222                btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
7223                                   root->root_key.objectid,
7224                                   btrfs_header_level(buf), 0,
7225                                   BTRFS_DROP_DELAYED_REF);
7226                ret = btrfs_add_delayed_tree_ref(trans, buf->start,
7227                                                 buf->len, parent,
7228                                                 root->root_key.objectid,
7229                                                 btrfs_header_level(buf),
7230                                                 BTRFS_DROP_DELAYED_REF, NULL,
7231                                                 &old_ref_mod, &new_ref_mod);
7232                BUG_ON(ret); /* -ENOMEM */
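                    /*
                     * Only pin the extent if this drop takes the delayed ref
                     * count from non-negative to negative, i.e. the delayed
                     * ref will actually end up freeing the extent.
                     */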
7233                pin = old_ref_mod >= 0 && new_ref_mod < 0;
7234        }
7235
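            /*
             * If this is the last reference and the buffer was allocated in
             * the current transaction, we may be able to hand its space back
             * right away (when it was never written out) instead of pinning
             * it until the transaction commits.
             */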
7236        if (last_ref && btrfs_header_generation(buf) == trans->transid) {
7237                struct btrfs_block_group_cache *cache;
7238
7239                if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
7240                        ret = check_ref_cleanup(trans, buf->start);
7241                        if (!ret)
7242                                goto out;
7243                }
7244
7245                pin = 0;
7246                cache = btrfs_lookup_block_group(fs_info, buf->start);
7247
7248                if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
7249                        pin_down_extent(fs_info, cache, buf->start,
7250                                        buf->len, 1);
7251                        btrfs_put_block_group(cache);
7252                        goto out;
7253                }
7254
7255                WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
7256
7257                btrfs_add_free_space(cache, buf->start, buf->len);
7258                btrfs_free_reserved_bytes(cache, buf->len, 0);
7259                btrfs_put_block_group(cache);
7260                trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
7261        }
7262out:
7263        if (pin)
7264                add_pinned_bytes(fs_info, buf->len, true,
7265                                 root->root_key.objectid);
7266
7267        if (last_ref) {
7268                /*
7269                 * Deleting the buffer, clear the corrupt flag since it doesn't
7270                 * matter anymore.
7271                 */
7272                clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
7273        }
7274}
7275
7276/* Can return -ENOMEM */
7277int btrfs_free_extent(struct btrfs_trans_handle *trans,
7278                      struct btrfs_root *root,
7279                      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
7280                      u64 owner, u64 offset)
7281{
7282        struct btrfs_fs_info *fs_info = root->fs_info;
7283        int old_ref_mod, new_ref_mod;
7284        int ret;
7285
7286        if (btrfs_is_testing(fs_info))
7287                return 0;
7288
7289        if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
7290                btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
7291                                   root_objectid, owner, offset,
7292                                   BTRFS_DROP_DELAYED_REF);
7293
7294        /*
7295         * tree log blocks never actually go into the extent allocation
7296         * tree, just update pinning info and exit early.
7297         */
7298        if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
7299                WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
7300                /* unlocks the pinned mutex */
7301                btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
7302                old_ref_mod = new_ref_mod = 0;
7303                ret = 0;
7304        } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
7305                ret = btrfs_add_delayed_tree_ref(trans, bytenr,
7306                                                 num_bytes, parent,
7307                                                 root_objectid, (int)owner,
7308                                                 BTRFS_DROP_DELAYED_REF, NULL,
7309                                                 &old_ref_mod, &new_ref_mod);
7310        } else {
7311                ret = btrfs_add_delayed_data_ref(trans, bytenr,
7312                                                 num_bytes, parent,
7313                                                 root_objectid, owner, offset,
7314                                                 0, BTRFS_DROP_DELAYED_REF,
7315                                                 &old_ref_mod, &new_ref_mod);
7316        }
7317
7318        if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) {
7319                bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
7320
7321                add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid);
7322        }
7323
7324        return ret;
7325}
7326
7327/*
7328 * when we wait for progress in the block group caching, it's because
7329 * our allocation attempt failed at least once.  So, we must sleep
7330 * and let some progress happen before we try again.
7331 *
7332 * This function will sleep at least once waiting for new free space to
7333 * show up, and then it will check the block group free space numbers
7334 * for our min num_bytes.  Another option is to have it go ahead
7335 * and look in the rbtree for a free extent of a given size, but this
7336 * is a good start.
7337 *
7338 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
7339 * any of the information in this block group.
7340 */
7341static noinline void
7342wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
7343                                u64 num_bytes)
7344{
7345        struct btrfs_caching_control *caching_ctl;
7346
7347        caching_ctl = get_caching_control(cache);
7348        if (!caching_ctl)
7349                return;
7350
7351        wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
7352                   (cache->free_space_ctl->free_space >= num_bytes));
7353
7354        put_caching_control(caching_ctl);
7355}
7356
7357static noinline int
7358wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
7359{
7360        struct btrfs_caching_control *caching_ctl;
7361        int ret = 0;
7362
7363        caching_ctl = get_caching_control(cache);
7364        if (!caching_ctl)
7365                return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
7366
7367        wait_event(caching_ctl->wait, block_group_cache_done(cache));
7368        if (cache->cached == BTRFS_CACHE_ERROR)
7369                ret = -EIO;
7370        put_caching_control(caching_ctl);
7371        return ret;
7372}
7373
7374enum btrfs_loop_type {
7375        LOOP_CACHING_NOWAIT = 0,
7376        LOOP_CACHING_WAIT = 1,
7377        LOOP_ALLOC_CHUNK = 2,
7378        LOOP_NO_EMPTY_SIZE = 3,
7379};
7380
7381static inline void
7382btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
7383                       int delalloc)
7384{
7385        if (delalloc)
7386                down_read(&cache->data_rwsem);
7387}
7388
7389static inline void
7390btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
7391                       int delalloc)
7392{
7393        btrfs_get_block_group(cache);
7394        if (delalloc)
7395                down_read(&cache->data_rwsem);
7396}
7397
7398static struct btrfs_block_group_cache *
7399btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
7400                   struct btrfs_free_cluster *cluster,
7401                   int delalloc)
7402{
7403        struct btrfs_block_group_cache *used_bg = NULL;
7404
7405        spin_lock(&cluster->refill_lock);
7406        while (1) {
7407                used_bg = cluster->block_group;
7408                if (!used_bg)
7409                        return NULL;
7410
7411                if (used_bg == block_group)
7412                        return used_bg;
7413
7414                btrfs_get_block_group(used_bg);
7415
7416                if (!delalloc)
7417                        return used_bg;
7418
7419                if (down_read_trylock(&used_bg->data_rwsem))
7420                        return used_bg;
7421
7422                spin_unlock(&cluster->refill_lock);
7423
7424                /* We should only have one-level nested. */
7425                down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
7426
7427                spin_lock(&cluster->refill_lock);
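                    /*
                     * The cluster may have been handed to another block
                     * group while we dropped refill_lock; if so, drop our
                     * reference and retry with the new owner.
                     */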
7428                if (used_bg == cluster->block_group)
7429                        return used_bg;
7430
7431                up_read(&used_bg->data_rwsem);
7432                btrfs_put_block_group(used_bg);
7433        }
7434}
7435
7436static inline void
7437btrfs_release_block_group(struct btrfs_block_group_cache *cache,
7438                         int delalloc)
7439{
7440        if (delalloc)
7441                up_read(&cache->data_rwsem);
7442        btrfs_put_block_group(cache);
7443}
7444
7445/*
7446 * Structure used internally for find_free_extent() function.  Wraps needed
7447 * parameters.
7448 */
7449struct find_free_extent_ctl {
7450        /* Basic allocation info */
7451        u64 ram_bytes;
7452        u64 num_bytes;
7453        u64 empty_size;
7454        u64 flags;
7455        int delalloc;
7456
7457        /* Where to start the search inside the bg */
7458        u64 search_start;
7459
7460        /* For clustered allocation */
7461        u64 empty_cluster;
7462
7463        bool have_caching_bg;
7464        bool orig_have_caching_bg;
7465
7466        /* RAID index, converted from flags */
7467        int index;
7468
7469        /*
7470         * Current loop number, check find_free_extent_update_loop() for details
7471         */
7472        int loop;
7473
7474        /*
7475         * Whether we're refilling a cluster; if true we need to re-search the
7476         * current block group but don't try to refill the cluster again.
7477         */
7478        bool retry_clustered;
7479
7480        /*
7481         * Whether we're updating the free space cache; if true we need to re-search
7482         * the current block group but don't try updating the free space cache again.
7483         */
7484        bool retry_unclustered;
7485
7486        /* If current block group is cached */
7487        int cached;
7488
7489        /* Max contiguous hole found */
7490        u64 max_extent_size;
7491
7492        /* Total free space from free space cache, not always contiguous */
7493        u64 total_free_space;
7494
7495        /* Found result */
7496        u64 found_offset;
7497};
7498
7499
7500/*
7501 * Helper function for find_free_extent().
7502 *
7503 * Return -ENOENT to inform the caller that it needs to fall back to unclustered mode.
7504 * Return -EAGAIN to inform the caller that it needs to re-search this block group.
7505 * Return >0 to inform the caller that we found nothing.
7506 * Return 0 when we have found a location and set ffe_ctl->found_offset.
7507 */
7508static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
7509                struct btrfs_free_cluster *last_ptr,
7510                struct find_free_extent_ctl *ffe_ctl,
7511                struct btrfs_block_group_cache **cluster_bg_ret)
7512{
7513        struct btrfs_fs_info *fs_info = bg->fs_info;
7514        struct btrfs_block_group_cache *cluster_bg;
7515        u64 aligned_cluster;
7516        u64 offset;
7517        int ret;
7518
7519        cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
7520        if (!cluster_bg)
7521                goto refill_cluster;
7522        if (cluster_bg != bg && (cluster_bg->ro ||
7523            !block_group_bits(cluster_bg, ffe_ctl->flags)))
7524                goto release_cluster;
7525
7526        offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
7527                        ffe_ctl->num_bytes, cluster_bg->key.objectid,
7528                        &ffe_ctl->max_extent_size);
7529        if (offset) {
7530                /* We have a block, we're done */
7531                spin_unlock(&last_ptr->refill_lock);
7532                trace_btrfs_reserve_extent_cluster(cluster_bg,
7533                                ffe_ctl->search_start, ffe_ctl->num_bytes);
7534                *cluster_bg_ret = cluster_bg;
7535                ffe_ctl->found_offset = offset;
7536                return 0;
7537        }
7538        WARN_ON(last_ptr->block_group != cluster_bg);
7539
7540release_cluster:
7541        /*
7542         * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
7543         * let's just skip it and let the allocator find whatever block it can
7544         * find. If we reach this point, we will have tried the cluster
7545         * allocator plenty of times and not have found anything, so we are
7546         * likely way too fragmented for the clustering stuff to find anything.
7547         *
7548         * However, if the cluster is taken from the current block group,
7549         * release the cluster first, so that we stand a better chance of
7550         * succeeding in the unclustered allocation.
7551         */
7552        if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
7553                spin_unlock(&last_ptr->refill_lock);
7554                btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
7555                return -ENOENT;
7556        }
7557
7558        /* This cluster didn't work out, free it and start over */
7559        btrfs_return_cluster_to_free_space(NULL, last_ptr);
7560
7561        if (cluster_bg != bg)
7562                btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
7563
7564refill_cluster:
7565        if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
7566                spin_unlock(&last_ptr->refill_lock);
7567                return -ENOENT;
7568        }
7569
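            /*
             * Size the cluster request to the configured empty_cluster plus
             * any extra slack, but never smaller than a full stripe of this
             * block group.
             */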
7570        aligned_cluster = max_t(u64,
7571                        ffe_ctl->empty_cluster + ffe_ctl->empty_size,
7572                        bg->full_stripe_len);
7573        ret = btrfs_find_space_cluster(fs_info, bg, last_ptr,
7574                        ffe_ctl->search_start, ffe_ctl->num_bytes,
7575                        aligned_cluster);
7576        if (ret == 0) {
7577                /* Now pull our allocation out of this cluster */
7578                offset = btrfs_alloc_from_cluster(bg, last_ptr,
7579                                ffe_ctl->num_bytes, ffe_ctl->search_start,
7580                                &ffe_ctl->max_extent_size);
7581                if (offset) {
7582                        /* We found one, proceed */
7583                        spin_unlock(&last_ptr->refill_lock);
7584                        trace_btrfs_reserve_extent_cluster(bg,
7585                                        ffe_ctl->search_start,
7586                                        ffe_ctl->num_bytes);
7587                        ffe_ctl->found_offset = offset;
7588                        return 0;
7589                }
7590        } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
7591                   !ffe_ctl->retry_clustered) {
7592                spin_unlock(&last_ptr->refill_lock);
7593
7594                ffe_ctl->retry_clustered = true;
7595                wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
7596                                ffe_ctl->empty_cluster + ffe_ctl->empty_size);
7597                return -EAGAIN;
7598        }
7599        /*
7600         * At this point we either didn't find a cluster or we weren't able to
7601         * allocate a block from our cluster.  Free the cluster we've been
7602         * trying to use, and go to the next block group.
7603         */
7604        btrfs_return_cluster_to_free_space(NULL, last_ptr);
7605        spin_unlock(&last_ptr->refill_lock);
7606        return 1;
7607}
7608
7609/*
7610 * Return >0 to inform the caller that we found nothing.
7611 * Return 0 when we found a free extent and set ffe_ctl->found_offset.
7612 * Return -EAGAIN to inform the caller that it needs to re-search this block group.
7613 */
7614static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
7615                struct btrfs_free_cluster *last_ptr,
7616                struct find_free_extent_ctl *ffe_ctl)
7617{
7618        u64 offset;
7619
7620        /*
7621         * We are doing an unclustered allocation; set the fragmented flag so
7622         * we don't bother trying to set up a cluster again until we get more
7623         * space.
7624         */
7625        if (unlikely(last_ptr)) {
7626                spin_lock(&last_ptr->lock);
7627                last_ptr->fragmented = 1;
7628                spin_unlock(&last_ptr->lock);
7629        }
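            /*
             * If free space caching for this block group is complete and the
             * total free space is already too small for this allocation,
             * skip the search and just record the total so it can be used as
             * the fallback max_extent_size.
             */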
7630        if (ffe_ctl->cached) {
7631                struct btrfs_free_space_ctl *free_space_ctl;
7632
7633                free_space_ctl = bg->free_space_ctl;
7634                spin_lock(&free_space_ctl->tree_lock);
7635                if (free_space_ctl->free_space <
7636                    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
7637                    ffe_ctl->empty_size) {
7638                        ffe_ctl->total_free_space = max_t(u64,
7639                                        ffe_ctl->total_free_space,
7640                                        free_space_ctl->free_space);
7641                        spin_unlock(&free_space_ctl->tree_lock);
7642                        return 1;
7643                }
7644                spin_unlock(&free_space_ctl->tree_lock);
7645        }
7646
7647        offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
7648                        ffe_ctl->num_bytes, ffe_ctl->empty_size,
7649                        &ffe_ctl->max_extent_size);
7650
7651        /*
7652         * If we didn't find a chunk, and we haven't failed on this block group
7653         * before, and this block group is in the middle of caching and we are
7654         * ok with waiting, then go ahead and wait for progress to be made, and
7655         * set @retry_unclustered to true.
7656         *
7657         * If @retry_unclustered is true then we've already waited on this
7658         * block group once and should move on to the next block group.
7659         */
7660        if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
7661            ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
7662                wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
7663                                                ffe_ctl->empty_size);
7664                ffe_ctl->retry_unclustered = true;
7665                return -EAGAIN;
7666        } else if (!offset) {
7667                return 1;
7668        }
7669        ffe_ctl->found_offset = offset;
7670        return 0;
7671}
7672
7673/*
7674 * Return >0 means the caller needs to re-search for a free extent.
7675 * Return 0 means we have the needed free extent.
7676 * Return <0 means we failed to locate any free extent.
7677 */
7678static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
7679                                        struct btrfs_free_cluster *last_ptr,
7680                                        struct btrfs_key *ins,
7681                                        struct find_free_extent_ctl *ffe_ctl,
7682                                        int full_search, bool use_cluster)
7683{
7684        struct btrfs_root *root = fs_info->extent_root;
7685        int ret;
7686
7687        if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
7688            ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
7689                ffe_ctl->orig_have_caching_bg = true;
7690
7691        if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
7692            ffe_ctl->have_caching_bg)
7693                return 1;
7694
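            /* Try the next raid index before escalating the loop level. */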
7695        if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
7696                return 1;
7697
7698        if (ins->objectid) {
7699                if (!use_cluster && last_ptr) {
7700                        spin_lock(&last_ptr->lock);
7701                        last_ptr->window_start = ins->objectid;
7702                        spin_unlock(&last_ptr->lock);
7703                }
7704                return 0;
7705        }
7706
7707        /*
7708         * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
7709         *                      caching kthreads as we move along
7710         * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
7711         * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
7712         * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
7713         *                     again
7714         */
7715        if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
7716                ffe_ctl->index = 0;
7717                if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
7718                        /*
7719                         * We want to skip the LOOP_CACHING_WAIT step if we
7720                         * don't have any uncached bgs and we've already done a
7721                         * full search through all of them.
7722                         */
7723                        if (ffe_ctl->orig_have_caching_bg || !full_search)
7724                                ffe_ctl->loop = LOOP_CACHING_WAIT;
7725                        else
7726                                ffe_ctl->loop = LOOP_ALLOC_CHUNK;
7727                } else {
7728                        ffe_ctl->loop++;
7729                }
7730
7731                if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
7732                        struct btrfs_trans_handle *trans;
7733                        int exist = 0;
7734
7735                        trans = current->journal_info;
7736                        if (trans)
7737                                exist = 1;
7738                        else
7739                                trans = btrfs_join_transaction(root);
7740
7741                        if (IS_ERR(trans)) {
7742                                ret = PTR_ERR(trans);
7743                                return ret;
7744                        }
7745
7746                        ret = do_chunk_alloc(trans, ffe_ctl->flags,
7747                                             CHUNK_ALLOC_FORCE);
7748
7749                        /*
7750                         * We've already looped through at least once by now,
7751                         * so if we can't allocate a new chunk move on to the
7752                         * NO_EMPTY_SIZE case.
7753                         */
7754                        if (ret == -ENOSPC)
7755                                ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
7756
7757                        /* Do not bail out on ENOSPC since we can do more. */
7758                        if (ret < 0 && ret != -ENOSPC)
7759                                btrfs_abort_transaction(trans, ret);
7760                        else
7761                                ret = 0;
7762                        if (!exist)
7763                                btrfs_end_transaction(trans);
7764                        if (ret)
7765                                return ret;
7766                }
7767
7768                if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
7769                        /*
7770                         * Don't loop again if we already have no empty_size and
7771                         * no empty_cluster.
7772                         */
7773                        if (ffe_ctl->empty_size == 0 &&
7774                            ffe_ctl->empty_cluster == 0)
7775                                return -ENOSPC;
7776                        ffe_ctl->empty_size = 0;
7777                        ffe_ctl->empty_cluster = 0;
7778                }
7779                return 1;
7780        }
7781        return -ENOSPC;
7782}
7783
7784/*
7785 * walks the btree of allocated extents and finds a hole of a given size.
7786 * The key ins is changed to record the hole:
7787 * ins->objectid == start position
7788 * ins->flags = BTRFS_EXTENT_ITEM_KEY
7789 * ins->offset == the size of the hole.
7790 * Any available blocks before search_start are skipped.
7791 *
7792 * If there is no suitable free space, we record the size of the largest
7793 * free space extent currently available instead.
7794 *
7795 * The overall logic and call chain:
7796 *
7797 * find_free_extent()
7798 * |- Iterate through all block groups
7799 * |  |- Get a valid block group
7800 * |  |- Try to do clustered allocation in that block group
7801 * |  |- Try to do unclustered allocation in that block group
7802 * |  |- Check if the result is valid
7803 * |  |  |- If valid, then exit
7804 * |  |- Jump to next block group
7805 * |
7806 * |- Push harder to find free extents
7807 *    |- If not found, re-iterate all block groups
7808 */
7809static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
7810                                u64 ram_bytes, u64 num_bytes, u64 empty_size,
7811                                u64 hint_byte, struct btrfs_key *ins,
7812                                u64 flags, int delalloc)
7813{
7814        int ret = 0;
7815        struct btrfs_free_cluster *last_ptr = NULL;
7816        struct btrfs_block_group_cache *block_group = NULL;
7817        struct find_free_extent_ctl ffe_ctl = {0};
7818        struct btrfs_space_info *space_info;
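                    /*
                     * Returned free space first refills the global block
                     * reserve; whatever is left over is handed to any
                     * waiting reservation tickets below.
                     */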
7819        bool use_cluster = true;
7820        bool full_search = false;
7821
7822        WARN_ON(num_bytes < fs_info->sectorsize);
7823
7824        ffe_ctl.ram_bytes = ram_bytes;
7825        ffe_ctl.num_bytes = num_bytes;
7826        ffe_ctl.empty_size = empty_size;
7827        ffe_ctl.flags = flags;
7828        ffe_ctl.search_start = 0;
7829        ffe_ctl.retry_clustered = false;
7830        ffe_ctl.retry_unclustered = false;
7831        ffe_ctl.delalloc = delalloc;
7832        ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
7833        ffe_ctl.have_caching_bg = false;
7834        ffe_ctl.orig_have_caching_bg = false;
7835        ffe_ctl.found_offset = 0;
7836
7837        ins->type = BTRFS_EXTENT_ITEM_KEY;
7838        ins->objectid = 0;
7839        ins->offset = 0;
7840
7841        trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
7842
7843        space_info = __find_space_info(fs_info, flags);
7844        if (!space_info) {
7845                btrfs_err(fs_info, "No space info for %llu", flags);
7846                return -ENOSPC;
7847        }
7848
7849        /*
7850         * If our free space is heavily fragmented we may not be able to make
7851         * big contiguous allocations, so instead of doing the expensive search
7852         * for free space, simply return ENOSPC with our max_extent_size so we
7853         * can go ahead and search for a more manageable chunk.
7854         *
7855         * If our max_extent_size is large enough for our allocation simply
7856         * disable clustering since we will likely not be able to find enough
7857         * space to create a cluster and induce latency trying.
7858         */
7859        if (unlikely(space_info->max_extent_size)) {
7860                spin_lock(&space_info->lock);
7861                if (space_info->max_extent_size &&
7862                    num_bytes > space_info->max_extent_size) {
7863                        ins->offset = space_info->max_extent_size;
7864                        spin_unlock(&space_info->lock);
7865                        return -ENOSPC;
7866                } else if (space_info->max_extent_size) {
7867                        use_cluster = false;
7868                }
7869                spin_unlock(&space_info->lock);
7870        }
7871
7872        last_ptr = fetch_cluster_info(fs_info, space_info,
7873                                      &ffe_ctl.empty_cluster);
7874        if (last_ptr) {
7875                spin_lock(&last_ptr->lock);
7876                if (last_ptr->block_group)
7877                        hint_byte = last_ptr->window_start;
7878                if (last_ptr->fragmented) {
7879                        /*
7880                         * We still set window_start so we can keep track of the
7881                         * last place we found an allocation to try and save
7882                         * some time.
7883                         */
7884                        hint_byte = last_ptr->window_start;
7885                        use_cluster = false;
7886                }
7887                spin_unlock(&last_ptr->lock);
7888        }
7889
7890        ffe_ctl.search_start = max(ffe_ctl.search_start,
7891                                   first_logical_byte(fs_info, 0));
7892        ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
7893        if (ffe_ctl.search_start == hint_byte) {
7894                block_group = btrfs_lookup_block_group(fs_info,
7895                                                       ffe_ctl.search_start);
7896                /*
7897                 * we don't want to use the block group if it doesn't match our
7898                 * allocation bits, or if its not cached.
7899                 * allocation bits, or if it's not cached.
7900                 * However if we are re-searching with an ideal block group
7901                 * picked out then we don't care that the block group is cached.
7902                 */
7903                if (block_group && block_group_bits(block_group, flags) &&
7904                    block_group->cached != BTRFS_CACHE_NO) {
7905                        down_read(&space_info->groups_sem);
7906                        if (list_empty(&block_group->list) ||
7907                            block_group->ro) {
7908                                /*
7909                                 * someone is removing this block group,
7910                                 * we can't jump into the have_block_group
7911                                 * target because our list pointers are not
7912                                 * valid
7913                                 */
7914                                btrfs_put_block_group(block_group);
7915                                up_read(&space_info->groups_sem);
7916                        } else {
7917                                ffe_ctl.index = btrfs_bg_flags_to_raid_index(
7918                                                block_group->flags);
7919                                btrfs_lock_block_group(block_group, delalloc);
7920                                goto have_block_group;
7921                        }
7922                } else if (block_group) {
7923                        btrfs_put_block_group(block_group);
7924                }
7925        }
7926search:
7927        ffe_ctl.have_caching_bg = false;
7928        if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
7929            ffe_ctl.index == 0)
7930                full_search = true;
7931        down_read(&space_info->groups_sem);
7932        list_for_each_entry(block_group,
7933                            &space_info->block_groups[ffe_ctl.index], list) {
7934                /* If the block group is read-only, we can skip it entirely. */
7935                if (unlikely(block_group->ro))
7936                        continue;
7937
7938                btrfs_grab_block_group(block_group, delalloc);
7939                ffe_ctl.search_start = block_group->key.objectid;
7940
7941                /*
7942                 * this can happen if we end up cycling through all the
7943                 * raid types, but we want to make sure we only allocate
7944                 * for the proper type.
7945                 */
7946                if (!block_group_bits(block_group, flags)) {
7947                        u64 extra = BTRFS_BLOCK_GROUP_DUP |
7948                                BTRFS_BLOCK_GROUP_RAID1 |
7949                                BTRFS_BLOCK_GROUP_RAID5 |
7950                                BTRFS_BLOCK_GROUP_RAID6 |
7951                                BTRFS_BLOCK_GROUP_RAID10;
7952
7953                        /*
7954                         * if they asked for extra copies and this block group
7955                         * doesn't provide them, bail.  This does allow us to
7956                         * fill raid0 from raid1.
7957                         */
7958                        if ((flags & extra) && !(block_group->flags & extra))
7959                                goto loop;
7960                }
7961
7962have_block_group:
7963                ffe_ctl.cached = block_group_cache_done(block_group);
7964                if (unlikely(!ffe_ctl.cached)) {
7965                        ffe_ctl.have_caching_bg = true;
7966                        ret = cache_block_group(block_group, 0);
7967                        BUG_ON(ret < 0);
7968                        ret = 0;
7969                }
7970
7971                if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
7972                        goto loop;
7973
7974                /*
7975                 * Ok, we want to try and use the cluster allocator, so
7976                 * let's look there.
7977                 */
7978                if (last_ptr && use_cluster) {
7979                        struct btrfs_block_group_cache *cluster_bg = NULL;
7980
7981                        ret = find_free_extent_clustered(block_group, last_ptr,
7982                                                         &ffe_ctl, &cluster_bg);
7983
7984                        if (ret == 0) {
7985                                if (cluster_bg && cluster_bg != block_group) {
7986                                        btrfs_release_block_group(block_group,
7987                                                                  delalloc);
7988                                        block_group = cluster_bg;
7989                                }
7990                                goto checks;
7991                        } else if (ret == -EAGAIN) {
7992                                goto have_block_group;
7993                        } else if (ret > 0) {
7994                                goto loop;
7995                        }
7996                        /* ret == -ENOENT case falls through */
7997                }
7998
7999                ret = find_free_extent_unclustered(block_group, last_ptr,
8000                                                   &ffe_ctl);
8001                if (ret == -EAGAIN)
8002                        goto have_block_group;
8003                else if (ret > 0)
8004                        goto loop;
8005                /* ret == 0 case falls through */
8006checks:
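                    /*
                     * We have a candidate offset: align it to the stripe
                     * size and make sure the aligned extent still fits
                     * inside this block group before reserving it.
                     */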
8007                ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
8008                                             fs_info->stripesize);
8009
8010                /* move on to the next group */
8011                if (ffe_ctl.search_start + num_bytes >
8012                    block_group->key.objectid + block_group->key.offset) {
8013                        btrfs_add_free_space(block_group, ffe_ctl.found_offset,
8014                                             num_bytes);
8015                        goto loop;
8016                }
8017
8018                if (ffe_ctl.found_offset < ffe_ctl.search_start)
8019                        btrfs_add_free_space(block_group, ffe_ctl.found_offset,
8020                                ffe_ctl.search_start - ffe_ctl.found_offset);
8021
8022                ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
8023                                num_bytes, delalloc);
8024                if (ret == -EAGAIN) {
8025                        btrfs_add_free_space(block_group, ffe_ctl.found_offset,
8026                                             num_bytes);
8027                        goto loop;
8028                }
8029                btrfs_inc_block_group_reservations(block_group);
8030
8031                /* we are all good, let's return */
8032                ins->objectid = ffe_ctl.search_start;
8033                ins->offset = num_bytes;
8034
8035                trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
8036                                           num_bytes);
8037                btrfs_release_block_group(block_group, delalloc);
8038                break;
8039loop:
8040                ffe_ctl.retry_clustered = false;
8041                ffe_ctl.retry_unclustered = false;
8042                BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
8043                       ffe_ctl.index);
8044                btrfs_release_block_group(block_group, delalloc);
8045                cond_resched();
8046        }
8047        up_read(&space_info->groups_sem);
8048
8049        ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
8050                                           full_search, use_cluster);
8051        if (ret > 0)
8052                goto search;
8053
8054        if (ret == -ENOSPC) {
8055                /*
8056                 * Use ffe_ctl->total_free_space as a fallback if we can't find
8057                 * any contiguous hole.
8058                 */
8059                if (!ffe_ctl.max_extent_size)
8060                        ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
8061                spin_lock(&space_info->lock);
8062                space_info->max_extent_size = ffe_ctl.max_extent_size;
8063                spin_unlock(&space_info->lock);
8064                ins->offset = ffe_ctl.max_extent_size;
8065        }
8066        return ret;
8067}
8068
8069static void dump_space_info(struct btrfs_fs_info *fs_info,
8070                            struct btrfs_space_info *info, u64 bytes,
8071                            int dump_block_groups)
8072{
8073        struct btrfs_block_group_cache *cache;
8074        int index = 0;
8075
8076        spin_lock(&info->lock);
8077        btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
8078                   info->flags,
8079                   info->total_bytes - btrfs_space_info_used(info, true),
8080                   info->full ? "" : "not ");
8081        btrfs_info(fs_info,
8082                "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
8083                info->total_bytes, info->bytes_used, info->bytes_pinned,
8084                info->bytes_reserved, info->bytes_may_use,
8085                info->bytes_readonly);
8086        spin_unlock(&info->lock);
8087
8088        if (!dump_block_groups)
8089                return;
8090
8091        down_read(&info->groups_sem);
8092again:
8093        list_for_each_entry(cache, &info->block_groups[index], list) {
8094                spin_lock(&cache->lock);
8095                btrfs_info(fs_info,
8096                        "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
8097                        cache->key.objectid, cache->key.offset,
8098                        btrfs_block_group_used(&cache->item), cache->pinned,
8099                        cache->reserved, cache->ro ? "[readonly]" : "");
8100                btrfs_dump_free_space(cache, bytes);
8101                spin_unlock(&cache->lock);
8102        }
8103        if (++index < BTRFS_NR_RAID_TYPES)
8104                goto again;
8105        up_read(&info->groups_sem);
8106}
8107
8108/*
8109 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
8110 *                        hole that is at least as big as @num_bytes.
8111 *
8112 * @root           -    The root that will contain this extent
8113 *
8114 * @ram_bytes      -    The amount of space in ram that @num_bytes take. This
8115 *                      is used for accounting purposes. This value differs
8116 *                      from @num_bytes only in the case of compressed extents.
8117 *
8118 * @num_bytes      -    Number of bytes to allocate on-disk.
8119 *
8120 * @min_alloc_size -    Indicates the minimum amount of space that the
8121 *                      allocator should try to satisfy. In some cases
8122 *                      @num_bytes may be larger than what is required, and
8123 *                      if the filesystem is fragmented the larger allocation
8124 *                      fails. @min_alloc_size then gives the allocator a
8125 *                      chance to satisfy the smaller allocation instead.
8126 *
8127 * @empty_size     -    A hint that you plan on doing more COW. This is the
8128 *                      amount of free space, in bytes, the allocator should
8129 *                      try to find next to the block it returns.  This is
8130 *                      just a hint and may be ignored by the allocator.
8131 *
8132 * @hint_byte      -    Hint to the allocator to start searching above the byte
8133 *                      address passed. It might be ignored.
8134 *
8135 * @ins            -    This key is modified to record the found hole. It will
8136 *                      have the following values:
8137 *                      ins->objectid == start position
8138 *                      ins->flags == BTRFS_EXTENT_ITEM_KEY
8139 *                      ins->offset == the size of the hole.
8140 *
8141 * @is_data        -    Boolean flag indicating whether an extent is
8142 *                      allocated for data (true) or metadata (false)
8143 *
8144 * @delalloc       -    Boolean flag indicating whether this allocation is for
8145 *                      delalloc or not. If 'true', the data_rwsem of the
8146 *                      block groups is going to be acquired.
8147 *
8148 *
8149 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
8150 * case -ENOSPC is returned then @ins->offset will contain the size of the
8151 * largest available hole the allocator managed to find.
8152 */
8153int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
8154                         u64 num_bytes, u64 min_alloc_size,
8155                         u64 empty_size, u64 hint_byte,
8156                         struct btrfs_key *ins, int is_data, int delalloc)
8157{
8158        struct btrfs_fs_info *fs_info = root->fs_info;
8159        bool final_tried = num_bytes == min_alloc_size;
8160        u64 flags;
8161        int ret;
8162
8163        flags = get_alloc_profile_by_root(root, is_data);
8164again:
8165        WARN_ON(num_bytes < fs_info->sectorsize);
8166        ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
8167                               hint_byte, ins, flags, delalloc);
8168        if (!ret && !is_data) {
8169                btrfs_dec_block_group_reservations(fs_info, ins->objectid);
8170        } else if (ret == -ENOSPC) {
8171                if (!final_tried && ins->offset) {
8172                        num_bytes = min(num_bytes >> 1, ins->offset);
8173                        num_bytes = round_down(num_bytes,
8174                                               fs_info->sectorsize);
8175                        num_bytes = max(num_bytes, min_alloc_size);
8176                        ram_bytes = num_bytes;
8177                        if (num_bytes == min_alloc_size)
8178                                final_tried = true;
8179                        goto again;
8180                } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8181                        struct btrfs_space_info *sinfo;
8182
8183                        sinfo = __find_space_info(fs_info, flags);
8184                        btrfs_err(fs_info,
8185                                  "allocation failed flags %llu, wanted %llu",
8186                                  flags, num_bytes);
8187                        if (sinfo)
8188                                dump_space_info(fs_info, sinfo, num_bytes, 1);
8189                }
8190        }
8191
8192        return ret;
8193}
8194
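/*
 * A minimal sketch of how a caller might use btrfs_reserve_extent() as
 * documented above.  The function name here is hypothetical, and @num_bytes
 * is assumed to already be aligned to fs_info->sectorsize.
 */
static int example_reserve_data_extent(struct btrfs_root *root, u64 num_bytes)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_key ins = { 0 };
        int ret;

        /* is_data == 1, no delalloc, no hint, allow fallback to one sector */
        ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
                                   fs_info->sectorsize, 0, 0, &ins, 1, 0);
        if (ret == -ENOSPC) {
                /* ins.offset now holds the largest hole that was found */
                return ret;
        }
        if (ret)
                return ret;

        /*
         * ins.objectid is the start of the reserved range and ins.offset its
         * length.  A real caller would go on to create the extent items; if
         * that fails, the reservation must be released again.
         */
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
        return 0;
}
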
8195static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
8196                                        u64 start, u64 len,
8197                                        int pin, int delalloc)
8198{
8199        struct btrfs_block_group_cache *cache;
8200        int ret = 0;
8201
8202        cache = btrfs_lookup_block_group(fs_info, start);
8203        if (!cache) {
8204                btrfs_err(fs_info, "Unable to find block group for %llu",
8205                          start);
8206                return -ENOSPC;
8207        }
8208
8209        if (pin)
8210                pin_down_extent(fs_info, cache, start, len, 1);
8211        else {
8212                if (btrfs_test_opt(fs_info, DISCARD))
8213                        ret = btrfs_discard_extent(fs_info, start, len, NULL);
8214                btrfs_add_free_space(cache, start, len);
8215                btrfs_free_reserved_bytes(cache, len, delalloc);
8216                trace_btrfs_reserved_extent_free(fs_info, start, len);
8217        }
8218
8219        btrfs_put_block_group(cache);
8220        return ret;
8221}
8222
8223int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
8224                               u64 start, u64 len, int delalloc)
8225{
8226        return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
8227}
8228
8229int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
8230                                       u64 start, u64 len)
8231{
8232        return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
8233}
8234
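/*
 * The data extent item created below carries exactly one inline backref right
 * after the btrfs_extent_item: a btrfs_shared_data_ref keyed by @parent when
 * parent > 0, otherwise a btrfs_extent_data_ref recording root/owner/offset.
 * The item size accounts for that single inline ref.
 */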
8235static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8236                                      u64 parent, u64 root_objectid,
8237                                      u64 flags, u64 owner, u64 offset,
8238                                      struct btrfs_key *ins, int ref_mod)
8239{
8240        struct btrfs_fs_info *fs_info = trans->fs_info;
8241        int ret;
8242        struct btrfs_extent_item *extent_item;
8243        struct btrfs_extent_inline_ref *iref;
8244        struct btrfs_path *path;
8245        struct extent_buffer *leaf;
8246        int type;
8247        u32 size;
8248
8249        if (parent > 0)
8250                type = BTRFS_SHARED_DATA_REF_KEY;
8251        else
8252                type = BTRFS_EXTENT_DATA_REF_KEY;
8253
8254        size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
8255
8256        path = btrfs_alloc_path();
8257        if (!path)
8258                return -ENOMEM;
8259
8260        path->leave_spinning = 1;
8261        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8262                                      ins, size);
8263        if (ret) {
8264                btrfs_free_path(path);
8265                return ret;
8266        }
8267
8268        leaf = path->nodes[0];
8269        extent_item = btrfs_item_ptr(leaf, path->slots[0],
8270                                     struct btrfs_extent_item);
8271        btrfs_set_extent_refs(leaf, extent_item, ref_mod);
8272        btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8273        btrfs_set_extent_flags(leaf, extent_item,
8274                               flags | BTRFS_EXTENT_FLAG_DATA);
8275
8276        iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8277        btrfs_set_extent_inline_ref_type(leaf, iref, type);
8278        if (parent > 0) {
8279                struct btrfs_shared_data_ref *ref;
8280                ref = (struct btrfs_shared_data_ref *)(iref + 1);
8281                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
8282                btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
8283        } else {
8284                struct btrfs_extent_data_ref *ref;
8285                ref = (struct btrfs_extent_data_ref *)(&iref->offset);
8286                btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
8287                btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
8288                btrfs_set_extent_data_ref_offset(leaf, ref, offset);
8289                btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
8290        }
8291
8292        btrfs_mark_buffer_dirty(path->nodes[0]);
8293        btrfs_free_path(path);
8294
8295        ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
8296        if (ret)
8297                return ret;
8298
8299        ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
8300        if (ret) { /* -ENOENT, logic error */
8301                btrfs_err(fs_info, "update block group failed for %llu %llu",
8302                        ins->objectid, ins->offset);
8303                BUG();
8304        }
8305        trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
8306        return ret;
8307}
8308
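/*
 * The tree block extent item created below has two on-disk layouts: with the
 * SKINNY_METADATA feature the key is (bytenr, METADATA_ITEM, level) and only
 * the inline ref follows the extent item; without it the key is
 * (bytenr, EXTENT_ITEM, num_bytes) and a btrfs_tree_block_info (key + level)
 * sits between the extent item and the inline ref.  The inline ref is either
 * a shared block ref (offset = parent) or a tree block ref (offset = root).
 */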
8309static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
8310                                     struct btrfs_delayed_ref_node *node,
8311                                     struct btrfs_delayed_extent_op *extent_op)
8312{
8313        struct btrfs_fs_info *fs_info = trans->fs_info;
8314        int ret;
8315        struct btrfs_extent_item *extent_item;
8316        struct btrfs_key extent_key;
8317        struct btrfs_tree_block_info *block_info;
8318        struct btrfs_extent_inline_ref *iref;
8319        struct btrfs_path *path;
8320        struct extent_buffer *leaf;
8321        struct btrfs_delayed_tree_ref *ref;
8322        u32 size = sizeof(*extent_item) + sizeof(*iref);
8323        u64 num_bytes;
8324        u64 flags = extent_op->flags_to_set;
8325        bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8326
8327        ref = btrfs_delayed_node_to_tree_ref(node);
8328
8329        extent_key.objectid = node->bytenr;
8330        if (skinny_metadata) {
8331                extent_key.offset = ref->level;
8332                extent_key.type = BTRFS_METADATA_ITEM_KEY;
8333                num_bytes = fs_info->nodesize;
8334        } else {
8335                extent_key.offset = node->num_bytes;
8336                extent_key.type = BTRFS_EXTENT_ITEM_KEY;
8337                size += sizeof(*block_info);
8338                num_bytes = node->num_bytes;
8339        }
8340
8341        path = btrfs_alloc_path();
8342        if (!path)
8343                return -ENOMEM;
8344
8345        path->leave_spinning = 1;
8346        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
8347                                      &extent_key, size);
8348        if (ret) {
8349                btrfs_free_path(path);
8350                return ret;
8351        }
8352
8353        leaf = path->nodes[0];
8354        extent_item = btrfs_item_ptr(leaf, path->slots[0],
8355                                     struct btrfs_extent_item);
8356        btrfs_set_extent_refs(leaf, extent_item, 1);
8357        btrfs_set_extent_generation(leaf, extent_item, trans->transid);
8358        btrfs_set_extent_flags(leaf, extent_item,
8359                               flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
8360
8361        if (skinny_metadata) {
8362                iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8363        } else {
8364                block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
8365                btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
8366                btrfs_set_tree_block_level(leaf, block_info, ref->level);
8367                iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
8368        }
8369
8370        if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
8371                BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
8372                btrfs_set_extent_inline_ref_type(leaf, iref,
8373                                                 BTRFS_SHARED_BLOCK_REF_KEY);
8374                btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
8375        } else {
8376                btrfs_set_extent_inline_ref_type(leaf, iref,
8377                                                 BTRFS_TREE_BLOCK_REF_KEY);
8378                btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
8379        }
8380
8381        btrfs_mark_buffer_dirty(leaf);
8382        btrfs_free_path(path);
8383
8384        ret = remove_from_free_space_tree(trans, extent_key.objectid,
8385                                          num_bytes);
8386        if (ret)
8387                return ret;
8388
8389        ret = update_block_group(trans, fs_info, extent_key.objectid,
8390                                 fs_info->nodesize, 1);
8391        if (ret) { /* -ENOENT, logic error */
8392                btrfs_err(fs_info, "update block group failed for %llu %llu",
8393                        extent_key.objectid, extent_key.offset);
8394                BUG();
8395        }
8396
8397        trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
8398                                          fs_info->nodesize);
8399        return ret;
8400}
8401
8402int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
8403                                     struct btrfs_root *root, u64 owner,
8404                                     u64 offset, u64 ram_bytes,
8405                                     struct btrfs_key *ins)
8406{
8407        int ret;
8408
8409        BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
8410
8411        btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
8412                           root->root_key.objectid, owner, offset,
8413                           BTRFS_ADD_DELAYED_EXTENT);
8414
8415        ret = btrfs_add_delayed_data_ref(trans, ins->objectid,
8416                                         ins->offset, 0,
8417                                         root->root_key.objectid, owner,
8418                                         offset, ram_bytes,
8419                                         BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
8420        return ret;
8421}
8422
8423/*
8424 * this is used by the tree logging recovery code.  It records that
8425 * an extent has been allocated and makes sure to clear the free
8426 * space cache bits as well
8427 */
8428int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
8429                                   u64 root_objectid, u64 owner, u64 offset,
8430                                   struct btrfs_key *ins)
8431{
8432        struct btrfs_fs_info *fs_info = trans->fs_info;
8433        int ret;
8434        struct btrfs_block_group_cache *block_group;
8435        struct btrfs_space_info *space_info;
8436
8437        /*
8438         * Mixed block groups will exclude before processing the log so we only
8439         * need to do the exclude dance if this fs isn't mixed.
8440         */
8441        if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
8442                ret = __exclude_logged_extent(fs_info, ins->objectid,
8443                                              ins->offset);
8444                if (ret)
8445                        return ret;
8446        }
8447
8448        block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
8449        if (!block_group)
8450                return -EINVAL;
8451
8452        space_info = block_group->space_info;
8453        spin_lock(&space_info->lock);
8454        spin_lock(&block_group->lock);
8455        space_info->bytes_reserved += ins->offset;
8456        block_group->reserved += ins->offset;
8457        spin_unlock(&block_group->lock);
8458        spin_unlock(&space_info->lock);
8459
8460        ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
8461                                         offset, ins, 1);
8462        btrfs_put_block_group(block_group);
8463        return ret;
8464}
8465
8466static struct extent_buffer *
8467btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8468                      u64 bytenr, int level, u64 owner)
8469{
8470        struct btrfs_fs_info *fs_info = root->fs_info;
8471        struct extent_buffer *buf;
8472
8473        buf = btrfs_find_create_tree_block(fs_info, bytenr);
8474        if (IS_ERR(buf))
8475                return buf;
8476
8477        /*
8478         * Extra safety check in case the extent tree is corrupted and extent
8479         * allocator chooses to use a tree block which is already used and
8480         * locked.
8481         */
8482        if (buf->lock_owner == current->pid) {
8483                btrfs_err_rl(fs_info,
8484"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
8485                        buf->start, btrfs_header_owner(buf), current->pid);
8486                free_extent_buffer(buf);
8487                return ERR_PTR(-EUCLEAN);
8488        }
8489
8490        btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
8491        btrfs_tree_lock(buf);
8492        clean_tree_block(fs_info, buf);
8493        clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
8494
8495        btrfs_set_lock_blocking(buf);
8496        set_extent_buffer_uptodate(buf);
8497
8498        memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
8499        btrfs_set_header_level(buf, level);
8500        btrfs_set_header_bytenr(buf, buf->start);
8501        btrfs_set_header_generation(buf, trans->transid);
8502        btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
8503        btrfs_set_header_owner(buf, owner);
8504        write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
8505        write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
8506        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
8507                buf->log_index = root->log_transid % 2;
8508                /*
8509                  * we allow two log transactions at a time; use a different
8510                  * EXTENT bit to differentiate dirty pages.
8511                 */
8512                if (buf->log_index == 0)
8513                        set_extent_dirty(&root->dirty_log_pages, buf->start,
8514                                        buf->start + buf->len - 1, GFP_NOFS);
8515                else
8516                        set_extent_new(&root->dirty_log_pages, buf->start,
8517                                        buf->start + buf->len - 1);
8518        } else {
8519                buf->log_index = -1;
8520                set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
8521                         buf->start + buf->len - 1, GFP_NOFS);
8522        }
8523        trans->dirty = true;
8524        /* this returns a buffer locked for blocking */
8525        return buf;
8526}
8527
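/*
 * Reservation order used by use_block_rsv() below: try the root's own block
 * rsv first (refreshing the global reserve once if that is the rsv in use),
 * then fall back to a no-flush metadata reservation, and finally, if this rsv
 * shares its space_info with the global reserve, take the blocksize from the
 * global reserve as a last resort.
 */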
8528static struct btrfs_block_rsv *
8529use_block_rsv(struct btrfs_trans_handle *trans,
8530              struct btrfs_root *root, u32 blocksize)
8531{
8532        struct btrfs_fs_info *fs_info = root->fs_info;
8533        struct btrfs_block_rsv *block_rsv;
8534        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
8535        int ret;
8536        bool global_updated = false;
8537
8538        block_rsv = get_block_rsv(trans, root);
8539
8540        if (unlikely(block_rsv->size == 0))
8541                goto try_reserve;
8542again:
8543        ret = block_rsv_use_bytes(block_rsv, blocksize);
8544        if (!ret)
8545                return block_rsv;
8546
8547        if (block_rsv->failfast)
8548                return ERR_PTR(ret);
8549
8550        if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
8551                global_updated = true;
8552                update_global_block_rsv(fs_info);
8553                goto again;
8554        }
8555
8556        /*
8557         * The global reserve still exists to save us from ourselves, so don't
8558         * warn_on if we are short on our delayed refs reserve.
8559         */
8560        if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
8561            btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
8562                static DEFINE_RATELIMIT_STATE(_rs,
8563                                DEFAULT_RATELIMIT_INTERVAL * 10,
8564                                /*DEFAULT_RATELIMIT_BURST*/ 1);
8565                if (__ratelimit(&_rs))
8566                        WARN(1, KERN_DEBUG
8567                                "BTRFS: block rsv returned %d\n", ret);
8568        }
8569try_reserve:
8570        ret = reserve_metadata_bytes(root, block_rsv, blocksize,
8571                                     BTRFS_RESERVE_NO_FLUSH);
8572        if (!ret)
8573                return block_rsv;
8574        /*
8575         * If we couldn't reserve metadata bytes, try to use some from
8576         * the global reserve, provided this rsv shares its space info
8577         * with the global reservation.
8578         */
8579        if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
8580            block_rsv->space_info == global_rsv->space_info) {
8581                ret = block_rsv_use_bytes(global_rsv, blocksize);
8582                if (!ret)
8583                        return global_rsv;
8584        }
8585        return ERR_PTR(ret);
8586}
8587
8588static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
8589                            struct btrfs_block_rsv *block_rsv, u32 blocksize)
8590{
8591        block_rsv_add_bytes(block_rsv, blocksize, false);
8592        block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
8593}
8594
8595/*
8596 * finds a free extent and does all the dirty work required for allocation;
8597 * returns the tree buffer or an ERR_PTR on error.
8598 */
8599struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
8600                                             struct btrfs_root *root,
8601                                             u64 parent, u64 root_objectid,
8602                                             const struct btrfs_disk_key *key,
8603                                             int level, u64 hint,
8604                                             u64 empty_size)
8605{
8606        struct btrfs_fs_info *fs_info = root->fs_info;
8607        struct btrfs_key ins;
8608        struct btrfs_block_rsv *block_rsv;
8609        struct extent_buffer *buf;
8610        struct btrfs_delayed_extent_op *extent_op;
8611        u64 flags = 0;
8612        int ret;
8613        u32 blocksize = fs_info->nodesize;
8614        bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
8615
8616#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8617        if (btrfs_is_testing(fs_info)) {
8618                buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
8619                                            level, root_objectid);
8620                if (!IS_ERR(buf))
8621                        root->alloc_bytenr += blocksize;
8622                return buf;
8623        }
8624#endif
8625
8626        block_rsv = use_block_rsv(trans, root, blocksize);
8627        if (IS_ERR(block_rsv))
8628                return ERR_CAST(block_rsv);
8629
8630        ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
8631                                   empty_size, hint, &ins, 0, 0);
8632        if (ret)
8633                goto out_unuse;
8634
8635        buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
8636                                    root_objectid);
8637        if (IS_ERR(buf)) {
8638                ret = PTR_ERR(buf);
8639                goto out_free_reserved;
8640        }
8641
8642        if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
8643                if (parent == 0)
8644                        parent = ins.objectid;
8645                flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
8646        } else
8647                BUG_ON(parent > 0);
8648
8649        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
8650                extent_op = btrfs_alloc_delayed_extent_op();
8651                if (!extent_op) {
8652                        ret = -ENOMEM;
8653                        goto out_free_buf;
8654                }
8655                if (key)
8656                        memcpy(&extent_op->key, key, sizeof(extent_op->key));
8657                else
8658                        memset(&extent_op->key, 0, sizeof(extent_op->key));
8659                extent_op->flags_to_set = flags;
8660                extent_op->update_key = skinny_metadata ? false : true;
8661                extent_op->update_flags = true;
8662                extent_op->is_data = false;
8663                extent_op->level = level;
8664
8665                btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
8666                                   root_objectid, level, 0,
8667                                   BTRFS_ADD_DELAYED_EXTENT);
8668                ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
8669                                                 ins.offset, parent,
8670                                                 root_objectid, level,
8671                                                 BTRFS_ADD_DELAYED_EXTENT,
8672                                                 extent_op, NULL, NULL);
8673                if (ret)
8674                        goto out_free_delayed;
8675        }
8676        return buf;
8677
8678out_free_delayed:
8679        btrfs_free_delayed_extent_op(extent_op);
8680out_free_buf:
8681        free_extent_buffer(buf);
8682out_free_reserved:
8683        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
8684out_unuse:
8685        unuse_block_rsv(fs_info, block_rsv, blocksize);
8686        return ERR_PTR(ret);
8687}
8688
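/*
 * State shared by the tree-walking helpers below.  refs[] and flags[] cache
 * the reference count and backref flags of the block held at each level of
 * the path.  The walk runs in one of two stages: DROP_REFERENCE drops our
 * reference on blocks that are only referenced by this tree, and switches to
 * UPDATE_BACKREF when it hits a shared block whose subtree still needs its
 * backrefs converted; shared_level records where that conversion started and
 * update_progress roughly tracks how far it has gone, so already handled
 * subtrees can be skipped.  reada_slot and reada_count drive the readahead
 * done in reada_walk_down().
 */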
8689struct walk_control {
8690        u64 refs[BTRFS_MAX_LEVEL];
8691        u64 flags[BTRFS_MAX_LEVEL];
8692        struct btrfs_key update_progress;
8693        int stage;
8694        int level;
8695        int shared_level;
8696        int update_ref;
8697        int keep_locks;
8698        int reada_slot;
8699        int reada_count;
8700};
8701
8702#define DROP_REFERENCE  1
8703#define UPDATE_BACKREF  2
8704
8705static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
8706                                     struct btrfs_root *root,
8707                                     struct walk_control *wc,
8708                                     struct btrfs_path *path)
8709{
8710        struct btrfs_fs_info *fs_info = root->fs_info;
8711        u64 bytenr;
8712        u64 generation;
8713        u64 refs;
8714        u64 flags;
8715        u32 nritems;
8716        struct btrfs_key key;
8717        struct extent_buffer *eb;
8718        int ret;
8719        int slot;
8720        int nread = 0;
8721
8722        if (path->slots[wc->level] < wc->reada_slot) {
8723                wc->reada_count = wc->reada_count * 2 / 3;
8724                wc->reada_count = max(wc->reada_count, 2);
8725        } else {
8726                wc->reada_count = wc->reada_count * 3 / 2;
8727                wc->reada_count = min_t(int, wc->reada_count,
8728                                        BTRFS_NODEPTRS_PER_BLOCK(fs_info));
8729        }
8730
8731        eb = path->nodes[wc->level];
8732        nritems = btrfs_header_nritems(eb);
8733
8734        for (slot = path->slots[wc->level]; slot < nritems; slot++) {
8735                if (nread >= wc->reada_count)
8736                        break;
8737
8738                cond_resched();
8739                bytenr = btrfs_node_blockptr(eb, slot);
8740                generation = btrfs_node_ptr_generation(eb, slot);
8741
8742                if (slot == path->slots[wc->level])
8743                        goto reada;
8744
8745                if (wc->stage == UPDATE_BACKREF &&
8746                    generation <= root->root_key.offset)
8747                        continue;
8748
8749                /* We don't lock the tree block, it's OK to be racy here */
8750                ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
8751                                               wc->level - 1, 1, &refs,
8752                                               &flags);
8753                /* We don't care about errors in readahead. */
8754                if (ret < 0)
8755                        continue;
8756                BUG_ON(refs == 0);
8757
8758                if (wc->stage == DROP_REFERENCE) {
8759                        if (refs == 1)
8760                                goto reada;
8761
8762                        if (wc->level == 1 &&
8763                            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8764                                continue;
8765                        if (!wc->update_ref ||
8766                            generation <= root->root_key.offset)
8767                                continue;
8768                        btrfs_node_key_to_cpu(eb, &key, slot);
8769                        ret = btrfs_comp_cpu_keys(&key,
8770                                                  &wc->update_progress);
8771                        if (ret < 0)
8772                                continue;
8773                } else {
8774                        if (wc->level == 1 &&
8775                            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8776                                continue;
8777                }
8778reada:
8779                readahead_tree_block(fs_info, bytenr);
8780                nread++;
8781        }
8782        wc->reada_slot = slot;
8783}
8784
8785/*
8786 * helper to process tree block while walking down the tree.
8787 *
8788 * when wc->stage == UPDATE_BACKREF, this function updates
8789 * back refs for pointers in the block.
8790 *
8791 * NOTE: return value 1 means we should stop walking down.
8792 */
8793static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
8794                                   struct btrfs_root *root,
8795                                   struct btrfs_path *path,
8796                                   struct walk_control *wc, int lookup_info)
8797{
8798        struct btrfs_fs_info *fs_info = root->fs_info;
8799        int level = wc->level;
8800        struct extent_buffer *eb = path->nodes[level];
8801        u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8802        int ret;
8803
8804        if (wc->stage == UPDATE_BACKREF &&
8805            btrfs_header_owner(eb) != root->root_key.objectid)
8806                return 1;
8807
8808        /*
8809         * when the reference count of a tree block is 1, it won't increase
8810         * again. once the full backref flag is set, we never clear it.
8811         */
8812        if (lookup_info &&
8813            ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
8814             (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8815                BUG_ON(!path->locks[level]);
8816                ret = btrfs_lookup_extent_info(trans, fs_info,
8817                                               eb->start, level, 1,
8818                                               &wc->refs[level],
8819                                               &wc->flags[level]);
8820                BUG_ON(ret == -ENOMEM);
8821                if (ret)
8822                        return ret;
8823                BUG_ON(wc->refs[level] == 0);
8824        }
8825
8826        if (wc->stage == DROP_REFERENCE) {
8827                if (wc->refs[level] > 1)
8828                        return 1;
8829
8830                if (path->locks[level] && !wc->keep_locks) {
8831                        btrfs_tree_unlock_rw(eb, path->locks[level]);
8832                        path->locks[level] = 0;
8833                }
8834                return 0;
8835        }
8836
8837        /* wc->stage == UPDATE_BACKREF */
8838        if (!(wc->flags[level] & flag)) {
8839                BUG_ON(!path->locks[level]);
8840                ret = btrfs_inc_ref(trans, root, eb, 1);
8841                BUG_ON(ret); /* -ENOMEM */
8842                ret = btrfs_dec_ref(trans, root, eb, 0);
8843                BUG_ON(ret); /* -ENOMEM */
8844                ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
8845                                                  eb->len, flag,
8846                                                  btrfs_header_level(eb), 0);
8847                BUG_ON(ret); /* -ENOMEM */
8848                wc->flags[level] |= flag;
8849        }
8850
8851        /*
8852         * the block is shared by multiple trees, so it's not good to
8853         * keep the tree lock
8854         */
8855        if (path->locks[level] && level > 0) {
8856                btrfs_tree_unlock_rw(eb, path->locks[level]);
8857                path->locks[level] = 0;
8858        }
8859        return 0;
8860}
8861
8862/*
8863 * helper to process tree block pointer.
8864 *
8865 * when wc->stage == DROP_REFERENCE, this function checks the
8866 * reference count of the block pointed to. if the block
8867 * is shared and we need to update back refs for the subtree
8868 * rooted at the block, this function changes wc->stage to
8869 * UPDATE_BACKREF. if the block is shared and there is no
8870 * need to update its back refs, this function drops the
8871 * reference to the block.
8872 *
8873 * NOTE: return value 1 means we should stop walking down.
8874 */
8875static noinline int do_walk_down(struct btrfs_trans_handle *trans,
8876                                 struct btrfs_root *root,
8877                                 struct btrfs_path *path,
8878                                 struct walk_control *wc, int *lookup_info)
8879{
8880        struct btrfs_fs_info *fs_info = root->fs_info;
8881        u64 bytenr;
8882        u64 generation;
8883        u64 parent;
8884        struct btrfs_key key;
8885        struct btrfs_key first_key;
8886        struct extent_buffer *next;
8887        int level = wc->level;
8888        int reada = 0;
8889        int ret = 0;
8890        bool need_account = false;
8891
8892        generation = btrfs_node_ptr_generation(path->nodes[level],
8893                                               path->slots[level]);
8894        /*
8895         * if the lower level block was created before the snapshot
8896         * was created, we know there is no need to update back refs
8897         * for the subtree
8898         */
8899        if (wc->stage == UPDATE_BACKREF &&
8900            generation <= root->root_key.offset) {
8901                *lookup_info = 1;
8902                return 1;
8903        }
8904
8905        bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
8906        btrfs_node_key_to_cpu(path->nodes[level], &first_key,
8907                              path->slots[level]);
8908
8909        next = find_extent_buffer(fs_info, bytenr);
8910        if (!next) {
8911                next = btrfs_find_create_tree_block(fs_info, bytenr);
8912                if (IS_ERR(next))
8913                        return PTR_ERR(next);
8914
8915                btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
8916                                               level - 1);
8917                reada = 1;
8918        }
8919        btrfs_tree_lock(next);
8920        btrfs_set_lock_blocking(next);
8921
8922        ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
8923                                       &wc->refs[level - 1],
8924                                       &wc->flags[level - 1]);
8925        if (ret < 0)
8926                goto out_unlock;
8927
8928        if (unlikely(wc->refs[level - 1] == 0)) {
8929                btrfs_err(fs_info, "Missing references.");
8930                ret = -EIO;
8931                goto out_unlock;
8932        }
8933        *lookup_info = 0;
8934
8935        if (wc->stage == DROP_REFERENCE) {
8936                if (wc->refs[level - 1] > 1) {
8937                        need_account = true;
8938                        if (level == 1 &&
8939                            (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8940                                goto skip;
8941
8942                        if (!wc->update_ref ||
8943                            generation <= root->root_key.offset)
8944                                goto skip;
8945
8946                        btrfs_node_key_to_cpu(path->nodes[level], &key,
8947                                              path->slots[level]);
8948                        ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
8949                        if (ret < 0)
8950                                goto skip;
8951
8952                        wc->stage = UPDATE_BACKREF;
8953                        wc->shared_level = level - 1;
8954                }
8955        } else {
8956                if (level == 1 &&
8957                    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
8958                        goto skip;
8959        }
8960
8961        if (!btrfs_buffer_uptodate(next, generation, 0)) {
8962                btrfs_tree_unlock(next);
8963                free_extent_buffer(next);
8964                next = NULL;
8965                *lookup_info = 1;
8966        }
8967
8968        if (!next) {
8969                if (reada && level == 1)
8970                        reada_walk_down(trans, root, wc, path);
8971                next = read_tree_block(fs_info, bytenr, generation, level - 1,
8972                                       &first_key);
8973                if (IS_ERR(next)) {
8974                        return PTR_ERR(next);
8975                } else if (!extent_buffer_uptodate(next)) {
8976                        free_extent_buffer(next);
8977                        return -EIO;
8978                }
8979                btrfs_tree_lock(next);
8980                btrfs_set_lock_blocking(next);
8981        }
8982
8983        level--;
8984        ASSERT(level == btrfs_header_level(next));
8985        if (level != btrfs_header_level(next)) {
8986                btrfs_err(root->fs_info, "mismatched level");
8987                ret = -EIO;
8988                goto out_unlock;
8989        }
8990        path->nodes[level] = next;
8991        path->slots[level] = 0;
8992        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8993        wc->level = level;
8994        if (wc->level == 1)
8995                wc->reada_slot = 0;
8996        return 0;
8997skip:
8998        wc->refs[level - 1] = 0;
8999        wc->flags[level - 1] = 0;
9000        if (wc->stage == DROP_REFERENCE) {
9001                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
9002                        parent = path->nodes[level]->start;
9003                } else {
9004                        ASSERT(root->root_key.objectid ==
9005                               btrfs_header_owner(path->nodes[level]));
9006                        if (root->root_key.objectid !=
9007                            btrfs_header_owner(path->nodes[level])) {
9008                                btrfs_err(root->fs_info,
9009                                                "mismatched block owner");
9010                                ret = -EIO;
9011                                goto out_unlock;
9012                        }
9013                        parent = 0;
9014                }
9015
9016                /*
9017                 * Reloc tree doesn't contribute to qgroup numbers, and we have
9018                 * already accounted them at merge time (replace_path),
9019                 * thus we could skip expensive subtree trace here.
9020                 */
9021                if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
9022                    need_account) {
9023                        ret = btrfs_qgroup_trace_subtree(trans, next,
9024                                                         generation, level - 1);
9025                        if (ret) {
9026                                btrfs_err_rl(fs_info,
9027                                             "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
9028                                             ret);
9029                        }
9030                }
9031                ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize,
9032                                        parent, root->root_key.objectid,
9033                                        level - 1, 0);
9034                if (ret)
9035                        goto out_unlock;
9036        }
9037
9038        *lookup_info = 1;
9039        ret = 1;
9040
9041out_unlock:
9042        btrfs_tree_unlock(next);
9043        free_extent_buffer(next);
9044
9045        return ret;
9046}
9047
9048/*
9049 * helper to process tree block while walking up the tree.
9050 *
9051 * when wc->stage == DROP_REFERENCE, this function drops
9052 * reference count on the block.
9053 *
9054 * when wc->stage == UPDATE_BACKREF, this function changes
9055 * wc->stage back to DROP_REFERENCE if we changed wc->stage
9056 * to UPDATE_BACKREF previously while processing the block.
9057 *
9058 * NOTE: return value 1 means we should stop walking up.
9059 */
9060static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
9061                                 struct btrfs_root *root,
9062                                 struct btrfs_path *path,
9063                                 struct walk_control *wc)
9064{
9065        struct btrfs_fs_info *fs_info = root->fs_info;
9066        int ret;
9067        int level = wc->level;
9068        struct extent_buffer *eb = path->nodes[level];
9069        u64 parent = 0;
9070
9071        if (wc->stage == UPDATE_BACKREF) {
9072                BUG_ON(wc->shared_level < level);
9073                if (level < wc->shared_level)
9074                        goto out;
9075
9076                ret = find_next_key(path, level + 1, &wc->update_progress);
9077                if (ret > 0)
9078                        wc->update_ref = 0;
9079
9080                wc->stage = DROP_REFERENCE;
9081                wc->shared_level = -1;
9082                path->slots[level] = 0;
9083
9084                /*
9085                 * check the reference count again if the block isn't locked.
9086                 * we should start walking down the tree again if the
9087                 * reference count is one.
9088                 */
9089                if (!path->locks[level]) {
9090                        BUG_ON(level == 0);
9091                        btrfs_tree_lock(eb);
9092                        btrfs_set_lock_blocking(eb);
9093                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9094
9095                        ret = btrfs_lookup_extent_info(trans, fs_info,
9096                                                       eb->start, level, 1,
9097                                                       &wc->refs[level],
9098                                                       &wc->flags[level]);
9099                        if (ret < 0) {
9100                                btrfs_tree_unlock_rw(eb, path->locks[level]);
9101                                path->locks[level] = 0;
9102                                return ret;
9103                        }
9104                        BUG_ON(wc->refs[level] == 0);
9105                        if (wc->refs[level] == 1) {
9106                                btrfs_tree_unlock_rw(eb, path->locks[level]);
9107                                path->locks[level] = 0;
9108                                return 1;
9109                        }
9110                }
9111        }
9112
9113        /* wc->stage == DROP_REFERENCE */
9114        BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
9115
9116        if (wc->refs[level] == 1) {
9117                if (level == 0) {
9118                        if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9119                                ret = btrfs_dec_ref(trans, root, eb, 1);
9120                        else
9121                                ret = btrfs_dec_ref(trans, root, eb, 0);
9122                        BUG_ON(ret); /* -ENOMEM */
9123                        ret = btrfs_qgroup_trace_leaf_items(trans, eb);
9124                        if (ret) {
9125                                btrfs_err_rl(fs_info,
9126                                             "error %d accounting leaf items. Quota is out of sync, rescan required.",
9127                                             ret);
9128                        }
9129                }
9130                /* make block locked assertion in clean_tree_block happy */
9131                if (!path->locks[level] &&
9132                    btrfs_header_generation(eb) == trans->transid) {
9133                        btrfs_tree_lock(eb);
9134                        btrfs_set_lock_blocking(eb);
9135                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9136                }
9137                clean_tree_block(fs_info, eb);
9138        }
9139
9140        if (eb == root->node) {
9141                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9142                        parent = eb->start;
9143                else if (root->root_key.objectid != btrfs_header_owner(eb))
9144                        goto owner_mismatch;
9145        } else {
9146                if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9147                        parent = path->nodes[level + 1]->start;
9148                else if (root->root_key.objectid !=
9149                         btrfs_header_owner(path->nodes[level + 1]))
9150                        goto owner_mismatch;
9151        }
9152
9153        btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
9154out:
9155        wc->refs[level] = 0;
9156        wc->flags[level] = 0;
9157        return 0;
9158
9159owner_mismatch:
9160        btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
9161                     btrfs_header_owner(eb), root->root_key.objectid);
9162        return -EUCLEAN;
9163}
9164
9165static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
9166                                   struct btrfs_root *root,
9167                                   struct btrfs_path *path,
9168                                   struct walk_control *wc)
9169{
9170        int level = wc->level;
9171        int lookup_info = 1;
9172        int ret;
9173
9174        while (level >= 0) {
9175                ret = walk_down_proc(trans, root, path, wc, lookup_info);
9176                if (ret > 0)
9177                        break;
9178
9179                if (level == 0)
9180                        break;
9181
9182                if (path->slots[level] >=
9183                    btrfs_header_nritems(path->nodes[level]))
9184                        break;
9185
9186                ret = do_walk_down(trans, root, path, wc, &lookup_info);
9187                if (ret > 0) {
9188                        path->slots[level]++;
9189                        continue;
9190                } else if (ret < 0)
9191                        return ret;
9192                level = wc->level;
9193        }
9194        return 0;
9195}
9196
9197static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
9198                                 struct btrfs_root *root,
9199                                 struct btrfs_path *path,
9200                                 struct walk_control *wc, int max_level)
9201{
9202        int level = wc->level;
9203        int ret;
9204
9205        path->slots[level] = btrfs_header_nritems(path->nodes[level]);
9206        while (level < max_level && path->nodes[level]) {
9207                wc->level = level;
9208                if (path->slots[level] + 1 <
9209                    btrfs_header_nritems(path->nodes[level])) {
9210                        path->slots[level]++;
9211                        return 0;
9212                } else {
9213                        ret = walk_up_proc(trans, root, path, wc);
9214                        if (ret > 0)
9215                                return 0;
9216                        if (ret < 0)
9217                                return ret;
9218
9219                        if (path->locks[level]) {
9220                                btrfs_tree_unlock_rw(path->nodes[level],
9221                                                     path->locks[level]);
9222                                path->locks[level] = 0;
9223                        }
9224                        free_extent_buffer(path->nodes[level]);
9225                        path->nodes[level] = NULL;
9226                        level++;
9227                }
9228        }
9229        return 1;
9230}
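
/*
 * The drop loop below alternates the two helpers above: walk_down_tree()
 * descends as far as walk_down_proc() allows, dropping references on shared
 * subtrees via do_walk_down() along the way, then walk_up_tree() frees the
 * blocks on the way back up and advances to the next slot.  Once
 * walk_up_tree() returns 1 the whole tree has been processed.
 */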
9231
9232/*
9233 * drop a subvolume tree.
9234 *
9235 * this function traverses the tree, freeing any blocks that are only
9236 * referenced by the tree.
9237 *
9238 * when a shared tree block is found, this function decreases its
9239 * reference count by one. if update_ref is true, this function
9240 * also makes sure backrefs for the shared block and all lower level
9241 * blocks are properly updated.
9242 *
9243 * If called with for_reloc == 0, may exit early with -EAGAIN
9244 */
9245int btrfs_drop_snapshot(struct btrfs_root *root,
9246                         struct btrfs_block_rsv *block_rsv, int update_ref,
9247                         int for_reloc)
9248{
9249        struct btrfs_fs_info *fs_info = root->fs_info;
9250        struct btrfs_path *path;
9251        struct btrfs_trans_handle *trans;
9252        struct btrfs_root *tree_root = fs_info->tree_root;
9253        struct btrfs_root_item *root_item = &root->root_item;
9254        struct walk_control *wc;
9255        struct btrfs_key key;
9256        int err = 0;
9257        int ret;
9258        int level;
9259        bool root_dropped = false;
9260
9261        btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
9262
9263        path = btrfs_alloc_path();
9264        if (!path) {
9265                err = -ENOMEM;
9266                goto out;
9267        }
9268
9269        wc = kzalloc(sizeof(*wc), GFP_NOFS);
9270        if (!wc) {
9271                btrfs_free_path(path);
9272                err = -ENOMEM;
9273                goto out;
9274        }
9275
9276        trans = btrfs_start_transaction(tree_root, 0);
9277        if (IS_ERR(trans)) {
9278                err = PTR_ERR(trans);
9279                goto out_free;
9280        }
9281
9282        err = btrfs_run_delayed_items(trans);
9283        if (err)
9284                goto out_end_trans;
9285
9286        if (block_rsv)
9287                trans->block_rsv = block_rsv;
9288
9289        /*
9290         * This will help us catch people modifying the fs tree while we're
9291         * dropping it.  It is unsafe to mess with the fs tree while it's being
9292         * dropped as we unlock the root node and parent nodes as we walk down
9293         * the tree, assuming nothing will change.  If something does change
9294         * then we'll have stale information and drop references to blocks we've
9295         * already dropped.
9296         */
9297        set_bit(BTRFS_ROOT_DELETING, &root->state);
9298        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
9299                level = btrfs_header_level(root->node);
9300                path->nodes[level] = btrfs_lock_root_node(root);
9301                btrfs_set_lock_blocking(path->nodes[level]);
9302                path->slots[level] = 0;
9303                path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9304                memset(&wc->update_progress, 0,
9305                       sizeof(wc->update_progress));
9306        } else {
9307                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
9308                memcpy(&wc->update_progress, &key,
9309                       sizeof(wc->update_progress));
9310
9311                level = root_item->drop_level;
9312                BUG_ON(level == 0);
9313                path->lowest_level = level;
9314                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
9315                path->lowest_level = 0;
9316                if (ret < 0) {
9317                        err = ret;
9318                        goto out_end_trans;
9319                }
9320                WARN_ON(ret > 0);
9321
9322                /*
9323                 * unlock our path, this is safe because only this
9324                 * function is allowed to delete this snapshot
9325                 */
9326                btrfs_unlock_up_safe(path, 0);
9327
9328                level = btrfs_header_level(root->node);
9329                while (1) {
9330                        btrfs_tree_lock(path->nodes[level]);
9331                        btrfs_set_lock_blocking(path->nodes[level]);
9332                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9333
9334                        ret = btrfs_lookup_extent_info(trans, fs_info,
9335                                                path->nodes[level]->start,
9336                                                level, 1, &wc->refs[level],
9337                                                &wc->flags[level]);
9338                        if (ret < 0) {
9339                                err = ret;
9340                                goto out_end_trans;
9341                        }
9342                        BUG_ON(wc->refs[level] == 0);
9343
9344                        if (level == root_item->drop_level)
9345                                break;
9346
9347                        btrfs_tree_unlock(path->nodes[level]);
9348                        path->locks[level] = 0;
9349                        WARN_ON(wc->refs[level] != 1);
9350                        level--;
9351                }
9352        }
9353
9354        wc->level = level;
9355        wc->shared_level = -1;
9356        wc->stage = DROP_REFERENCE;
9357        wc->update_ref = update_ref;
9358        wc->keep_locks = 0;
9359        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9360
9361        while (1) {
9362
9363                ret = walk_down_tree(trans, root, path, wc);
9364                if (ret < 0) {
9365                        err = ret;
9366                        break;
9367                }
9368
9369                ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
9370                if (ret < 0) {
9371                        err = ret;
9372                        break;
9373                }
9374
9375                if (ret > 0) {
9376                        BUG_ON(wc->stage != DROP_REFERENCE);
9377                        break;
9378                }
9379
9380                if (wc->stage == DROP_REFERENCE) {
9381                        level = wc->level;
9382                        btrfs_node_key(path->nodes[level],
9383                                       &root_item->drop_progress,
9384                                       path->slots[level]);
9385                        root_item->drop_level = level;
9386                }
9387
9388                BUG_ON(wc->level == 0);
9389                if (btrfs_should_end_transaction(trans) ||
9390                    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
9391                        ret = btrfs_update_root(trans, tree_root,
9392                                                &root->root_key,
9393                                                root_item);
9394                        if (ret) {
9395                                btrfs_abort_transaction(trans, ret);
9396                                err = ret;
9397                                goto out_end_trans;
9398                        }
9399
9400                        btrfs_end_transaction_throttle(trans);
9401                        if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
9402                                btrfs_debug(fs_info,
9403                                            "drop snapshot early exit");
9404                                err = -EAGAIN;
9405                                goto out_free;
9406                        }
9407
9408                        trans = btrfs_start_transaction(tree_root, 0);
9409                        if (IS_ERR(trans)) {
9410                                err = PTR_ERR(trans);
9411                                goto out_free;
9412                        }
9413                        if (block_rsv)
9414                                trans->block_rsv = block_rsv;
9415                }
9416        }
9417        btrfs_release_path(path);
9418        if (err)
9419                goto out_end_trans;
9420
9421        ret = btrfs_del_root(trans, &root->root_key);
9422        if (ret) {
9423                btrfs_abort_transaction(trans, ret);
9424                err = ret;
9425                goto out_end_trans;
9426        }
9427
9428        if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
9429                ret = btrfs_find_root(tree_root, &root->root_key, path,
9430                                      NULL, NULL);
9431                if (ret < 0) {
9432                        btrfs_abort_transaction(trans, ret);
9433                        err = ret;
9434                        goto out_end_trans;
9435                } else if (ret > 0) {
9436                        /* if we fail to delete the orphan item this time
9437                         * around, it'll get picked up the next time.
9438                         *
9439                         * The most common failure here is just -ENOENT.
9440                         */
9441                        btrfs_del_orphan_item(trans, tree_root,
9442                                              root->root_key.objectid);
9443                }
9444        }
9445
9446        if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
9447                btrfs_add_dropped_root(trans, root);
9448        } else {
9449                free_extent_buffer(root->node);
9450                free_extent_buffer(root->commit_root);
9451                btrfs_put_fs_root(root);
9452        }
9453        root_dropped = true;
9454out_end_trans:
9455        btrfs_end_transaction_throttle(trans);
9456out_free:
9457        kfree(wc);
9458        btrfs_free_path(path);
9459out:
9460        /*
9461         * If we need to stop dropping the snapshot for whatever reason, we
9462         * must add it back to the dead root list so that we keep trying to
9463         * do the work later.  This also cleans up roots that aren't in the
9464         * radix tree (like when we recover after a power failure or
9465         * unmount), so we don't leak memory.
9466         */
9467        if (!for_reloc && !root_dropped)
9468                btrfs_add_dead_root(root);
9469        if (err && err != -EAGAIN)
9470                btrfs_handle_fs_error(fs_info, err, NULL);
9471        return err;
9472}
9473
9474/*
9475 * Drop the subtree rooted at tree block 'node'.
9476 *
9477 * NOTE: this function will unlock and release tree block 'node'.
9478 * Only used by the relocation code.
9479 */
9480int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
9481                        struct btrfs_root *root,
9482                        struct extent_buffer *node,
9483                        struct extent_buffer *parent)
9484{
9485        struct btrfs_fs_info *fs_info = root->fs_info;
9486        struct btrfs_path *path;
9487        struct walk_control *wc;
9488        int level;
9489        int parent_level;
9490        int ret = 0;
9491        int wret;
9492
9493        BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
9494
9495        path = btrfs_alloc_path();
9496        if (!path)
9497                return -ENOMEM;
9498
9499        wc = kzalloc(sizeof(*wc), GFP_NOFS);
9500        if (!wc) {
9501                btrfs_free_path(path);
9502                return -ENOMEM;
9503        }
9504
9505        btrfs_assert_tree_locked(parent);
9506        parent_level = btrfs_header_level(parent);
9507        extent_buffer_get(parent);
9508        path->nodes[parent_level] = parent;
9509        path->slots[parent_level] = btrfs_header_nritems(parent);
9510
9511        btrfs_assert_tree_locked(node);
9512        level = btrfs_header_level(node);
9513        path->nodes[level] = node;
9514        path->slots[level] = 0;
9515        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9516
9517        wc->refs[parent_level] = 1;
9518        wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
9519        wc->level = level;
9520        wc->shared_level = -1;
9521        wc->stage = DROP_REFERENCE;
9522        wc->update_ref = 0;
9523        wc->keep_locks = 1;
9524        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
9525
9526        while (1) {
9527                wret = walk_down_tree(trans, root, path, wc);
9528                if (wret < 0) {
9529                        ret = wret;
9530                        break;
9531                }
9532
9533                wret = walk_up_tree(trans, root, path, wc, parent_level);
9534                if (wret < 0)
9535                        ret = wret;
9536                if (wret != 0)
9537                        break;
9538        }
9539
9540        kfree(wc);
9541        btrfs_free_path(path);
9542        return ret;
9543}
9544
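/*
 * Compute the chunk flags to use for a replacement chunk when a block group
 * is being marked read-only (see btrfs_inc_block_group_ro() below).  Roughly:
 * an ongoing restripe target wins; otherwise, with a single rw device the
 * mirrored profiles degrade to DUP and RAID0 degrades to single, while with
 * multiple devices DUP is promoted to RAID1.  For example, a RAID1 block
 * group on a filesystem that is down to one writable device would come back
 * as DUP.
 */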
9545static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
9546{
9547        u64 num_devices;
9548        u64 stripped;
9549
9550        /*
9551         * If restripe for this chunk_type is on, pick the target profile
9552         * and return; otherwise do the usual balance.
9553         */
9554        stripped = get_restripe_target(fs_info, flags);
9555        if (stripped)
9556                return extended_to_chunk(stripped);
9557
9558        num_devices = fs_info->fs_devices->rw_devices;
9559
9560        stripped = BTRFS_BLOCK_GROUP_RAID0 |
9561                BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
9562                BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
9563
9564        if (num_devices == 1) {
9565                stripped |= BTRFS_BLOCK_GROUP_DUP;
9566                stripped = flags & ~stripped;
9567
9568                /* turn raid0 into single device chunks */
9569                if (flags & BTRFS_BLOCK_GROUP_RAID0)
9570                        return stripped;
9571
9572                /* turn mirroring into duplication */
9573                if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
9574                             BTRFS_BLOCK_GROUP_RAID10))
9575                        return stripped | BTRFS_BLOCK_GROUP_DUP;
9576        } else {
9577                /* they already had raid on here, just return */
9578                if (flags & stripped)
9579                        return flags;
9580
9581                stripped |= BTRFS_BLOCK_GROUP_DUP;
9582                stripped = flags & ~stripped;
9583
9584                /* switch duplicated blocks with raid1 */
9585                if (flags & BTRFS_BLOCK_GROUP_DUP)
9586                        return stripped | BTRFS_BLOCK_GROUP_RAID1;
9587
9588                /* this is drive concat, leave it alone */
9589        }
9590
9591        return flags;
9592}
9593
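/*
 * Take (or bump) the read-only count on a block group.  A group that is not
 * yet read-only is only switched when the unused part of the group fits in
 * the rest of its space_info, keeping a small cushion free for
 * metadata/system chunk allocation unless @force is set; otherwise -ENOSPC
 * is returned and the group stays writable.
 */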
9594static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
9595{
9596        struct btrfs_space_info *sinfo = cache->space_info;
9597        u64 num_bytes;
9598        u64 min_allocable_bytes;
9599        int ret = -ENOSPC;
9600
9601        /*
9602         * We need some metadata and system space left over for allocating
9603         * chunks in some corner cases, so require a small minimum unless
9604         * we are forced to mark the block group read-only.
9605         */
9606        if ((sinfo->flags &
9607             (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
9608            !force)
9609                min_allocable_bytes = SZ_1M;
9610        else
9611                min_allocable_bytes = 0;
9612
9613        spin_lock(&sinfo->lock);
9614        spin_lock(&cache->lock);
9615
9616        if (cache->ro) {
9617                cache->ro++;
9618                ret = 0;
9619                goto out;
9620        }
9621
9622        num_bytes = cache->key.offset - cache->reserved - cache->pinned -
9623                    cache->bytes_super - btrfs_block_group_used(&cache->item);
9624
9625        if (btrfs_space_info_used(sinfo, true) + num_bytes +
9626            min_allocable_bytes <= sinfo->total_bytes) {
9627                sinfo->bytes_readonly += num_bytes;
9628                cache->ro++;
9629                list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9630                ret = 0;
9631        }
9632out:
9633        spin_unlock(&cache->lock);
9634        spin_unlock(&sinfo->lock);
9635        return ret;
9636}
9637
9638int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
9640{
9641        struct btrfs_fs_info *fs_info = cache->fs_info;
9642        struct btrfs_trans_handle *trans;
9643        u64 alloc_flags;
9644        int ret;
9645
9646again:
9647        trans = btrfs_join_transaction(fs_info->extent_root);
9648        if (IS_ERR(trans))
9649                return PTR_ERR(trans);
9650
9651        /*
9652         * We're not allowed to set block groups read-only after the dirty
9653         * block group cache has started writing.  If it already started,
9654         * back off and let this transaction commit.
9655         */
9656        mutex_lock(&fs_info->ro_block_group_mutex);
9657        if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
9658                u64 transid = trans->transid;
9659
9660                mutex_unlock(&fs_info->ro_block_group_mutex);
9661                btrfs_end_transaction(trans);
9662
9663                ret = btrfs_wait_for_commit(fs_info, transid);
9664                if (ret)
9665                        return ret;
9666                goto again;
9667        }
9668
9669        /*
9670         * if we are changing raid levels, try to allocate a corresponding
9671         * block group with the new raid level.
9672         */
9673        alloc_flags = update_block_group_flags(fs_info, cache->flags);
9674        if (alloc_flags != cache->flags) {
9675                ret = do_chunk_alloc(trans, alloc_flags,
9676                                     CHUNK_ALLOC_FORCE);
9677                /*
9678                 * ENOSPC is allowed here, we may have enough space
9679                 * already allocated at the new raid level to
9680                 * carry on
9681                 */
9682                if (ret == -ENOSPC)
9683                        ret = 0;
9684                if (ret < 0)
9685                        goto out;
9686        }
9687
9688        ret = inc_block_group_ro(cache, 0);
9689        if (!ret)
9690                goto out;
9691        alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
9692        ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9693        if (ret < 0)
9694                goto out;
9695        ret = inc_block_group_ro(cache, 0);
9696out:
9697        if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
9698                alloc_flags = update_block_group_flags(fs_info, cache->flags);
9699                mutex_lock(&fs_info->chunk_mutex);
9700                check_system_chunk(trans, alloc_flags);
9701                mutex_unlock(&fs_info->chunk_mutex);
9702        }
9703        mutex_unlock(&fs_info->ro_block_group_mutex);
9704
9705        btrfs_end_transaction(trans);
9706        return ret;
9707}
9708
9709int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
9710{
9711        u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
9712
9713        return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
9714}
9715
9716/*
9717 * Helper to account the unused space of all the read-only block groups in
9718 * the space_info; takes mirrors into account.
9719 */
9720u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
9721{
9722        struct btrfs_block_group_cache *block_group;
9723        u64 free_bytes = 0;
9724        int factor;
9725
9726        /* It's df, we don't care if it's racy */
9727        if (list_empty(&sinfo->ro_bgs))
9728                return 0;
9729
9730        spin_lock(&sinfo->lock);
9731        list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
9732                spin_lock(&block_group->lock);
9733
9734                if (!block_group->ro) {
9735                        spin_unlock(&block_group->lock);
9736                        continue;
9737                }
9738
9739                factor = btrfs_bg_type_to_factor(block_group->flags);
9740                free_bytes += (block_group->key.offset -
9741                               btrfs_block_group_used(&block_group->item)) *
9742                               factor;
9743
9744                spin_unlock(&block_group->lock);
9745        }
9746        spin_unlock(&sinfo->lock);
9747
9748        return free_bytes;
9749}
9750
9751void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
9752{
9753        struct btrfs_space_info *sinfo = cache->space_info;
9754        u64 num_bytes;
9755
9756        BUG_ON(!cache->ro);
9757
9758        spin_lock(&sinfo->lock);
9759        spin_lock(&cache->lock);
9760        if (!--cache->ro) {
9761                num_bytes = cache->key.offset - cache->reserved -
9762                            cache->pinned - cache->bytes_super -
9763                            btrfs_block_group_used(&cache->item);
9764                sinfo->bytes_readonly -= num_bytes;
9765                list_del_init(&cache->ro_list);
9766        }
9767        spin_unlock(&cache->lock);
9768        spin_unlock(&sinfo->lock);
9769}
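
/*
 * Typical usage of the ro helpers above (illustrative sketch only, modelled
 * on how scrub wraps its per-chunk work; error handling trimmed):
 *
 *      ret = btrfs_inc_block_group_ro(cache);
 *      if (!ret) {
 *              ... move or scrub the data in the block group ...
 *              btrfs_dec_block_group_ro(cache);
 *      }
 */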
9770
9771/*
9772 * Checks to see if it's even possible to relocate this block group.
9773 *
9774 * @return - -1 if it's not a good idea to relocate this block group, 0 if
9775 * it's ok to go ahead and try.
9776 */
9777int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
9778{
9779        struct btrfs_root *root = fs_info->extent_root;
9780        struct btrfs_block_group_cache *block_group;
9781        struct btrfs_space_info *space_info;
9782        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
9783        struct btrfs_device *device;
9784        struct btrfs_trans_handle *trans;
9785        u64 min_free;
9786        u64 dev_min = 1;
9787        u64 dev_nr = 0;
9788        u64 target;
9789        int debug;
9790        int index;
9791        int full = 0;
9792        int ret = 0;
9793
9794        debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
9795
9796        block_group = btrfs_lookup_block_group(fs_info, bytenr);
9797
9798        /* odd, couldn't find the block group, leave it alone */
9799        if (!block_group) {
9800                if (debug)
9801                        btrfs_warn(fs_info,
9802                                   "can't find block group for bytenr %llu",
9803                                   bytenr);
9804                return -1;
9805        }
9806
9807        min_free = btrfs_block_group_used(&block_group->item);
9808
9809        /* no bytes used, we're good */
9810        if (!min_free)
9811                goto out;
9812
9813        space_info = block_group->space_info;
9814        spin_lock(&space_info->lock);
9815
9816        full = space_info->full;
9817
9818        /*
9819         * if this is the last block group we have in this space, we can't
9820         * relocate it unless we're able to allocate a new chunk below.
9821         *
9822         * Otherwise, we need to make sure we have room in the space to handle
9823         * all of the extents from this block group.  If we can, we're good
9824         */
9825        if ((space_info->total_bytes != block_group->key.offset) &&
9826            (btrfs_space_info_used(space_info, false) + min_free <
9827             space_info->total_bytes)) {
9828                spin_unlock(&space_info->lock);
9829                goto out;
9830        }
9831        spin_unlock(&space_info->lock);
9832
9833        /*
9834         * ok we don't have enough space, but maybe we have free space on our
9835         * devices to allocate new chunks for relocation, so loop through our
9836         * alloc devices and guess if we have enough space.  if this block
9837         * group is going to be restriped, run checks against the target
9838         * profile instead of the current one.
9839         */
9840        ret = -1;
9841
9842        /*
9843         * index:
9844         *      0: raid10
9845         *      1: raid1
9846         *      2: dup
9847         *      3: raid0
9848         *      4: single
9849         */
9850        target = get_restripe_target(fs_info, block_group->flags);
9851        if (target) {
9852                index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
9853        } else {
9854                /*
9855                 * this is just a balance, so if we were marked as full
9856                 * we know there is no space for a new chunk
9857                 */
9858                if (full) {
9859                        if (debug)
9860                                btrfs_warn(fs_info,
9861                                           "no space to alloc new chunk for block group %llu",
9862                                           block_group->key.objectid);
9863                        goto out;
9864                }
9865
9866                index = btrfs_bg_flags_to_raid_index(block_group->flags);
9867        }
9868
9869        if (index == BTRFS_RAID_RAID10) {
9870                dev_min = 4;
9871                /* Divide by 2 */
9872                min_free >>= 1;
9873        } else if (index == BTRFS_RAID_RAID1) {
9874                dev_min = 2;
9875        } else if (index == BTRFS_RAID_DUP) {
9876                /* Multiply by 2 */
9877                min_free <<= 1;
9878        } else if (index == BTRFS_RAID_RAID0) {
9879                dev_min = fs_devices->rw_devices;
9880                min_free = div64_u64(min_free, dev_min);
9881        }
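        /*
         * Example with illustrative numbers: a RAID10 group with 1GiB used
         * needs ~512MiB of free device extent space on each of at least 4
         * devices, while a DUP group with 1GiB used needs ~2GiB free on a
         * single device.
         */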
9882
9883        /* We need to do this so that we can look at pending chunks */
9884        trans = btrfs_join_transaction(root);
9885        if (IS_ERR(trans)) {
9886                ret = PTR_ERR(trans);
9887                goto out;
9888        }
9889
9890        mutex_lock(&fs_info->chunk_mutex);
9891        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9892                u64 dev_offset;
9893
9894                /*
9895                 * check to make sure we can actually find a chunk with enough
9896                 * space to fit our block group in.
9897                 */
9898                if (device->total_bytes > device->bytes_used + min_free &&
9899                    !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
9900                        ret = find_free_dev_extent(trans, device, min_free,
9901                                                   &dev_offset, NULL);
9902                        if (!ret)
9903                                dev_nr++;
9904
9905                        if (dev_nr >= dev_min)
9906                                break;
9907
9908                        ret = -1;
9909                }
9910        }
9911        if (debug && ret == -1)
9912                btrfs_warn(fs_info,
9913                           "no space to allocate a new chunk for block group %llu",
9914                           block_group->key.objectid);
9915        mutex_unlock(&fs_info->chunk_mutex);
9916        btrfs_end_transaction(trans);
9917out:
9918        btrfs_put_block_group(block_group);
9919        return ret;
9920}
9921
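/*
 * Find the first BLOCK_GROUP_ITEM at or after @key and cross-check it
 * against the chunk mapping: a matching extent map with the same start,
 * length and type bits must exist, otherwise -ENOENT or -EUCLEAN is
 * returned.  A return value > 0 means there are no more block groups.
 */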
9922static int find_first_block_group(struct btrfs_fs_info *fs_info,
9923                                  struct btrfs_path *path,
9924                                  struct btrfs_key *key)
9925{
9926        struct btrfs_root *root = fs_info->extent_root;
9927        int ret = 0;
9928        struct btrfs_key found_key;
9929        struct extent_buffer *leaf;
9930        struct btrfs_block_group_item bg;
9931        u64 flags;
9932        int slot;
9933
9934        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
9935        if (ret < 0)
9936                goto out;
9937
9938        while (1) {
9939                slot = path->slots[0];
9940                leaf = path->nodes[0];
9941                if (slot >= btrfs_header_nritems(leaf)) {
9942                        ret = btrfs_next_leaf(root, path);
9943                        if (ret == 0)
9944                                continue;
9945                        if (ret < 0)
9946                                goto out;
9947                        break;
9948                }
9949                btrfs_item_key_to_cpu(leaf, &found_key, slot);
9950
9951                if (found_key.objectid >= key->objectid &&
9952                    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
9953                        struct extent_map_tree *em_tree;
9954                        struct extent_map *em;
9955
9956                        em_tree = &root->fs_info->mapping_tree.map_tree;
9957                        read_lock(&em_tree->lock);
9958                        em = lookup_extent_mapping(em_tree, found_key.objectid,
9959                                                   found_key.offset);
9960                        read_unlock(&em_tree->lock);
9961                        if (!em) {
9962                                btrfs_err(fs_info,
9963                        "logical %llu len %llu found bg but no related chunk",
9964                                          found_key.objectid, found_key.offset);
9965                                ret = -ENOENT;
9966                        } else if (em->start != found_key.objectid ||
9967                                   em->len != found_key.offset) {
9968                                btrfs_err(fs_info,
9969                "block group %llu len %llu mismatch with chunk %llu len %llu",
9970                                          found_key.objectid, found_key.offset,
9971                                          em->start, em->len);
9972                                ret = -EUCLEAN;
9973                        } else {
9974                                read_extent_buffer(leaf, &bg,
9975                                        btrfs_item_ptr_offset(leaf, slot),
9976                                        sizeof(bg));
9977                                flags = btrfs_block_group_flags(&bg) &
9978                                        BTRFS_BLOCK_GROUP_TYPE_MASK;
9979
9980                                if (flags != (em->map_lookup->type &
9981                                              BTRFS_BLOCK_GROUP_TYPE_MASK)) {
9982                                        btrfs_err(fs_info,
9983"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
9984                                                found_key.objectid,
9985                                                found_key.offset, flags,
9986                                                (BTRFS_BLOCK_GROUP_TYPE_MASK &
9987                                                 em->map_lookup->type));
9988                                        ret = -EUCLEAN;
9989                                } else {
9990                                        ret = 0;
9991                                }
9992                        }
9993                        free_extent_map(em);
9994                        goto out;
9995                }
9996                path->slots[0]++;
9997        }
9998out:
9999        return ret;
10000}
10001
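/*
 * Release the free space cache inode reference (iref) held by each block
 * group that has one, waiting for the group's caching work to finish before
 * dropping it.
 */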
10002void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
10003{
10004        struct btrfs_block_group_cache *block_group;
10005        u64 last = 0;
10006
10007        while (1) {
10008                struct inode *inode;
10009
10010                block_group = btrfs_lookup_first_block_group(info, last);
10011                while (block_group) {
10012                        wait_block_group_cache_done(block_group);
10013                        spin_lock(&block_group->lock);
10014                        if (block_group->iref)
10015                                break;
10016                        spin_unlock(&block_group->lock);
10017                        block_group = next_block_group(info, block_group);
10018                }
10019                if (!block_group) {
10020                        if (last == 0)
10021                                break;
10022                        last = 0;
10023                        continue;
10024                }
10025
10026                inode = block_group->inode;
10027                block_group->iref = 0;
10028                block_group->inode = NULL;
10029                spin_unlock(&block_group->lock);
10030                ASSERT(block_group->io_ctl.inode == NULL);
10031                iput(inode);
10032                last = block_group->key.objectid + block_group->key.offset;
10033                btrfs_put_block_group(block_group);
10034        }
10035}
10036
10037/*
10038 * Must be called only after stopping all workers, since we could have block
10039 * group caching kthreads running, and therefore they could race with us if we
10040 * freed the block groups before stopping them.
10041 */
10042int btrfs_free_block_groups(struct btrfs_fs_info *info)
10043{
10044        struct btrfs_block_group_cache *block_group;
10045        struct btrfs_space_info *space_info;
10046        struct btrfs_caching_control *caching_ctl;
10047        struct rb_node *n;
10048
10049        down_write(&info->commit_root_sem);
10050        while (!list_empty(&info->caching_block_groups)) {
10051                caching_ctl = list_entry(info->caching_block_groups.next,
10052                                         struct btrfs_caching_control, list);
10053                list_del(&caching_ctl->list);
10054                put_caching_control(caching_ctl);
10055        }
10056        up_write(&info->commit_root_sem);
10057
10058        spin_lock(&info->unused_bgs_lock);
10059        while (!list_empty(&info->unused_bgs)) {
10060                block_group = list_first_entry(&info->unused_bgs,
10061                                               struct btrfs_block_group_cache,
10062                                               bg_list);
10063                list_del_init(&block_group->bg_list);
10064                btrfs_put_block_group(block_group);
10065        }
10066        spin_unlock(&info->unused_bgs_lock);
10067
10068        spin_lock(&info->block_group_cache_lock);
10069        while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
10070                block_group = rb_entry(n, struct btrfs_block_group_cache,
10071                                       cache_node);
10072                rb_erase(&block_group->cache_node,
10073                         &info->block_group_cache_tree);
10074                RB_CLEAR_NODE(&block_group->cache_node);
10075                spin_unlock(&info->block_group_cache_lock);
10076
10077                down_write(&block_group->space_info->groups_sem);
10078                list_del(&block_group->list);
10079                up_write(&block_group->space_info->groups_sem);
10080
10081                /*
10082                 * We haven't cached this block group, which means we could
10083                 * possibly have excluded extents on this block group.
10084                 */
10085                if (block_group->cached == BTRFS_CACHE_NO ||
10086                    block_group->cached == BTRFS_CACHE_ERROR)
10087                        free_excluded_extents(block_group);
10088
10089                btrfs_remove_free_space_cache(block_group);
10090                ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
10091                ASSERT(list_empty(&block_group->dirty_list));
10092                ASSERT(list_empty(&block_group->io_list));
10093                ASSERT(list_empty(&block_group->bg_list));
10094                ASSERT(atomic_read(&block_group->count) == 1);
10095                btrfs_put_block_group(block_group);
10096
10097                spin_lock(&info->block_group_cache_lock);
10098        }
10099        spin_unlock(&info->block_group_cache_lock);
10100
10101        /* now that all the block groups are freed, go through and
10102         * free all the space_info structs.  This is only called during
10103         * the final stages of unmount, and so we know nobody is
10104         * using them.  We call synchronize_rcu() once before we start,
10105         * just to be on the safe side.
10106         */
10107        synchronize_rcu();
10108
10109        release_global_block_rsv(info);
10110
10111        while (!list_empty(&info->space_info)) {
10112                int i;
10113
10114                space_info = list_entry(info->space_info.next,
10115                                        struct btrfs_space_info,
10116                                        list);
10117
10118                /*
10119                 * Do not hide this behind enospc_debug, this is actually
10120                 * important and indicates a real bug if this happens.
10121                 */
10122                if (WARN_ON(space_info->bytes_pinned > 0 ||
10123                            space_info->bytes_reserved > 0 ||
10124                            space_info->bytes_may_use > 0))
10125                        dump_space_info(info, space_info, 0, 0);
10126                list_del(&space_info->list);
10127                for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
10128                        struct kobject *kobj;
10129                        kobj = space_info->block_group_kobjs[i];
10130                        space_info->block_group_kobjs[i] = NULL;
10131                        if (kobj) {
10132                                kobject_del(kobj);
10133                                kobject_put(kobj);
10134                        }
10135                }
10136                kobject_del(&space_info->kobj);
10137                kobject_put(&space_info->kobj);
10138        }
10139        return 0;
10140}
10141
10142/* link_block_group will queue up kobjects to add when we're reclaim-safe */
10143void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
10144{
10145        struct btrfs_space_info *space_info;
10146        struct raid_kobject *rkobj;
10147        LIST_HEAD(list);
10148        int index;
10149        int ret = 0;
10150
10151        spin_lock(&fs_info->pending_raid_kobjs_lock);
10152        list_splice_init(&fs_info->pending_raid_kobjs, &list);
10153        spin_unlock(&fs_info->pending_raid_kobjs_lock);
10154
10155        list_for_each_entry(rkobj, &list, list) {
10156                space_info = __find_space_info(fs_info, rkobj->flags);
10157                index = btrfs_bg_flags_to_raid_index(rkobj->flags);
10158
10159                ret = kobject_add(&rkobj->kobj, &space_info->kobj,
10160                                  "%s", get_raid_name(index));
10161                if (ret) {
10162                        kobject_put(&rkobj->kobj);
10163                        break;
10164                }
10165        }
10166        if (ret)
10167                btrfs_warn(fs_info,
10168                           "failed to add kobject for block cache, ignoring");
10169}
10170
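/*
 * Add a block group to its space_info's list for the group's RAID index.
 * The first group of a given RAID level also queues a raid_kobject on
 * fs_info->pending_raid_kobjs; btrfs_add_raid_kobjects() above turns the
 * queued entries into sysfs objects once it is safe to do so.
 */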
10171static void link_block_group(struct btrfs_block_group_cache *cache)
10172{
10173        struct btrfs_space_info *space_info = cache->space_info;
10174        struct btrfs_fs_info *fs_info = cache->fs_info;
10175        int index = btrfs_bg_flags_to_raid_index(cache->flags);
10176        bool first = false;
10177
10178        down_write(&space_info->groups_sem);
10179        if (list_empty(&space_info->block_groups[index]))
10180                first = true;
10181        list_add_tail(&cache->list, &space_info->block_groups[index]);
10182        up_write(&space_info->groups_sem);
10183
10184        if (first) {
10185                struct raid_kobject *rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
10186                if (!rkobj) {
10187                        btrfs_warn(cache->fs_info,
10188                                "couldn't alloc memory for raid level kobject");
10189                        return;
10190                }
10191                rkobj->flags = cache->flags;
10192                kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
10193
10194                spin_lock(&fs_info->pending_raid_kobjs_lock);
10195                list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
10196                spin_unlock(&fs_info->pending_raid_kobjs_lock);
10197                space_info->block_group_kobjs[index] = &rkobj->kobj;
10198        }
10199}
10200
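/*
 * Allocate and initialise an in-memory block group cache covering
 * [start, start + size).  Only the generic fields are set up here; the
 * callers (btrfs_read_block_groups() and btrfs_make_block_group()) fill in
 * flags, space_info and the on-disk item themselves.
 */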
10201static struct btrfs_block_group_cache *
10202btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
10203                               u64 start, u64 size)
10204{
10205        struct btrfs_block_group_cache *cache;
10206
10207        cache = kzalloc(sizeof(*cache), GFP_NOFS);
10208        if (!cache)
10209                return NULL;
10210
10211        cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
10212                                        GFP_NOFS);
10213        if (!cache->free_space_ctl) {
10214                kfree(cache);
10215                return NULL;
10216        }
10217
10218        cache->key.objectid = start;
10219        cache->key.offset = size;
10220        cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10221
10222        cache->fs_info = fs_info;
10223        cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
10224        set_free_space_tree_thresholds(cache);
10225
10226        atomic_set(&cache->count, 1);
10227        spin_lock_init(&cache->lock);
10228        init_rwsem(&cache->data_rwsem);
10229        INIT_LIST_HEAD(&cache->list);
10230        INIT_LIST_HEAD(&cache->cluster_list);
10231        INIT_LIST_HEAD(&cache->bg_list);
10232        INIT_LIST_HEAD(&cache->ro_list);
10233        INIT_LIST_HEAD(&cache->dirty_list);
10234        INIT_LIST_HEAD(&cache->io_list);
10235        btrfs_init_free_space_ctl(cache);
10236        atomic_set(&cache->trimming, 0);
10237        mutex_init(&cache->free_space_lock);
10238        btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
10239
10240        return cache;
10241}
10242
10243
10244/*
10245 * Iterate all chunks and verify that each of them has the corresponding block
10246 * group
10247 */
10248static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
10249{
10250        struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
10251        struct extent_map *em;
10252        struct btrfs_block_group_cache *bg;
10253        u64 start = 0;
10254        int ret = 0;
10255
10256        while (1) {
10257                read_lock(&map_tree->map_tree.lock);
10258                /*
10259                 * lookup_extent_mapping will return the first extent map
10260                 * intersecting the range, so setting @len to 1 is enough to
10261                 * get the first chunk.
10262                 */
10263                em = lookup_extent_mapping(&map_tree->map_tree, start, 1);
10264                read_unlock(&map_tree->map_tree.lock);
10265                if (!em)
10266                        break;
10267
10268                bg = btrfs_lookup_block_group(fs_info, em->start);
10269                if (!bg) {
10270                        btrfs_err(fs_info,
10271        "chunk start=%llu len=%llu doesn't have corresponding block group",
10272                                     em->start, em->len);
10273                        ret = -EUCLEAN;
10274                        free_extent_map(em);
10275                        break;
10276                }
10277                if (bg->key.objectid != em->start ||
10278                    bg->key.offset != em->len ||
10279                    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
10280                    (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
10281                        btrfs_err(fs_info,
10282"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
10283                                em->start, em->len,
10284                                em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
10285                                bg->key.objectid, bg->key.offset,
10286                                bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
10287                        ret = -EUCLEAN;
10288                        free_extent_map(em);
10289                        btrfs_put_block_group(bg);
10290                        break;
10291                }
10292                start = em->start + em->len;
10293                free_extent_map(em);
10294                btrfs_put_block_group(bg);
10295        }
10296        return ret;
10297}
10298
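/*
 * Build the in-memory block group caches at mount time: walk every
 * BLOCK_GROUP_ITEM, create and link a cache for it, treat completely full or
 * completely empty groups as already cached, force unmirrored groups
 * read-only when mirrored profiles are in use, and finally cross-check the
 * chunk <-> block group mappings.
 */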
10299int btrfs_read_block_groups(struct btrfs_fs_info *info)
10300{
10301        struct btrfs_path *path;
10302        int ret;
10303        struct btrfs_block_group_cache *cache;
10304        struct btrfs_space_info *space_info;
10305        struct btrfs_key key;
10306        struct btrfs_key found_key;
10307        struct extent_buffer *leaf;
10308        int need_clear = 0;
10309        u64 cache_gen;
10310        u64 feature;
10311        int mixed;
10312
10313        feature = btrfs_super_incompat_flags(info->super_copy);
10314        mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
10315
10316        key.objectid = 0;
10317        key.offset = 0;
10318        key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
10319        path = btrfs_alloc_path();
10320        if (!path)
10321                return -ENOMEM;
10322        path->reada = READA_FORWARD;
10323
10324        cache_gen = btrfs_super_cache_generation(info->super_copy);
10325        if (btrfs_test_opt(info, SPACE_CACHE) &&
10326            btrfs_super_generation(info->super_copy) != cache_gen)
10327                need_clear = 1;
10328        if (btrfs_test_opt(info, CLEAR_CACHE))
10329                need_clear = 1;
10330
10331        while (1) {
10332                ret = find_first_block_group(info, path, &key);
10333                if (ret > 0)
10334                        break;
10335                if (ret != 0)
10336                        goto error;
10337
10338                leaf = path->nodes[0];
10339                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
10340
10341                cache = btrfs_create_block_group_cache(info, found_key.objectid,
10342                                                       found_key.offset);
10343                if (!cache) {
10344                        ret = -ENOMEM;
10345                        goto error;
10346                }
10347
10348                if (need_clear) {
10349                        /*
10350                         * When we mount with an old space cache, we need
10351                         * to set BTRFS_DC_CLEAR and set the dirty flag.
10352                         *
10353                         * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
10354                         *    truncate the old free space cache inode and
10355                         *    set up a new one.
10356                         * b) Setting the dirty flag makes sure that we
10357                         *    flush the new space cache info onto disk.
10358                         */
10359                        if (btrfs_test_opt(info, SPACE_CACHE))
10360                                cache->disk_cache_state = BTRFS_DC_CLEAR;
10361                }
10362
10363                read_extent_buffer(leaf, &cache->item,
10364                                   btrfs_item_ptr_offset(leaf, path->slots[0]),
10365                                   sizeof(cache->item));
10366                cache->flags = btrfs_block_group_flags(&cache->item);
10367                if (!mixed &&
10368                    ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
10369                    (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
10370                        btrfs_err(info,
10371"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
10372                                  cache->key.objectid);
10373                        ret = -EINVAL;
10374                        goto error;
10375                }
10376
10377                key.objectid = found_key.objectid + found_key.offset;
10378                btrfs_release_path(path);
10379
10380                /*
10381                 * We need to exclude the super stripes now so that the space
10382                 * info has super bytes accounted for, otherwise we'll think
10383                 * we have more space than we actually do.
10384                 */
10385                ret = exclude_super_stripes(cache);
10386                if (ret) {
10387                        /*
10388                         * We may have excluded something, so call this just in
10389                         * case.
10390                         */
10391                        free_excluded_extents(cache);
10392                        btrfs_put_block_group(cache);
10393                        goto error;
10394                }
10395
10396                /*
10397                 * Check for two cases: either we are full, and therefore
10398                 * don't need to bother with the caching work since we won't
10399                 * find any space; or we are empty, and we can just add all
10400                 * the space in and be done with it.  This saves us _a_lot_ of
10401                 * time, particularly in the full case.
10402                 */
10403                if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10404                        cache->last_byte_to_unpin = (u64)-1;
10405                        cache->cached = BTRFS_CACHE_FINISHED;
10406                        free_excluded_extents(cache);
10407                } else if (btrfs_block_group_used(&cache->item) == 0) {
10408                        cache->last_byte_to_unpin = (u64)-1;
10409                        cache->cached = BTRFS_CACHE_FINISHED;
10410                        add_new_free_space(cache, found_key.objectid,
10411                                           found_key.objectid +
10412                                           found_key.offset);
10413                        free_excluded_extents(cache);
10414                }
10415
10416                ret = btrfs_add_block_group_cache(info, cache);
10417                if (ret) {
10418                        btrfs_remove_free_space_cache(cache);
10419                        btrfs_put_block_group(cache);
10420                        goto error;
10421                }
10422
10423                trace_btrfs_add_block_group(info, cache, 0);
10424                update_space_info(info, cache->flags, found_key.offset,
10425                                  btrfs_block_group_used(&cache->item),
10426                                  cache->bytes_super, &space_info);
10427
10428                cache->space_info = space_info;
10429
10430                link_block_group(cache);
10431
10432                set_avail_alloc_bits(info, cache->flags);
10433                if (btrfs_chunk_readonly(info, cache->key.objectid)) {
10434                        inc_block_group_ro(cache, 1);
10435                } else if (btrfs_block_group_used(&cache->item) == 0) {
10436                        ASSERT(list_empty(&cache->bg_list));
10437                        btrfs_mark_bg_unused(cache);
10438                }
10439        }
10440
10441        list_for_each_entry_rcu(space_info, &info->space_info, list) {
10442                if (!(get_alloc_profile(info, space_info->flags) &
10443                      (BTRFS_BLOCK_GROUP_RAID10 |
10444                       BTRFS_BLOCK_GROUP_RAID1 |
10445                       BTRFS_BLOCK_GROUP_RAID5 |
10446                       BTRFS_BLOCK_GROUP_RAID6 |
10447                       BTRFS_BLOCK_GROUP_DUP)))
10448                        continue;
10449                /*
10450                 * Avoid allocating from un-mirrored block groups if there
10451                 * are mirrored block groups.
10452                 */
10453                list_for_each_entry(cache,
10454                                &space_info->block_groups[BTRFS_RAID_RAID0],
10455                                list)
10456                        inc_block_group_ro(cache, 1);
10457                list_for_each_entry(cache,
10458                                &space_info->block_groups[BTRFS_RAID_SINGLE],
10459                                list)
10460                        inc_block_group_ro(cache, 1);
10461        }
10462
10463        btrfs_add_raid_kobjects(info);
10464        init_global_block_rsv(info);
10465        ret = check_chunk_block_group_mappings(info);
10466error:
10467        btrfs_free_path(path);
10468        return ret;
10469}
10470
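/*
 * Insert the block group items for all block groups created in this
 * transaction (trans->new_bgs) into the extent tree and finish their chunk
 * allocation.  Failures abort the transaction, but the remaining entries are
 * still unlinked from the list so nothing is leaked.
 */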
10471void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
10472{
10473        struct btrfs_fs_info *fs_info = trans->fs_info;
10474        struct btrfs_block_group_cache *block_group;
10475        struct btrfs_root *extent_root = fs_info->extent_root;
10476        struct btrfs_block_group_item item;
10477        struct btrfs_key key;
10478        int ret = 0;
10479
10480        if (!trans->can_flush_pending_bgs)
10481                return;
10482
10483        while (!list_empty(&trans->new_bgs)) {
10484                block_group = list_first_entry(&trans->new_bgs,
10485                                               struct btrfs_block_group_cache,
10486                                               bg_list);
10487                if (ret)
10488                        goto next;
10489
10490                spin_lock(&block_group->lock);
10491                memcpy(&item, &block_group->item, sizeof(item));
10492                memcpy(&key, &block_group->key, sizeof(key));
10493                spin_unlock(&block_group->lock);
10494
10495                ret = btrfs_insert_item(trans, extent_root, &key, &item,
10496                                        sizeof(item));
10497                if (ret)
10498                        btrfs_abort_transaction(trans, ret);
10499                ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
10500                if (ret)
10501                        btrfs_abort_transaction(trans, ret);
10502                add_block_group_free_space(trans, block_group);
10503                /* already aborted the transaction if it failed. */
10504next:
10505                btrfs_delayed_refs_rsv_release(fs_info, 1);
10506                list_del_init(&block_group->bg_list);
10507        }
10508        btrfs_trans_release_chunk_metadata(trans);
10509}
10510
10511int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
10512                           u64 type, u64 chunk_offset, u64 size)
10513{
10514        struct btrfs_fs_info *fs_info = trans->fs_info;
10515        struct btrfs_block_group_cache *cache;
10516        int ret;
10517
10518        btrfs_set_log_full_commit(fs_info, trans);
10519
10520        cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
10521        if (!cache)
10522                return -ENOMEM;
10523
10524        btrfs_set_block_group_used(&cache->item, bytes_used);
10525        btrfs_set_block_group_chunk_objectid(&cache->item,
10526                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
10527        btrfs_set_block_group_flags(&cache->item, type);
10528
10529        cache->flags = type;
10530        cache->last_byte_to_unpin = (u64)-1;
10531        cache->cached = BTRFS_CACHE_FINISHED;
10532        cache->needs_free_space = 1;
10533        ret = exclude_super_stripes(cache);
10534        if (ret) {
10535                /*
10536                 * We may have excluded something, so call this just in
10537                 * case.
10538                 */
10539                free_excluded_extents(cache);
10540                btrfs_put_block_group(cache);
10541                return ret;
10542        }
10543
10544        add_new_free_space(cache, chunk_offset, chunk_offset + size);
10545
10546        free_excluded_extents(cache);
10547
10548#ifdef CONFIG_BTRFS_DEBUG
10549        if (btrfs_should_fragment_free_space(cache)) {
10550                u64 new_bytes_used = size - bytes_used;
10551
10552                bytes_used += new_bytes_used >> 1;
10553                fragment_free_space(cache);
10554        }
10555#endif
10556        /*
10557         * Ensure the corresponding space_info object is created and
10558         * assigned to our block group. We want our bg to be added to the rbtree
10559         * with its ->space_info set.
10560         */
10561        cache->space_info = __find_space_info(fs_info, cache->flags);
10562        ASSERT(cache->space_info);
10563
10564        ret = btrfs_add_block_group_cache(fs_info, cache);
10565        if (ret) {
10566                btrfs_remove_free_space_cache(cache);
10567                btrfs_put_block_group(cache);
10568                return ret;
10569        }
10570
10571        /*
10572         * Now that our block group has its ->space_info set and is inserted in
10573         * the rbtree, update the space info's counters.
10574         */
10575        trace_btrfs_add_block_group(fs_info, cache, 1);
10576        update_space_info(fs_info, cache->flags, size, bytes_used,
10577                                cache->bytes_super, &cache->space_info);
10578        update_global_block_rsv(fs_info);
10579
10580        link_block_group(cache);
10581
10582        list_add_tail(&cache->bg_list, &trans->new_bgs);
10583        trans->delayed_ref_updates++;
10584        btrfs_update_delayed_refs_rsv(trans);
10585
10586        set_avail_alloc_bits(fs_info, type);
10587        return 0;
10588}
10589
10590static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
10591{
10592        u64 extra_flags = chunk_to_extended(flags) &
10593                                BTRFS_EXTENDED_PROFILE_MASK;
10594
10595        write_seqlock(&fs_info->profiles_lock);
10596        if (flags & BTRFS_BLOCK_GROUP_DATA)
10597                fs_info->avail_data_alloc_bits &= ~extra_flags;
10598        if (flags & BTRFS_BLOCK_GROUP_METADATA)
10599                fs_info->avail_metadata_alloc_bits &= ~extra_flags;
10600        if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
10601                fs_info->avail_system_alloc_bits &= ~extra_flags;
10602        write_sequnlock(&fs_info->profiles_lock);
10603}
10604
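/*
 * Remove a block group from the in-memory caches and delete its free space
 * cache inode item.  The group must already be read-only (and is expected to
 * be empty); the BUG_ON()s below enforce that it exists and is read-only.
 */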
10605int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
10606                             u64 group_start, struct extent_map *em)
10607{
10608        struct btrfs_fs_info *fs_info = trans->fs_info;
10609        struct btrfs_root *root = fs_info->extent_root;
10610        struct btrfs_path *path;
10611        struct btrfs_block_group_cache *block_group;
10612        struct btrfs_free_cluster *cluster;
10613        struct btrfs_root *tree_root = fs_info->tree_root;
10614        struct btrfs_key key;
10615        struct inode *inode;
10616        struct kobject *kobj = NULL;
10617        int ret;
10618        int index;
10619        int factor;
10620        struct btrfs_caching_control *caching_ctl = NULL;
10621        bool remove_em;
10622        bool remove_rsv = false;
10623
10624        block_group = btrfs_lookup_block_group(fs_info, group_start);
10625        BUG_ON(!block_group);
10626        BUG_ON(!block_group->ro);
10627
10628        trace_btrfs_remove_block_group(block_group);
10629        /*
10630         * Free the reserved super bytes from this block group before
10631         * removing it.
10632         */
10633        free_excluded_extents(block_group);
10634        btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
10635                                  block_group->key.offset);
10636
10637        memcpy(&key, &block_group->key, sizeof(key));
10638        index = btrfs_bg_flags_to_raid_index(block_group->flags);
10639        factor = btrfs_bg_type_to_factor(block_group->flags);
10640
10641        /* make sure this block group isn't part of an allocation cluster */
10642        cluster = &fs_info->data_alloc_cluster;
10643        spin_lock(&cluster->refill_lock);
10644        btrfs_return_cluster_to_free_space(block_group, cluster);
10645        spin_unlock(&cluster->refill_lock);
10646
10647        /*
10648         * make sure this block group isn't part of a metadata
10649         * allocation cluster
10650         */
10651        cluster = &fs_info->meta_alloc_cluster;
10652        spin_lock(&cluster->refill_lock);
10653        btrfs_return_cluster_to_free_space(block_group, cluster);
10654        spin_unlock(&cluster->refill_lock);
10655
10656        path = btrfs_alloc_path();
10657        if (!path) {
10658                ret = -ENOMEM;
10659                goto out;
10660        }
10661
10662        /*
10663         * get the inode first so any iput calls done for the io_list
10664         * aren't the final iput (no unlinks allowed now)
10665         */
10666        inode = lookup_free_space_inode(fs_info, block_group, path);
10667
10668        mutex_lock(&trans->transaction->cache_write_mutex);
10669        /*
10670         * Make sure our free space cache IO is done before removing the
10671         * free space inode
10672         */
10673        spin_lock(&trans->transaction->dirty_bgs_lock);
10674        if (!list_empty(&block_group->io_list)) {
10675                list_del_init(&block_group->io_list);
10676
10677                WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
10678
10679                spin_unlock(&trans->transaction->dirty_bgs_lock);
10680                btrfs_wait_cache_io(trans, block_group, path);
10681                btrfs_put_block_group(block_group);
10682                spin_lock(&trans->transaction->dirty_bgs_lock);
10683        }
10684
10685        if (!list_empty(&block_group->dirty_list)) {
10686                list_del_init(&block_group->dirty_list);
10687                remove_rsv = true;
10688                btrfs_put_block_group(block_group);
10689        }
10690        spin_unlock(&trans->transaction->dirty_bgs_lock);
10691        mutex_unlock(&trans->transaction->cache_write_mutex);
10692
10693        if (!IS_ERR(inode)) {
10694                ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10695                if (ret) {
10696                        btrfs_add_delayed_iput(inode);
10697                        goto out;
10698                }
10699                clear_nlink(inode);
10700                /* One for the block group's ref */
10701                spin_lock(&block_group->lock);
10702                if (block_group->iref) {
10703                        block_group->iref = 0;
10704                        block_group->inode = NULL;
10705                        spin_unlock(&block_group->lock);
10706                        iput(inode);
10707                } else {
10708                        spin_unlock(&block_group->lock);
10709                }
10710                /* One for our lookup ref */
10711                btrfs_add_delayed_iput(inode);
10712        }
10713
10714        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
10715        key.offset = block_group->key.objectid;
10716        key.type = 0;
10717
10718        ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
10719        if (ret < 0)
10720                goto out;
10721        if (ret > 0)
10722                btrfs_release_path(path);
10723        if (ret == 0) {
10724                ret = btrfs_del_item(trans, tree_root, path);
10725                if (ret)
10726                        goto out;
10727                btrfs_release_path(path);
10728        }
10729
10730        spin_lock(&fs_info->block_group_cache_lock);
10731        rb_erase(&block_group->cache_node,
10732                 &fs_info->block_group_cache_tree);
10733        RB_CLEAR_NODE(&block_group->cache_node);
10734
10735        if (fs_info->first_logical_byte == block_group->key.objectid)
10736                fs_info->first_logical_byte = (u64)-1;
10737        spin_unlock(&fs_info->block_group_cache_lock);
10738
10739        down_write(&block_group->space_info->groups_sem);
10740        /*
10741         * We must use list_del_init so other tasks can check whether the
10742         * block group is still on the list after taking the semaphore.
10743         */
10744        list_del_init(&block_group->list);
10745        if (list_empty(&block_group->space_info->block_groups[index])) {
10746                kobj = block_group->space_info->block_group_kobjs[index];
10747                block_group->space_info->block_group_kobjs[index] = NULL;
10748                clear_avail_alloc_bits(fs_info, block_group->flags);
10749        }
10750        up_write(&block_group->space_info->groups_sem);
10751        if (kobj) {
10752                kobject_del(kobj);
10753                kobject_put(kobj);
10754        }
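             /*
              * If this was the last block group with this raid profile in its
              * space_info, the profile's sysfs kobject was removed above and
              * the matching availability bits were cleared.
              */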
10755
10756        if (block_group->has_caching_ctl)
10757                caching_ctl = get_caching_control(block_group);
10758        if (block_group->cached == BTRFS_CACHE_STARTED)
10759                wait_block_group_cache_done(block_group);
10760        if (block_group->has_caching_ctl) {
10761                down_write(&fs_info->commit_root_sem);
10762                if (!caching_ctl) {
10763                        struct btrfs_caching_control *ctl;
10764
10765                        list_for_each_entry(ctl,
10766                                    &fs_info->caching_block_groups, list)
10767                                if (ctl->block_group == block_group) {
10768                                        caching_ctl = ctl;
10769                                        refcount_inc(&caching_ctl->count);
10770                                        break;
10771                                }
10772                }
10773                if (caching_ctl)
10774                        list_del_init(&caching_ctl->list);
10775                up_write(&fs_info->commit_root_sem);
10776                if (caching_ctl) {
10777                        /* Once for the caching bgs list and once for us. */
10778                        put_caching_control(caching_ctl);
10779                        put_caching_control(caching_ctl);
10780                }
10781        }
10782
10783        spin_lock(&trans->transaction->dirty_bgs_lock);
10784        WARN_ON(!list_empty(&block_group->dirty_list));
10787        WARN_ON(!list_empty(&block_group->io_list));
10790        spin_unlock(&trans->transaction->dirty_bgs_lock);
10791        btrfs_remove_free_space_cache(block_group);
10792
10793        spin_lock(&block_group->space_info->lock);
10794        list_del_init(&block_group->ro_list);
10795
10796        if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
10797                WARN_ON(block_group->space_info->total_bytes
10798                        < block_group->key.offset);
10799                WARN_ON(block_group->space_info->bytes_readonly
10800                        < block_group->key.offset);
10801                WARN_ON(block_group->space_info->disk_total
10802                        < block_group->key.offset * factor);
10803        }
10804        block_group->space_info->total_bytes -= block_group->key.offset;
10805        block_group->space_info->bytes_readonly -= block_group->key.offset;
10806        block_group->space_info->disk_total -= block_group->key.offset * factor;
10807
10808        spin_unlock(&block_group->space_info->lock);
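             /*
              * The space_info no longer accounts for this block group:
              * total_bytes and bytes_readonly shrank by the block group size
              * and disk_total by that size times the profile's factor.
              */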
10809
10810        memcpy(&key, &block_group->key, sizeof(key));
10811
10812        mutex_lock(&fs_info->chunk_mutex);
10813        if (!list_empty(&em->list)) {
10814                /* We're in the transaction->pending_chunks list. */
10815                free_extent_map(em);
10816        }
10817        spin_lock(&block_group->lock);
10818        block_group->removed = 1;
10819        /*
10820         * At this point trimming can't start on this block group, because we
10821         * removed the block group from the tree fs_info->block_group_cache_tree
10822         * so no one can find it anymore and even if someone already got this
10823         * block group before we removed it from the rbtree, they have already
10824         * incremented block_group->trimming - if they didn't, they won't find
10825         * any free space entries because we already removed them all when we
10826         * called btrfs_remove_free_space_cache().
10827         *
10828         * And we must not remove the extent map from the fs_info->mapping_tree
10829         * to prevent the same logical address range and physical device space
10830         * ranges from being reused for a new block group. This is because our
10831         * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
10832         * completely transactionless, so while it is trimming a range the
10833         * currently running transaction might finish and a new one start,
10834         * allowing for new block groups to be created that can reuse the same
10835         * physical device locations unless we take this special care.
10836         *
10837         * There may also be an implicit trim operation if the file system
10838         * is mounted with -odiscard. The same protections must remain
10839         * in place until the extents have been discarded completely when
10840         * the transaction commit has completed.
10841         */
10842        remove_em = (atomic_read(&block_group->trimming) == 0);
10843        /*
10844         * Make sure a trimmer task always sees the em in the pinned_chunks list
10845         * if it sees block_group->removed == 1 (needs to lock block_group->lock
10846         * before checking block_group->removed).
10847         */
10848        if (!remove_em) {
10849                /*
10850                 * Our em might be in trans->transaction->pending_chunks which
10851                 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
10852                 * and so is the fs_info->pinned_chunks list.
10853                 *
10854                 * So at this point we must be holding the chunk_mutex to avoid
10855                 * any races with chunk allocation (more specifically at
10856                 * volumes.c:contains_pending_extent()), to ensure it always
10857                 * sees the em, either in the pending_chunks list or in the
10858                 * pinned_chunks list.
10859                 */
10860                list_move_tail(&em->list, &fs_info->pinned_chunks);
10861        }
10862        spin_unlock(&block_group->lock);
10863
10864        if (remove_em) {
10865                struct extent_map_tree *em_tree;
10866
10867                em_tree = &fs_info->mapping_tree.map_tree;
10868                write_lock(&em_tree->lock);
10869                /*
10870                 * The em might be in the pending_chunks list, so make sure the
10871                 * chunk mutex is locked, since remove_extent_mapping() will
10872                 * delete us from that list.
10873                 */
10874                remove_extent_mapping(em_tree, em);
10875                write_unlock(&em_tree->lock);
10876                /* once for the tree */
10877                free_extent_map(em);
10878        }
10879
10880        mutex_unlock(&fs_info->chunk_mutex);
10881
10882        ret = remove_block_group_free_space(trans, block_group);
10883        if (ret)
10884                goto out;
10885
10886        btrfs_put_block_group(block_group);
10887        btrfs_put_block_group(block_group);
10888
10889        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
10890        if (ret > 0)
10891                ret = -EIO;
10892        if (ret < 0)
10893                goto out;
10894
10895        ret = btrfs_del_item(trans, root, path);
10896out:
10897        if (remove_rsv)
10898                btrfs_delayed_refs_rsv_release(fs_info, 1);
10899        btrfs_free_path(path);
10900        return ret;
10901}
10902
10903struct btrfs_trans_handle *
10904btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
10905                                     const u64 chunk_offset)
10906{
10907        struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
10908        struct extent_map *em;
10909        struct map_lookup *map;
10910        unsigned int num_items;
10911
10912        read_lock(&em_tree->lock);
10913        em = lookup_extent_mapping(em_tree, chunk_offset, 1);
10914        read_unlock(&em_tree->lock);
10915        ASSERT(em && em->start == chunk_offset);
10916
10917        /*
10918         * We need to reserve 3 + N units from the metadata space info in order
10919         * to remove a block group (done at btrfs_remove_chunk() and at
10920         * btrfs_remove_block_group()), which are used for:
10921         *
10922         * 1 unit for adding the free space inode's orphan (located in the tree
10923         * of tree roots).
10924         * 1 unit for deleting the block group item (located in the extent
10925         * tree).
10926         * 1 unit for deleting the free space item (located in tree of tree
10927         * roots).
10928         * N units for deleting N device extent items corresponding to each
10929         * stripe (located in the device tree).
10930         *
10931         * In order to remove a block group we also need to reserve units in the
10932         * system space info in order to update the chunk tree (update one or
10933         * more device items and remove one chunk item), but this is done at
10934         * btrfs_remove_chunk() through a call to check_system_chunk().
10935         */
10936        map = em->map_lookup;
10937        num_items = 3 + map->num_stripes;
10938        free_extent_map(em);
10939
10940        return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
10941                                                           num_items, 1);
10942}
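
     /*
      * Note: the expected call sequence, sketched from btrfs_delete_unused_bgs()
      * below, is roughly:
      *
      *     trans = btrfs_start_trans_remove_block_group(fs_info, chunk_offset);
      *     if (IS_ERR(trans))
      *             return PTR_ERR(trans);
      *     ret = btrfs_remove_chunk(trans, chunk_offset);
      *     btrfs_end_transaction(trans);
      *
      * btrfs_remove_chunk() is what ends up calling btrfs_remove_block_group(),
      * consuming the reservations made here.
      */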
10943
10944/*
10945 * Process the unused_bgs list and remove any that don't have any allocated
10946 * space inside of them.
10947 */
10948void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
10949{
10950        struct btrfs_block_group_cache *block_group;
10951        struct btrfs_space_info *space_info;
10952        struct btrfs_trans_handle *trans;
10953        int ret = 0;
10954
10955        if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
10956                return;
10957
10958        spin_lock(&fs_info->unused_bgs_lock);
10959        while (!list_empty(&fs_info->unused_bgs)) {
10960                u64 start, end;
10961                int trimming;
10962
10963                block_group = list_first_entry(&fs_info->unused_bgs,
10964                                               struct btrfs_block_group_cache,
10965                                               bg_list);
10966                list_del_init(&block_group->bg_list);
10967
10968                space_info = block_group->space_info;
10969
10970                if (ret || btrfs_mixed_space_info(space_info)) {
10971                        btrfs_put_block_group(block_group);
10972                        continue;
10973                }
10974                spin_unlock(&fs_info->unused_bgs_lock);
10975
10976                mutex_lock(&fs_info->delete_unused_bgs_mutex);
10977
10978                /* Don't want to race with allocators so take the groups_sem */
10979                down_write(&space_info->groups_sem);
10980                spin_lock(&block_group->lock);
10981                if (block_group->reserved || block_group->pinned ||
10982                    btrfs_block_group_used(&block_group->item) ||
10983                    block_group->ro ||
10984                    list_is_singular(&block_group->list)) {
10985                        /*
10986                         * We want to bail if we made new allocations or have
10987                         * outstanding allocations in this block group.  We do
10988                         * the ro check in case balance is currently acting on
10989                         * this block group.
10990                         */
10991                        trace_btrfs_skip_unused_block_group(block_group);
10992                        spin_unlock(&block_group->lock);
10993                        up_write(&space_info->groups_sem);
10994                        goto next;
10995                }
10996                spin_unlock(&block_group->lock);
10997
10998                /* We don't want to force the issue, only flip if it's ok. */
10999                ret = inc_block_group_ro(block_group, 0);
11000                up_write(&space_info->groups_sem);
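                     /*
                      * Failing to flip the group read-only is not fatal for
                      * the rest of the list; ret is cleared below so the
                      * check at the top of the loop keeps processing the
                      * remaining block groups.
                      */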
11001                if (ret < 0) {
11002                        ret = 0;
11003                        goto next;
11004                }
11005
11006                /*
11007                 * Want to do this before we do anything else so we can recover
11008                 * properly if we fail to join the transaction.
11009                 */
11010                trans = btrfs_start_trans_remove_block_group(fs_info,
11011                                                     block_group->key.objectid);
11012                if (IS_ERR(trans)) {
11013                        btrfs_dec_block_group_ro(block_group);
11014                        ret = PTR_ERR(trans);
11015                        goto next;
11016                }
11017
11018                /*
11019                 * We could have pending pinned extents for this block group,
11020                 * just delete them, we don't care about them anymore.
11021                 */
11022                start = block_group->key.objectid;
11023                end = start + block_group->key.offset - 1;
11024                /*
11025                 * Hold the unused_bg_unpin_mutex lock to avoid racing with
11026                 * btrfs_finish_extent_commit(). If we are at transaction N,
11027                 * another task might be running finish_extent_commit() for the
11028                 * previous transaction N - 1, and have seen a range belonging
11029                 * to the block group in freed_extents[] before we were able to
11030                 * clear the whole block group range from freed_extents[]. This
11031                 * means that task can look up the block group after we
11032                 * unpinned it from freed_extents[] and removed it, leading to
11033                 * a BUG_ON() at btrfs_unpin_extent_range().
11034                 */
11035                mutex_lock(&fs_info->unused_bg_unpin_mutex);
11036                ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
11037                                  EXTENT_DIRTY);
11038                if (ret) {
11039                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
11040                        btrfs_dec_block_group_ro(block_group);
11041                        goto end_trans;
11042                }
11043                ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
11044                                  EXTENT_DIRTY);
11045                if (ret) {
11046                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
11047                        btrfs_dec_block_group_ro(block_group);
11048                        goto end_trans;
11049                }
11050                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
11051
11052                /* Reset pinned so btrfs_put_block_group doesn't complain */
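                     /*
                      * The group was flipped read-only above, so its pinned
                      * bytes are moved into bytes_readonly rather than simply
                      * dropped from the space_info totals.
                      */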
11053                spin_lock(&space_info->lock);
11054                spin_lock(&block_group->lock);
11055
11056                update_bytes_pinned(space_info, -block_group->pinned);
11057                space_info->bytes_readonly += block_group->pinned;
11058                percpu_counter_add_batch(&space_info->total_bytes_pinned,
11059                                   -block_group->pinned,
11060                                   BTRFS_TOTAL_BYTES_PINNED_BATCH);
11061                block_group->pinned = 0;
11062
11063                spin_unlock(&block_group->lock);
11064                spin_unlock(&space_info->lock);
11065
11066                /* DISCARD can flip during remount */
11067                trimming = btrfs_test_opt(fs_info, DISCARD);
11068
11069                /* Implicit trim during transaction commit. */
11070                if (trimming)
11071                        btrfs_get_block_group_trimming(block_group);
11072
11073                /*
11074                 * btrfs_remove_chunk() will abort the transaction if things go
11075                 * horribly wrong.
11076                 */
11077                ret = btrfs_remove_chunk(trans, block_group->key.objectid);
11078
11079                if (ret) {
11080                        if (trimming)
11081                                btrfs_put_block_group_trimming(block_group);
11082                        goto end_trans;
11083                }
11084
11085                /*
11086                 * If we're not mounted with -odiscard, we can just forget
11087                 * about this block group. Otherwise we'll need to wait
11088                 * until transaction commit to do the actual discard.
11089                 */
11090                if (trimming) {
11091                        spin_lock(&fs_info->unused_bgs_lock);
11092                        /*
11093                         * A concurrent scrub might have added us to the list
11094                         * fs_info->unused_bgs, so use a list_move operation
11095                         * to add the block group to the deleted_bgs list.
11096                         */
11097                        list_move(&block_group->bg_list,
11098                                  &trans->transaction->deleted_bgs);
11099                        spin_unlock(&fs_info->unused_bgs_lock);
11100                        btrfs_get_block_group(block_group);
11101                }
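                     /*
                      * The block group reference taken just above is dropped
                      * once the transaction commit processes the deleted_bgs
                      * list and issues the discards.
                      */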
11102end_trans:
11103                btrfs_end_transaction(trans);
11104next:
11105                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
11106                btrfs_put_block_group(block_group);
11107                spin_lock(&fs_info->unused_bgs_lock);
11108        }
11109        spin_unlock(&fs_info->unused_bgs_lock);
11110}
11111
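     /*
      * Create the initial space_infos: SYSTEM always, plus either one mixed
      * METADATA|DATA space_info or separate METADATA and DATA ones, depending
      * on whether the MIXED_GROUPS incompat feature is set.
      */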
11112int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
11113{
11114        struct btrfs_super_block *disk_super;
11115        u64 features;
11116        u64 flags;
11117        int mixed = 0;
11118        int ret;
11119
11120        disk_super = fs_info->super_copy;
11121        if (!btrfs_super_root(disk_super))
11122                return -EINVAL;
11123
11124        features = btrfs_super_incompat_flags(disk_super);
11125        if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
11126                mixed = 1;
11127
11128        flags = BTRFS_BLOCK_GROUP_SYSTEM;
11129        ret = create_space_info(fs_info, flags);
11130        if (ret)
11131                goto out;
11132
11133        if (mixed) {
11134                flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
11135                ret = create_space_info(fs_info, flags);
11136        } else {
11137                flags = BTRFS_BLOCK_GROUP_METADATA;
11138                ret = create_space_info(fs_info, flags);
11139                if (ret)
11140                        goto out;
11141
11142                flags = BTRFS_BLOCK_GROUP_DATA;
11143                ret = create_space_info(fs_info, flags);
11144        }
11145out:
11146        return ret;
11147}
11148
11149int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
11150                                   u64 start, u64 end)
11151{
11152        return unpin_extent_range(fs_info, start, end, false);
11153}
11154
11155/*
11156 * It used to be that old block groups would be left around forever.
11157 * Iterating over them would be enough to trim unused space.  Since we
11158 * now automatically remove them, we also need to iterate over unallocated
11159 * space.
11160 *
11161 * We don't want a transaction for this since the discard may take a
11162 * substantial amount of time.  We don't require that a transaction be
11163 * running, but we do need to take a running transaction into account
11164 * to ensure that we're not discarding chunks that were released or
11165 * allocated in the current transaction.
11166 *
11167 * Holding the chunks lock will prevent other threads from allocating
11168 * or releasing chunks, but it won't prevent a running transaction
11169 * from committing and releasing the memory that the pending chunks
11170 * list head uses.  For that, we need to take a reference to the
11171 * transaction and hold the commit root sem.  We only need to hold
11172 * it while performing the free space search since we have already
11173 * held back allocations.
11174 */
11175static int btrfs_trim_free_extents(struct btrfs_device *device,
11176                                   u64 minlen, u64 *trimmed)
11177{
11178        u64 start = 0, len = 0;
11179        int ret;
11180
11181        *trimmed = 0;
11182
11183        /* Discard not supported = nothing to do. */
11184        if (!blk_queue_discard(bdev_get_queue(device->bdev)))
11185                return 0;
11186
11187        /* Not writable = nothing to do. */
11188        if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
11189                return 0;
11190
11191        /* No free space = nothing to do. */
11192        if (device->total_bytes <= device->bytes_used)
11193                return 0;
11194
11195        ret = 0;
11196
11197        while (1) {
11198                struct btrfs_fs_info *fs_info = device->fs_info;
11199                struct btrfs_transaction *trans;
11200                u64 bytes;
11201
11202                ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
11203                if (ret)
11204                        break;
11205
11206                ret = down_read_killable(&fs_info->commit_root_sem);
11207                if (ret) {
11208                        mutex_unlock(&fs_info->chunk_mutex);
11209                        break;
11210                }
11211
11212                spin_lock(&fs_info->trans_lock);
11213                trans = fs_info->running_transaction;
11214                if (trans)
11215                        refcount_inc(&trans->use_count);
11216                spin_unlock(&fs_info->trans_lock);
11217
11218                if (!trans)
11219                        up_read(&fs_info->commit_root_sem);
11220
11221                ret = find_free_dev_extent_start(trans, device, minlen, start,
11222                                                 &start, &len);
11223                if (trans) {
11224                        up_read(&fs_info->commit_root_sem);
11225                        btrfs_put_transaction(trans);
11226                }
11227
11228                if (ret) {
11229                        mutex_unlock(&fs_info->chunk_mutex);
11230                        if (ret == -ENOSPC)
11231                                ret = 0;
11232                        break;
11233                }
11234
11235                ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
11236                mutex_unlock(&fs_info->chunk_mutex);
11237
11238                if (ret)
11239                        break;
11240
11241                start += len;
11242                *trimmed += bytes;
11243
11244                if (fatal_signal_pending(current)) {
11245                        ret = -ERESTARTSYS;
11246                        break;
11247                }
11248
11249                cond_resched();
11250        }
11251
11252        return ret;
11253}
11254
11255/*
11256 * Trim the whole filesystem by:
11257 * 1) trimming the free space in each block group
11258 * 2) trimming the unallocated space on each device
11259 *
11260 * This will also continue trimming even if a block group or device encounters
11261 * an error.  The return value will be the last error, or 0 if nothing bad
11262 * happens.
11263 */
11264int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
11265{
11266        struct btrfs_block_group_cache *cache = NULL;
11267        struct btrfs_device *device;
11268        struct list_head *devices;
11269        u64 group_trimmed;
11270        u64 start;
11271        u64 end;
11272        u64 trimmed = 0;
11273        u64 bg_failed = 0;
11274        u64 dev_failed = 0;
11275        int bg_ret = 0;
11276        int dev_ret = 0;
11277        int ret = 0;
11278
11279        cache = btrfs_lookup_first_block_group(fs_info, range->start);
11280        for (; cache; cache = next_block_group(fs_info, cache)) {
11281                if (cache->key.objectid >= (range->start + range->len)) {
11282                        btrfs_put_block_group(cache);
11283                        break;
11284                }
11285
11286                start = max(range->start, cache->key.objectid);
11287                end = min(range->start + range->len,
11288                                cache->key.objectid + cache->key.offset);
11289
11290                if (end - start >= range->minlen) {
11291                        if (!block_group_cache_done(cache)) {
11292                                ret = cache_block_group(cache, 0);
11293                                if (ret) {
11294                                        bg_failed++;
11295                                        bg_ret = ret;
11296                                        continue;
11297                                }
11298                                ret = wait_block_group_cache_done(cache);
11299                                if (ret) {
11300                                        bg_failed++;
11301                                        bg_ret = ret;
11302                                        continue;
11303                                }
11304                        }
11305                        ret = btrfs_trim_block_group(cache,
11306                                                     &group_trimmed,
11307                                                     start,
11308                                                     end,
11309                                                     range->minlen);
11310
11311                        trimmed += group_trimmed;
11312                        if (ret) {
11313                                bg_failed++;
11314                                bg_ret = ret;
11315                                continue;
11316                        }
11317                }
11318        }
11319
11320        if (bg_failed)
11321                btrfs_warn(fs_info,
11322                        "failed to trim %llu block group(s), last error %d",
11323                        bg_failed, bg_ret);
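             /*
              * Now trim the unallocated space on each device.  Unlike the
              * block group loop above, this loop stops at the first device
              * that fails.
              */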
11324        mutex_lock(&fs_info->fs_devices->device_list_mutex);
11325        devices = &fs_info->fs_devices->devices;
11326        list_for_each_entry(device, devices, dev_list) {
11327                ret = btrfs_trim_free_extents(device, range->minlen,
11328                                              &group_trimmed);
11329                if (ret) {
11330                        dev_failed++;
11331                        dev_ret = ret;
11332                        break;
11333                }
11334
11335                trimmed += group_trimmed;
11336        }
11337        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
11338
11339        if (dev_failed)
11340                btrfs_warn(fs_info,
11341                        "failed to trim %llu device(s), last error %d",
11342                        dev_failed, dev_ret);
11343        range->len = trimmed;
11344        if (bg_ret)
11345                return bg_ret;
11346        return dev_ret;
11347}
11348
11349/*
11350 * btrfs_{start,end}_write_no_snapshotting() are similar to
11351 * mnt_{want,drop}_write(): they are used to prevent some tasks from writing
11352 * data into the page cache through nocow before the subvolume is snapshotted
11353 * and then flushing that data to disk only after the snapshot is created, and
11354 * to prevent operations that would make the snapshot inconsistent while it is
11355 * being created (writes followed by expanding truncates, for example).
11356 */
11357void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
11358{
11359        percpu_counter_dec(&root->subv_writers->counter);
11360        cond_wake_up(&root->subv_writers->wait);
11361}
11362
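     /*
      * Returns 1 and takes a writer reference when no snapshot of this
      * subvolume is about to be created, 0 otherwise.  The intended pattern,
      * as an illustrative sketch rather than a specific call site, is:
      *
      *     if (btrfs_start_write_no_snapshotting(root)) {
      *             ... do the nocow write ...
      *             btrfs_end_write_no_snapshotting(root);
      *     }
      */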
11363int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
11364{
11365        if (atomic_read(&root->will_be_snapshotted))
11366                return 0;
11367
11368        percpu_counter_inc(&root->subv_writers->counter);
11369        /*
11370         * Make sure counter is updated before we check for snapshot creation.
11371         */
11372        smp_mb();
11373        if (atomic_read(&root->will_be_snapshotted)) {
11374                btrfs_end_write_no_snapshotting(root);
11375                return 0;
11376        }
11377        return 1;
11378}
11379
11380void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
11381{
11382        while (true) {
11383                int ret;
11384
11385                ret = btrfs_start_write_no_snapshotting(root);
11386                if (ret)
11387                        break;
11388                wait_var_event(&root->will_be_snapshotted,
11389                               !atomic_read(&root->will_be_snapshotted));
11390        }
11391}
11392
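     /*
      * Queue a block group that has become empty on fs_info->unused_bgs.  The
      * reference taken here pairs with the put done by
      * btrfs_delete_unused_bgs() when it processes the list.
      */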
11393void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
11394{
11395        struct btrfs_fs_info *fs_info = bg->fs_info;
11396
11397        spin_lock(&fs_info->unused_bgs_lock);
11398        if (list_empty(&bg->bg_list)) {
11399                btrfs_get_block_group(bg);
11400                trace_btrfs_add_unused_block_group(bg);
11401                list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
11402        }
11403        spin_unlock(&fs_info->unused_bgs_lock);
11404}
11405