linux/fs/btrfs/extent-tree.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include "hash.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"

#undef SCRAMBLE_DELAYED_REFS

/*
 * Control flags for do_chunk_alloc's force field.
 *
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
 * if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try and allocate one
 * if we have very few chunks already allocated.  This is
 * used as part of the clustering code to help make sure
 * we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks.
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one.
 */
enum {
        CHUNK_ALLOC_NO_FORCE = 0,
        CHUNK_ALLOC_LIMITED = 1,
        CHUNK_ALLOC_FORCE = 2,
};
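
/*
 * Illustrative sketch (hypothetical caller, not taken from this file):
 * a path that prefers to reuse existing chunks would start with
 * CHUNK_ALLOC_NO_FORCE and only escalate if allocation keeps failing:
 *
 *        ret = do_chunk_alloc(trans, extent_root, flags,
 *                             CHUNK_ALLOC_NO_FORCE);
 *        if (ret == -ENOSPC)
 *                ret = do_chunk_alloc(trans, extent_root, flags,
 *                                     CHUNK_ALLOC_FORCE);
 *
 * The escalation shown here is a sketch; see the allocator for the real
 * policy.
 */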

/*
 * Control how reservations are dealt with.
 *
 * RESERVE_FREE - freeing a reservation.
 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
 *   ENOSPC accounting
 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
 *   bytes_may_use as the ENOSPC accounting is done elsewhere
 */
enum {
        RESERVE_FREE = 0,
        RESERVE_ALLOC = 1,
        RESERVE_ALLOC_NO_ACCOUNT = 2,
};
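
/*
 * Illustrative pairing (hypothetical, not taken from this file): space is
 * reserved when an extent is handed out and given back on the error path,
 * with the same delalloc flag on both sides:
 *
 *        btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_ALLOC,
 *                                    delalloc);
 *        ...
 *        btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_FREE,
 *                                    delalloc);
 */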

static int update_block_group(struct btrfs_root *root,
                              u64 bytenr, u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 parent,
                                u64 root_objectid, u64 owner_objectid,
                                u64 owner_offset, int refs_to_drop,
                                struct btrfs_delayed_extent_op *extra_op,
                                int no_quota);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
                                    struct extent_buffer *leaf,
                                    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                      struct btrfs_root *root,
                                      u64 parent, u64 root_objectid,
                                      u64 flags, u64 owner, u64 offset,
                                      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
                                     int level, struct btrfs_key *ins,
                                     int no_quota);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 flags,
                          int force);
static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
                            int dump_block_groups);
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
                                       u64 num_bytes, int reserve,
                                       int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
                               u64 num_bytes);
int btrfs_pin_extent(struct btrfs_root *root,
                     u64 bytenr, u64 num_bytes, int reserved);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
        smp_mb();
        return cache->cached == BTRFS_CACHE_FINISHED ||
                cache->cached == BTRFS_CACHE_ERROR;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
        return (cache->flags & bits) == bits;
}

static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
        atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
        if (atomic_dec_and_test(&cache->count)) {
                WARN_ON(cache->pinned > 0);
                WARN_ON(cache->reserved > 0);
                kfree(cache->free_space_ctl);
                kfree(cache);
        }
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
                                struct btrfs_block_group_cache *block_group)
{
        struct rb_node **p;
        struct rb_node *parent = NULL;
        struct btrfs_block_group_cache *cache;

        spin_lock(&info->block_group_cache_lock);
        p = &info->block_group_cache_tree.rb_node;

        while (*p) {
                parent = *p;
                cache = rb_entry(parent, struct btrfs_block_group_cache,
                                 cache_node);
                if (block_group->key.objectid < cache->key.objectid) {
                        p = &(*p)->rb_left;
                } else if (block_group->key.objectid > cache->key.objectid) {
                        p = &(*p)->rb_right;
                } else {
                        spin_unlock(&info->block_group_cache_lock);
                        return -EEXIST;
                }
        }

        rb_link_node(&block_group->cache_node, parent, p);
        rb_insert_color(&block_group->cache_node,
                        &info->block_group_cache_tree);

        if (info->first_logical_byte > block_group->key.objectid)
                info->first_logical_byte = block_group->key.objectid;

        spin_unlock(&info->block_group_cache_lock);

        return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
                              int contains)
{
        struct btrfs_block_group_cache *cache, *ret = NULL;
        struct rb_node *n;
        u64 end, start;

        spin_lock(&info->block_group_cache_lock);
        n = info->block_group_cache_tree.rb_node;

        while (n) {
                cache = rb_entry(n, struct btrfs_block_group_cache,
                                 cache_node);
                end = cache->key.objectid + cache->key.offset - 1;
                start = cache->key.objectid;

                if (bytenr < start) {
                        if (!contains && (!ret || start < ret->key.objectid))
                                ret = cache;
                        n = n->rb_left;
                } else if (bytenr > start) {
                        if (contains && bytenr <= end) {
                                ret = cache;
                                break;
                        }
                        n = n->rb_right;
                } else {
                        ret = cache;
                        break;
                }
        }
        if (ret) {
                btrfs_get_block_group(ret);
                if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
                        info->first_logical_byte = ret->key.objectid;
        }
        spin_unlock(&info->block_group_cache_lock);

        return ret;
}
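
/*
 * For illustration (numbers invented): with block groups covering
 * [0, 1G) and [2G, 3G), searching for bytenr 512M with contains == 1
 * returns the first group because 512M falls inside it, while searching
 * for bytenr 1536M with contains == 0 returns the group at 2G, the first
 * one starting at or after that byte; with contains == 1 it would return
 * NULL since no group covers 1536M.
 */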

static int add_excluded_extent(struct btrfs_root *root,
                               u64 start, u64 num_bytes)
{
        u64 end = start + num_bytes - 1;
        set_extent_bits(&root->fs_info->freed_extents[0],
                        start, end, EXTENT_UPTODATE, GFP_NOFS);
        set_extent_bits(&root->fs_info->freed_extents[1],
                        start, end, EXTENT_UPTODATE, GFP_NOFS);
        return 0;
}

static void free_excluded_extents(struct btrfs_root *root,
                                  struct btrfs_block_group_cache *cache)
{
        u64 start, end;

        start = cache->key.objectid;
        end = start + cache->key.offset - 1;

        clear_extent_bits(&root->fs_info->freed_extents[0],
                          start, end, EXTENT_UPTODATE, GFP_NOFS);
        clear_extent_bits(&root->fs_info->freed_extents[1],
                          start, end, EXTENT_UPTODATE, GFP_NOFS);
}

static int exclude_super_stripes(struct btrfs_root *root,
                                 struct btrfs_block_group_cache *cache)
{
        u64 bytenr;
        u64 *logical;
        int stripe_len;
        int i, nr, ret;

        if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
                stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
                cache->bytes_super += stripe_len;
                ret = add_excluded_extent(root, cache->key.objectid,
                                          stripe_len);
                if (ret)
                        return ret;
        }

        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                bytenr = btrfs_sb_offset(i);
                ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
                                       cache->key.objectid, bytenr,
                                       0, &logical, &nr, &stripe_len);
                if (ret)
                        return ret;

                while (nr--) {
                        u64 start, len;

                        if (logical[nr] > cache->key.objectid +
                            cache->key.offset)
                                continue;

                        if (logical[nr] + stripe_len <= cache->key.objectid)
                                continue;

                        start = logical[nr];
                        if (start < cache->key.objectid) {
                                start = cache->key.objectid;
                                len = (logical[nr] + stripe_len) - start;
                        } else {
                                len = min_t(u64, stripe_len,
                                            cache->key.objectid +
                                            cache->key.offset - start);
                        }

                        cache->bytes_super += len;
                        ret = add_excluded_extent(root, start, len);
                        if (ret) {
                                kfree(logical);
                                return ret;
                        }
                }

                kfree(logical);
        }
        return 0;
}

static struct btrfs_caching_control *
get_caching_control(struct btrfs_block_group_cache *cache)
{
        struct btrfs_caching_control *ctl;

        spin_lock(&cache->lock);
        if (cache->cached != BTRFS_CACHE_STARTED) {
                spin_unlock(&cache->lock);
                return NULL;
        }

        /* We're loading it the fast way, so we don't have a caching_ctl. */
        if (!cache->caching_ctl) {
                spin_unlock(&cache->lock);
                return NULL;
        }

        ctl = cache->caching_ctl;
        atomic_inc(&ctl->count);
        spin_unlock(&cache->lock);
        return ctl;
}

static void put_caching_control(struct btrfs_caching_control *ctl)
{
        if (atomic_dec_and_test(&ctl->count))
                kfree(ctl);
}

/*
 * This is only called while caching a block group: since we could have freed
 * extents, we need to check the pinned_extents tree for any extents that
 * can't be used yet, because their free space will be released as soon as
 * the transaction commits.
 */
static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
                              struct btrfs_fs_info *info, u64 start, u64 end)
{
        u64 extent_start, extent_end, size, total_added = 0;
        int ret;

        while (start < end) {
                ret = find_first_extent_bit(info->pinned_extents, start,
                                            &extent_start, &extent_end,
                                            EXTENT_DIRTY | EXTENT_UPTODATE,
                                            NULL);
                if (ret)
                        break;

                if (extent_start <= start) {
                        start = extent_end + 1;
                } else if (extent_start > start && extent_start < end) {
                        size = extent_start - start;
                        total_added += size;
                        ret = btrfs_add_free_space(block_group, start,
                                                   size);
                        BUG_ON(ret); /* -ENOMEM or logic error */
                        start = extent_end + 1;
                } else {
                        break;
                }
        }

        if (start < end) {
                size = end - start;
                total_added += size;
                ret = btrfs_add_free_space(block_group, start, size);
                BUG_ON(ret); /* -ENOMEM or logic error */
        }

        return total_added;
}
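
/*
 * Worked example for add_new_free_space() (numbers invented): caching the
 * range [0, 100) while the pinned extent tree holds [10, 20] and [50, 60]
 * adds the gaps [0, 10), [21, 50) and [61, 100) to the free space cache
 * and returns 10 + 29 + 39 = 78; the pinned ranges only become free space
 * once the transaction commits.
 */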

static noinline void caching_thread(struct btrfs_work *work)
{
        struct btrfs_block_group_cache *block_group;
        struct btrfs_fs_info *fs_info;
        struct btrfs_caching_control *caching_ctl;
        struct btrfs_root *extent_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        u64 total_found = 0;
        u64 last = 0;
        u32 nritems;
        int ret = -ENOMEM;

        caching_ctl = container_of(work, struct btrfs_caching_control, work);
        block_group = caching_ctl->block_group;
        fs_info = block_group->fs_info;
        extent_root = fs_info->extent_root;

        path = btrfs_alloc_path();
        if (!path)
                goto out;

        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
         * root to add free space.  So we skip locking and search the commit
         * root, since it's read-only.
         */
        path->skip_locking = 1;
        path->search_commit_root = 1;
        path->reada = 1;

        key.objectid = last;
        key.offset = 0;
        key.type = BTRFS_EXTENT_ITEM_KEY;
again:
        mutex_lock(&caching_ctl->mutex);
        /* need to make sure the commit_root doesn't disappear */
        down_read(&fs_info->commit_root_sem);

next:
        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto err;

        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);

        while (1) {
                if (btrfs_fs_closing(fs_info) > 1) {
                        last = (u64)-1;
                        break;
                }

                if (path->slots[0] < nritems) {
                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                } else {
                        ret = find_next_key(path, 0, &key);
                        if (ret)
                                break;

                        if (need_resched() ||
                            rwsem_is_contended(&fs_info->commit_root_sem)) {
                                caching_ctl->progress = last;
                                btrfs_release_path(path);
                                up_read(&fs_info->commit_root_sem);
                                mutex_unlock(&caching_ctl->mutex);
                                cond_resched();
                                goto again;
                        }

                        ret = btrfs_next_leaf(extent_root, path);
                        if (ret < 0)
                                goto err;
                        if (ret)
                                break;
                        leaf = path->nodes[0];
                        nritems = btrfs_header_nritems(leaf);
                        continue;
                }

                if (key.objectid < last) {
                        key.objectid = last;
                        key.offset = 0;
                        key.type = BTRFS_EXTENT_ITEM_KEY;

                        caching_ctl->progress = last;
                        btrfs_release_path(path);
                        goto next;
                }

                if (key.objectid < block_group->key.objectid) {
                        path->slots[0]++;
                        continue;
                }

                if (key.objectid >= block_group->key.objectid +
                    block_group->key.offset)
                        break;

                if (key.type == BTRFS_EXTENT_ITEM_KEY ||
                    key.type == BTRFS_METADATA_ITEM_KEY) {
                        total_found += add_new_free_space(block_group,
                                                          fs_info, last,
                                                          key.objectid);
                        if (key.type == BTRFS_METADATA_ITEM_KEY)
                                last = key.objectid +
                                        fs_info->tree_root->leafsize;
                        else
                                last = key.objectid + key.offset;

                        if (total_found > (1024 * 1024 * 2)) {
                                total_found = 0;
                                wake_up(&caching_ctl->wait);
                        }
                }
                path->slots[0]++;
        }
        ret = 0;

        total_found += add_new_free_space(block_group, fs_info, last,
                                          block_group->key.objectid +
                                          block_group->key.offset);
        caching_ctl->progress = (u64)-1;

        spin_lock(&block_group->lock);
        block_group->caching_ctl = NULL;
        block_group->cached = BTRFS_CACHE_FINISHED;
        spin_unlock(&block_group->lock);

err:
        btrfs_free_path(path);
        up_read(&fs_info->commit_root_sem);

        free_excluded_extents(extent_root, block_group);

        mutex_unlock(&caching_ctl->mutex);
out:
        if (ret) {
                spin_lock(&block_group->lock);
                block_group->caching_ctl = NULL;
                block_group->cached = BTRFS_CACHE_ERROR;
                spin_unlock(&block_group->lock);
        }
        wake_up(&caching_ctl->wait);

        put_caching_control(caching_ctl);
        btrfs_put_block_group(block_group);
}

static int cache_block_group(struct btrfs_block_group_cache *cache,
                             int load_cache_only)
{
        DEFINE_WAIT(wait);
        struct btrfs_fs_info *fs_info = cache->fs_info;
        struct btrfs_caching_control *caching_ctl;
        int ret = 0;

        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
        if (!caching_ctl)
                return -ENOMEM;

        INIT_LIST_HEAD(&caching_ctl->list);
        mutex_init(&caching_ctl->mutex);
        init_waitqueue_head(&caching_ctl->wait);
        caching_ctl->block_group = cache;
        caching_ctl->progress = cache->key.objectid;
        atomic_set(&caching_ctl->count, 1);
        btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);

        spin_lock(&cache->lock);
        /*
         * This should be a rare occasion, but it can happen when one thread
         * starts to load the space cache info and then some other thread
         * starts a transaction commit which tries to do an allocation while
         * the first thread is still loading the space cache info.  The
         * previous loop should have kept us from choosing this block group,
         * but if we've moved to the state where we will wait on caching
         * block groups we need to first check if we're doing a fast load
         * here, so we can wait for it to finish; otherwise we could end up
         * allocating from a block group whose cache gets evicted for one
         * reason or another.
         */
        while (cache->cached == BTRFS_CACHE_FAST) {
                struct btrfs_caching_control *ctl;

                ctl = cache->caching_ctl;
                atomic_inc(&ctl->count);
                prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
                spin_unlock(&cache->lock);

                schedule();

                finish_wait(&ctl->wait, &wait);
                put_caching_control(ctl);
                spin_lock(&cache->lock);
        }

        if (cache->cached != BTRFS_CACHE_NO) {
                spin_unlock(&cache->lock);
                kfree(caching_ctl);
                return 0;
        }
        WARN_ON(cache->caching_ctl);
        cache->caching_ctl = caching_ctl;
        cache->cached = BTRFS_CACHE_FAST;
        spin_unlock(&cache->lock);

        if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
                ret = load_free_space_cache(fs_info, cache);

                spin_lock(&cache->lock);
                if (ret == 1) {
                        cache->caching_ctl = NULL;
                        cache->cached = BTRFS_CACHE_FINISHED;
                        cache->last_byte_to_unpin = (u64)-1;
                } else {
                        if (load_cache_only) {
                                cache->caching_ctl = NULL;
                                cache->cached = BTRFS_CACHE_NO;
                        } else {
                                cache->cached = BTRFS_CACHE_STARTED;
                        }
                }
                spin_unlock(&cache->lock);
                wake_up(&caching_ctl->wait);
                if (ret == 1) {
                        put_caching_control(caching_ctl);
                        free_excluded_extents(fs_info->extent_root, cache);
                        return 0;
                }
        } else {
                /*
                 * We are not going to do the fast caching, set cached to the
                 * appropriate value and wakeup any waiters.
                 */
                spin_lock(&cache->lock);
                if (load_cache_only) {
                        cache->caching_ctl = NULL;
                        cache->cached = BTRFS_CACHE_NO;
                } else {
                        cache->cached = BTRFS_CACHE_STARTED;
                }
                spin_unlock(&cache->lock);
                wake_up(&caching_ctl->wait);
        }

        if (load_cache_only) {
                put_caching_control(caching_ctl);
                return 0;
        }

        down_write(&fs_info->commit_root_sem);
        atomic_inc(&caching_ctl->count);
        list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
        up_write(&fs_info->commit_root_sem);

        btrfs_get_block_group(cache);

        btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);

        return ret;
}

/*
 * return the block group that starts at or after bytenr
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
        struct btrfs_block_group_cache *cache;

        cache = block_group_cache_tree_search(info, bytenr, 0);

        return cache;
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
                                                 struct btrfs_fs_info *info,
                                                 u64 bytenr)
{
        struct btrfs_block_group_cache *cache;

        cache = block_group_cache_tree_search(info, bytenr, 1);

        return cache;
}

static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
                                                  u64 flags)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags & flags) {
                        rcu_read_unlock();
                        return found;
                }
        }
        rcu_read_unlock();
        return NULL;
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list)
                found->full = 0;
        rcu_read_unlock();
}

/* simple helper to search for an existing extent at a given offset */
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
{
        int ret;
        struct btrfs_key key;
        struct btrfs_path *path;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = start;
        key.offset = len;
        key.type = BTRFS_EXTENT_ITEM_KEY;
        ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
                                0, 0);
        if (ret > 0) {
                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
                if (key.objectid == start &&
                    key.type == BTRFS_METADATA_ITEM_KEY)
                        ret = 0;
        }
        btrfs_free_path(path);
        return ret;
}
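
/*
 * Return convention sketch (hypothetical caller): 0 means the extent item
 * was found (including the skinny metadata form matched above), a positive
 * value means no extent item exists at that offset, and a negative value
 * is an error such as -ENOMEM:
 *
 *        ret = btrfs_lookup_extent(root, bytenr, num_bytes);
 *        if (ret < 0)
 *                return ret;
 *        found = (ret == 0);
 */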

/*
 * helper function to look up the reference count and flags of a tree block.
 *
 * the head node for a delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. the head
 * node may also store the extent flags to set. This way you can check
 * to see what the reference count and extent flags would be once all of
 * the delayed refs are processed.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 bytenr,
                             u64 offset, int metadata, u64 *refs, u64 *flags)
{
        struct btrfs_delayed_ref_head *head;
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_path *path;
        struct btrfs_extent_item *ei;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        u32 item_size;
        u64 num_refs;
        u64 extent_flags;
        int ret;

        /*
         * If we don't have skinny metadata, don't bother doing anything
         * different
         */
        if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
                offset = root->leafsize;
                metadata = 0;
        }

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        if (!trans) {
                path->skip_locking = 1;
                path->search_commit_root = 1;
        }

search_again:
        key.objectid = bytenr;
        key.offset = offset;
        if (metadata)
                key.type = BTRFS_METADATA_ITEM_KEY;
        else
                key.type = BTRFS_EXTENT_ITEM_KEY;

again:
        ret = btrfs_search_slot(trans, root->fs_info->extent_root,
                                &key, path, 0, 0);
        if (ret < 0)
                goto out_free;

        if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
                if (path->slots[0]) {
                        path->slots[0]--;
                        btrfs_item_key_to_cpu(path->nodes[0], &key,
                                              path->slots[0]);
                        if (key.objectid == bytenr &&
                            key.type == BTRFS_EXTENT_ITEM_KEY &&
                            key.offset == root->leafsize)
                                ret = 0;
                }
                if (ret) {
                        key.objectid = bytenr;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
                        key.offset = root->leafsize;
                        btrfs_release_path(path);
                        goto again;
                }
        }

        if (ret == 0) {
                leaf = path->nodes[0];
                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
                if (item_size >= sizeof(*ei)) {
                        ei = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_extent_item);
                        num_refs = btrfs_extent_refs(leaf, ei);
                        extent_flags = btrfs_extent_flags(leaf, ei);
                } else {
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                        struct btrfs_extent_item_v0 *ei0;
                        BUG_ON(item_size != sizeof(*ei0));
                        ei0 = btrfs_item_ptr(leaf, path->slots[0],
                                             struct btrfs_extent_item_v0);
                        num_refs = btrfs_extent_refs_v0(leaf, ei0);
                        /* FIXME: this isn't correct for data */
                        extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
#else
                        BUG();
#endif
                }
                BUG_ON(num_refs == 0);
        } else {
                num_refs = 0;
                extent_flags = 0;
                ret = 0;
        }

        if (!trans)
                goto out;

        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);
        head = btrfs_find_delayed_ref_head(trans, bytenr);
        if (head) {
                if (!mutex_trylock(&head->mutex)) {
                        atomic_inc(&head->node.refs);
                        spin_unlock(&delayed_refs->lock);

                        btrfs_release_path(path);

                        /*
                         * Mutex was contended, block until it's released and
                         * try again
                         */
                        mutex_lock(&head->mutex);
                        mutex_unlock(&head->mutex);
                        btrfs_put_delayed_ref(&head->node);
                        goto search_again;
                }
                spin_lock(&head->lock);
                if (head->extent_op && head->extent_op->update_flags)
                        extent_flags |= head->extent_op->flags_to_set;
                else
                        BUG_ON(num_refs == 0);

                num_refs += head->node.ref_mod;
                spin_unlock(&head->lock);
                mutex_unlock(&head->mutex);
        }
        spin_unlock(&delayed_refs->lock);
out:
        WARN_ON(num_refs == 0);
        if (refs)
                *refs = num_refs;
        if (flags)
                *flags = extent_flags;
out_free:
        btrfs_free_path(path);
        return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. Implicit back refs are optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. Full back refs are for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back refs. Actually the full back ref is generic, and
 * can be used in all cases where the implicit back ref is used. The major
 * shortcoming of the full back ref is its overhead. Every time a tree
 * block gets COWed, we have to update the back refs entry for all pointers
 * in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs are used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs are used for
 * pointers in the block. Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are inherited by the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * the key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used and
 * the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key. The key offset for the implicit back refs is the
 * objectid of the block's owner tree. The key offset for the full back
 * refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required. This information is stored in the
 * tree block info structure.
 */
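
/*
 * Putting the rules above together, a shared data extent might carry keys
 * like these in the extent tree (values invented for illustration):
 *
 *        (bytenr, BTRFS_EXTENT_ITEM_KEY, num_bytes)        extent item
 *        (bytenr, BTRFS_EXTENT_DATA_REF_KEY, hash)         implicit ref,
 *                where hash = hash_extent_data_ref(root, inode, offset)
 *
 * while a tree block that has lost its owner tree's reference would use
 * the full form instead:
 *
 *        (bytenr, BTRFS_SHARED_BLOCK_REF_KEY, parent bytenr)
 */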
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  struct btrfs_path *path,
                                  u64 owner, u32 extra_size)
{
        struct btrfs_extent_item *item;
        struct btrfs_extent_item_v0 *ei0;
        struct btrfs_extent_ref_v0 *ref0;
        struct btrfs_tree_block_info *bi;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        struct btrfs_key found_key;
        u32 new_size = sizeof(*item);
        u64 refs;
        int ret;

        leaf = path->nodes[0];
        BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        ei0 = btrfs_item_ptr(leaf, path->slots[0],
                             struct btrfs_extent_item_v0);
        refs = btrfs_extent_refs_v0(leaf, ei0);

        if (owner == (u64)-1) {
                while (1) {
                        if (path->slots[0] >= btrfs_header_nritems(leaf)) {
                                ret = btrfs_next_leaf(root, path);
                                if (ret < 0)
                                        return ret;
                                BUG_ON(ret > 0); /* Corruption */
                                leaf = path->nodes[0];
                        }
                        btrfs_item_key_to_cpu(leaf, &found_key,
                                              path->slots[0]);
                        BUG_ON(key.objectid != found_key.objectid);
                        if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
                                path->slots[0]++;
                                continue;
                        }
                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
                                              struct btrfs_extent_ref_v0);
                        owner = btrfs_ref_objectid_v0(leaf, ref0);
                        break;
                }
        }
        btrfs_release_path(path);

        if (owner < BTRFS_FIRST_FREE_OBJECTID)
                new_size += sizeof(*bi);

        new_size -= sizeof(*ei0);
        ret = btrfs_search_slot(trans, root, &key, path,
                                new_size + extra_size, 1);
        if (ret < 0)
                return ret;
        BUG_ON(ret); /* Corruption */

        btrfs_extend_item(root, path, new_size);

        leaf = path->nodes[0];
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        btrfs_set_extent_refs(leaf, item, refs);
        /* FIXME: get real generation */
        btrfs_set_extent_generation(leaf, item, 0);
        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
                btrfs_set_extent_flags(leaf, item,
                                       BTRFS_EXTENT_FLAG_TREE_BLOCK |
                                       BTRFS_BLOCK_FLAG_FULL_BACKREF);
                bi = (struct btrfs_tree_block_info *)(item + 1);
                /* FIXME: get first key of the block */
                memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
                btrfs_set_tree_block_level(leaf, bi, (int)owner);
        } else {
                btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
        }
        btrfs_mark_buffer_dirty(leaf);
        return 0;
}
#endif

static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
        u32 high_crc = ~(u32)0;
        u32 low_crc = ~(u32)0;
        __le64 lenum;

        lenum = cpu_to_le64(root_objectid);
        high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(owner);
        low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
        lenum = cpu_to_le64(offset);
        low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));

        return ((u64)high_crc << 31) ^ (u64)low_crc;
}
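
/*
 * Note on the combine step above: the high crc is shifted by 31, not 32,
 * so bit 31 of both halves overlaps in the xor.  Whatever the original
 * intent, these hashes are persisted on disk as EXTENT_DATA_REF key
 * offsets, so the formula cannot be changed without breaking existing
 * filesystems.
 */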
1088
1089static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1090                                     struct btrfs_extent_data_ref *ref)
1091{
1092        return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1093                                    btrfs_extent_data_ref_objectid(leaf, ref),
1094                                    btrfs_extent_data_ref_offset(leaf, ref));
1095}
1096
1097static int match_extent_data_ref(struct extent_buffer *leaf,
1098                                 struct btrfs_extent_data_ref *ref,
1099                                 u64 root_objectid, u64 owner, u64 offset)
1100{
1101        if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1102            btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1103            btrfs_extent_data_ref_offset(leaf, ref) != offset)
1104                return 0;
1105        return 1;
1106}
1107
1108static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1109                                           struct btrfs_root *root,
1110                                           struct btrfs_path *path,
1111                                           u64 bytenr, u64 parent,
1112                                           u64 root_objectid,
1113                                           u64 owner, u64 offset)
1114{
1115        struct btrfs_key key;
1116        struct btrfs_extent_data_ref *ref;
1117        struct extent_buffer *leaf;
1118        u32 nritems;
1119        int ret;
1120        int recow;
1121        int err = -ENOENT;
1122
1123        key.objectid = bytenr;
1124        if (parent) {
1125                key.type = BTRFS_SHARED_DATA_REF_KEY;
1126                key.offset = parent;
1127        } else {
1128                key.type = BTRFS_EXTENT_DATA_REF_KEY;
1129                key.offset = hash_extent_data_ref(root_objectid,
1130                                                  owner, offset);
1131        }
1132again:
1133        recow = 0;
1134        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1135        if (ret < 0) {
1136                err = ret;
1137                goto fail;
1138        }
1139
1140        if (parent) {
1141                if (!ret)
1142                        return 0;
1143#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1144                key.type = BTRFS_EXTENT_REF_V0_KEY;
1145                btrfs_release_path(path);
1146                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1147                if (ret < 0) {
1148                        err = ret;
1149                        goto fail;
1150                }
1151                if (!ret)
1152                        return 0;
1153#endif
1154                goto fail;
1155        }
1156
1157        leaf = path->nodes[0];
1158        nritems = btrfs_header_nritems(leaf);
1159        while (1) {
1160                if (path->slots[0] >= nritems) {
1161                        ret = btrfs_next_leaf(root, path);
1162                        if (ret < 0)
1163                                err = ret;
1164                        if (ret)
1165                                goto fail;
1166
1167                        leaf = path->nodes[0];
1168                        nritems = btrfs_header_nritems(leaf);
1169                        recow = 1;
1170                }
1171
1172                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1173                if (key.objectid != bytenr ||
1174                    key.type != BTRFS_EXTENT_DATA_REF_KEY)
1175                        goto fail;
1176
1177                ref = btrfs_item_ptr(leaf, path->slots[0],
1178                                     struct btrfs_extent_data_ref);
1179
1180                if (match_extent_data_ref(leaf, ref, root_objectid,
1181                                          owner, offset)) {
1182                        if (recow) {
1183                                btrfs_release_path(path);
1184                                goto again;
1185                        }
1186                        err = 0;
1187                        break;
1188                }
1189                path->slots[0]++;
1190        }
1191fail:
1192        return err;
1193}
1194
1195static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1196                                           struct btrfs_root *root,
1197                                           struct btrfs_path *path,
1198                                           u64 bytenr, u64 parent,
1199                                           u64 root_objectid, u64 owner,
1200                                           u64 offset, int refs_to_add)
1201{
1202        struct btrfs_key key;
1203        struct extent_buffer *leaf;
1204        u32 size;
1205        u32 num_refs;
1206        int ret;
1207
1208        key.objectid = bytenr;
1209        if (parent) {
1210                key.type = BTRFS_SHARED_DATA_REF_KEY;
1211                key.offset = parent;
1212                size = sizeof(struct btrfs_shared_data_ref);
1213        } else {
1214                key.type = BTRFS_EXTENT_DATA_REF_KEY;
1215                key.offset = hash_extent_data_ref(root_objectid,
1216                                                  owner, offset);
1217                size = sizeof(struct btrfs_extent_data_ref);
1218        }
1219
1220        ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1221        if (ret && ret != -EEXIST)
1222                goto fail;
1223
1224        leaf = path->nodes[0];
1225        if (parent) {
1226                struct btrfs_shared_data_ref *ref;
1227                ref = btrfs_item_ptr(leaf, path->slots[0],
1228                                     struct btrfs_shared_data_ref);
1229                if (ret == 0) {
1230                        btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1231                } else {
1232                        num_refs = btrfs_shared_data_ref_count(leaf, ref);
1233                        num_refs += refs_to_add;
1234                        btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1235                }
1236        } else {
1237                struct btrfs_extent_data_ref *ref;
1238                while (ret == -EEXIST) {
1239                        ref = btrfs_item_ptr(leaf, path->slots[0],
1240                                             struct btrfs_extent_data_ref);
1241                        if (match_extent_data_ref(leaf, ref, root_objectid,
1242                                                  owner, offset))
1243                                break;
1244                        btrfs_release_path(path);
1245                        key.offset++;
1246                        ret = btrfs_insert_empty_item(trans, root, path, &key,
1247                                                      size);
1248                        if (ret && ret != -EEXIST)
1249                                goto fail;
1250
1251                        leaf = path->nodes[0];
1252                }
1253                ref = btrfs_item_ptr(leaf, path->slots[0],
1254                                     struct btrfs_extent_data_ref);
1255                if (ret == 0) {
1256                        btrfs_set_extent_data_ref_root(leaf, ref,
1257                                                       root_objectid);
1258                        btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1259                        btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1260                        btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1261                } else {
1262                        num_refs = btrfs_extent_data_ref_count(leaf, ref);
1263                        num_refs += refs_to_add;
1264                        btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1265                }
1266        }
1267        btrfs_mark_buffer_dirty(leaf);
1268        ret = 0;
1269fail:
1270        btrfs_release_path(path);
1271        return ret;
1272}
1273
1274static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1275                                           struct btrfs_root *root,
1276                                           struct btrfs_path *path,
1277                                           int refs_to_drop, int *last_ref)
1278{
1279        struct btrfs_key key;
1280        struct btrfs_extent_data_ref *ref1 = NULL;
1281        struct btrfs_shared_data_ref *ref2 = NULL;
1282        struct extent_buffer *leaf;
1283        u32 num_refs = 0;
1284        int ret = 0;
1285
1286        leaf = path->nodes[0];
1287        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1288
1289        if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1290                ref1 = btrfs_item_ptr(leaf, path->slots[0],
1291                                      struct btrfs_extent_data_ref);
1292                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1293        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1294                ref2 = btrfs_item_ptr(leaf, path->slots[0],
1295                                      struct btrfs_shared_data_ref);
1296                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1297#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1298        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1299                struct btrfs_extent_ref_v0 *ref0;
1300                ref0 = btrfs_item_ptr(leaf, path->slots[0],
1301                                      struct btrfs_extent_ref_v0);
1302                num_refs = btrfs_ref_count_v0(leaf, ref0);
1303#endif
1304        } else {
1305                BUG();
1306        }
1307
1308        BUG_ON(num_refs < refs_to_drop);
1309        num_refs -= refs_to_drop;
1310
1311        if (num_refs == 0) {
1312                ret = btrfs_del_item(trans, root, path);
1313                *last_ref = 1;
1314        } else {
1315                if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1316                        btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1317                else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1318                        btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1319#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1320                else {
1321                        struct btrfs_extent_ref_v0 *ref0;
1322                        ref0 = btrfs_item_ptr(leaf, path->slots[0],
1323                                        struct btrfs_extent_ref_v0);
1324                        btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1325                }
1326#endif
1327                btrfs_mark_buffer_dirty(leaf);
1328        }
1329        return ret;
1330}
1331
1332static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1333                                          struct btrfs_path *path,
1334                                          struct btrfs_extent_inline_ref *iref)
1335{
1336        struct btrfs_key key;
1337        struct extent_buffer *leaf;
1338        struct btrfs_extent_data_ref *ref1;
1339        struct btrfs_shared_data_ref *ref2;
1340        u32 num_refs = 0;
1341
1342        leaf = path->nodes[0];
1343        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1344        if (iref) {
1345                if (btrfs_extent_inline_ref_type(leaf, iref) ==
1346                    BTRFS_EXTENT_DATA_REF_KEY) {
1347                        ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1348                        num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1349                } else {
1350                        ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1351                        num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1352                }
1353        } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1354                ref1 = btrfs_item_ptr(leaf, path->slots[0],
1355                                      struct btrfs_extent_data_ref);
1356                num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1357        } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1358                ref2 = btrfs_item_ptr(leaf, path->slots[0],
1359                                      struct btrfs_shared_data_ref);
1360                num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1361#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1362        } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1363                struct btrfs_extent_ref_v0 *ref0;
1364                ref0 = btrfs_item_ptr(leaf, path->slots[0],
1365                                      struct btrfs_extent_ref_v0);
1366                num_refs = btrfs_ref_count_v0(leaf, ref0);
1367#endif
1368        } else {
1369                WARN_ON(1);
1370        }
1371        return num_refs;
1372}
1373
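/*
 * Find the back ref item for a tree block: a shared block ref keyed on
 * the parent block when one is given, otherwise a tree block ref keyed
 * on the owning root.  Returns 0 if found and -ENOENT if not.
 */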
1374static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1375                                          struct btrfs_root *root,
1376                                          struct btrfs_path *path,
1377                                          u64 bytenr, u64 parent,
1378                                          u64 root_objectid)
1379{
1380        struct btrfs_key key;
1381        int ret;
1382
1383        key.objectid = bytenr;
1384        if (parent) {
1385                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1386                key.offset = parent;
1387        } else {
1388                key.type = BTRFS_TREE_BLOCK_REF_KEY;
1389                key.offset = root_objectid;
1390        }
1391
1392        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1393        if (ret > 0)
1394                ret = -ENOENT;
1395#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1396        if (ret == -ENOENT && parent) {
1397                btrfs_release_path(path);
1398                key.type = BTRFS_EXTENT_REF_V0_KEY;
1399                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1400                if (ret > 0)
1401                        ret = -ENOENT;
1402        }
1403#endif
1404        return ret;
1405}
1406
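/*
 * Insert an empty back ref item for a tree block, using the same key
 * layout as lookup_tree_block_ref() above.
 */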
1407static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1408                                          struct btrfs_root *root,
1409                                          struct btrfs_path *path,
1410                                          u64 bytenr, u64 parent,
1411                                          u64 root_objectid)
1412{
1413        struct btrfs_key key;
1414        int ret;
1415
1416        key.objectid = bytenr;
1417        if (parent) {
1418                key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1419                key.offset = parent;
1420        } else {
1421                key.type = BTRFS_TREE_BLOCK_REF_KEY;
1422                key.offset = root_objectid;
1423        }
1424
1425        ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1426        btrfs_release_path(path);
1427        return ret;
1428}
1429
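/*
 * Map (parent, owner) to a back ref key type: owners below
 * BTRFS_FIRST_FREE_OBJECTID are tree blocks, everything else is data,
 * and a non-zero parent selects the shared variant of each.
 */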
1430static inline int extent_ref_type(u64 parent, u64 owner)
1431{
1432        int type;
1433        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1434                if (parent > 0)
1435                        type = BTRFS_SHARED_BLOCK_REF_KEY;
1436                else
1437                        type = BTRFS_TREE_BLOCK_REF_KEY;
1438        } else {
1439                if (parent > 0)
1440                        type = BTRFS_SHARED_DATA_REF_KEY;
1441                else
1442                        type = BTRFS_EXTENT_DATA_REF_KEY;
1443        }
1444        return type;
1445}
1446
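/*
 * Find the key immediately after the current slot by walking up the
 * path.  Returns 0 and fills in *key, or 1 if the path points at the
 * last key of the tree.
 */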
1447static int find_next_key(struct btrfs_path *path, int level,
1448                         struct btrfs_key *key)
1450{
1451        for (; level < BTRFS_MAX_LEVEL; level++) {
1452                if (!path->nodes[level])
1453                        break;
1454                if (path->slots[level] + 1 >=
1455                    btrfs_header_nritems(path->nodes[level]))
1456                        continue;
1457                if (level == 0)
1458                        btrfs_item_key_to_cpu(path->nodes[level], key,
1459                                              path->slots[level] + 1);
1460                else
1461                        btrfs_node_key_to_cpu(path->nodes[level], key,
1462                                              path->slots[level] + 1);
1463                return 0;
1464        }
1465        return 1;
1466}
1467
1468/*
1469 * Look for an inline back ref. If the back ref is found, *ref_ret is set
1470 * to the address of the inline back ref, and 0 is returned.
1471 *
1472 * If the back ref isn't found, *ref_ret is set to the address where it
1473 * should be inserted, and -ENOENT is returned.
1474 *
1475 * If insert is true and there are too many inline back refs, the path
1476 * points to the extent item, and -EAGAIN is returned.
1477 *
1478 * NOTE: inline back refs are ordered in the same way that back ref
1479 *       items in the tree are ordered.
1480 */
1481static noinline_for_stack
1482int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1483                                 struct btrfs_root *root,
1484                                 struct btrfs_path *path,
1485                                 struct btrfs_extent_inline_ref **ref_ret,
1486                                 u64 bytenr, u64 num_bytes,
1487                                 u64 parent, u64 root_objectid,
1488                                 u64 owner, u64 offset, int insert)
1489{
1490        struct btrfs_key key;
1491        struct extent_buffer *leaf;
1492        struct btrfs_extent_item *ei;
1493        struct btrfs_extent_inline_ref *iref;
1494        u64 flags;
1495        u64 item_size;
1496        unsigned long ptr;
1497        unsigned long end;
1498        int extra_size;
1499        int type;
1500        int want;
1501        int ret;
1502        int err = 0;
1503        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
1504                                                 SKINNY_METADATA);
1505
1506        key.objectid = bytenr;
1507        key.type = BTRFS_EXTENT_ITEM_KEY;
1508        key.offset = num_bytes;
1509
1510        want = extent_ref_type(parent, owner);
1511        if (insert) {
1512                extra_size = btrfs_extent_inline_ref_size(want);
1513                path->keep_locks = 1;
1514        } else
1515                extra_size = -1;
1516
1517        /*
1518         * Owner is our parent level, so we can just add one to get the level
1519         * for the block we are interested in.
1520         */
1521        if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1522                key.type = BTRFS_METADATA_ITEM_KEY;
1523                key.offset = owner;
1524        }
1525
1526again:
1527        ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1528        if (ret < 0) {
1529                err = ret;
1530                goto out;
1531        }
1532
1533        /*
1534         * We may be a newly converted file system which still has the old fat
1535         * extent entries for metadata, so check whether we have one of those.
1536         */
1537        if (ret > 0 && skinny_metadata) {
1538                skinny_metadata = false;
1539                if (path->slots[0]) {
1540                        path->slots[0]--;
1541                        btrfs_item_key_to_cpu(path->nodes[0], &key,
1542                                              path->slots[0]);
1543                        if (key.objectid == bytenr &&
1544                            key.type == BTRFS_EXTENT_ITEM_KEY &&
1545                            key.offset == num_bytes)
1546                                ret = 0;
1547                }
1548                if (ret) {
1549                        key.objectid = bytenr;
1550                        key.type = BTRFS_EXTENT_ITEM_KEY;
1551                        key.offset = num_bytes;
1552                        btrfs_release_path(path);
1553                        goto again;
1554                }
1555        }
1556
1557        if (ret && !insert) {
1558                err = -ENOENT;
1559                goto out;
1560        } else if (WARN_ON(ret)) {
1561                err = -EIO;
1562                goto out;
1563        }
1564
1565        leaf = path->nodes[0];
1566        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1567#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1568        if (item_size < sizeof(*ei)) {
1569                if (!insert) {
1570                        err = -ENOENT;
1571                        goto out;
1572                }
1573                ret = convert_extent_item_v0(trans, root, path, owner,
1574                                             extra_size);
1575                if (ret < 0) {
1576                        err = ret;
1577                        goto out;
1578                }
1579                leaf = path->nodes[0];
1580                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1581        }
1582#endif
1583        BUG_ON(item_size < sizeof(*ei));
1584
1585        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1586        flags = btrfs_extent_flags(leaf, ei);
1587
1588        ptr = (unsigned long)(ei + 1);
1589        end = (unsigned long)ei + item_size;
1590
1591        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1592                ptr += sizeof(struct btrfs_tree_block_info);
1593                BUG_ON(ptr > end);
1594        }
1595
1596        err = -ENOENT;
1597        while (1) {
1598                if (ptr >= end) {
1599                        WARN_ON(ptr > end);
1600                        break;
1601                }
1602                iref = (struct btrfs_extent_inline_ref *)ptr;
1603                type = btrfs_extent_inline_ref_type(leaf, iref);
1604                if (want < type)
1605                        break;
1606                if (want > type) {
1607                        ptr += btrfs_extent_inline_ref_size(type);
1608                        continue;
1609                }
1610
1611                if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1612                        struct btrfs_extent_data_ref *dref;
1613                        dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1614                        if (match_extent_data_ref(leaf, dref, root_objectid,
1615                                                  owner, offset)) {
1616                                err = 0;
1617                                break;
1618                        }
1619                        if (hash_extent_data_ref_item(leaf, dref) <
1620                            hash_extent_data_ref(root_objectid, owner, offset))
1621                                break;
1622                } else {
1623                        u64 ref_offset;
1624                        ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1625                        if (parent > 0) {
1626                                if (parent == ref_offset) {
1627                                        err = 0;
1628                                        break;
1629                                }
1630                                if (ref_offset < parent)
1631                                        break;
1632                        } else {
1633                                if (root_objectid == ref_offset) {
1634                                        err = 0;
1635                                        break;
1636                                }
1637                                if (ref_offset < root_objectid)
1638                                        break;
1639                        }
1640                }
1641                ptr += btrfs_extent_inline_ref_size(type);
1642        }
1643        if (err == -ENOENT && insert) {
1644                if (item_size + extra_size >=
1645                    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1646                        err = -EAGAIN;
1647                        goto out;
1648                }
1649                /*
1650                 * To add a new inline back ref, we have to make sure
1651                 * there is no corresponding back ref item.
1652                 * For simplicity, we just do not add a new inline back
1653                 * ref if there is any kind of item for this block.
1654                 */
1655                if (find_next_key(path, 0, &key) == 0 &&
1656                    key.objectid == bytenr &&
1657                    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1658                        err = -EAGAIN;
1659                        goto out;
1660                }
1661        }
1662        *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1663out:
1664        if (insert) {
1665                path->keep_locks = 0;
1666                btrfs_unlock_up_safe(path, 1);
1667        }
1668        return err;
1669}
1670
1671/*
1672 * helper to add new inline back ref
1673 */
1674static noinline_for_stack
1675void setup_inline_extent_backref(struct btrfs_root *root,
1676                                 struct btrfs_path *path,
1677                                 struct btrfs_extent_inline_ref *iref,
1678                                 u64 parent, u64 root_objectid,
1679                                 u64 owner, u64 offset, int refs_to_add,
1680                                 struct btrfs_delayed_extent_op *extent_op)
1681{
1682        struct extent_buffer *leaf;
1683        struct btrfs_extent_item *ei;
1684        unsigned long ptr;
1685        unsigned long end;
1686        unsigned long item_offset;
1687        u64 refs;
1688        int size;
1689        int type;
1690
1691        leaf = path->nodes[0];
1692        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1693        item_offset = (unsigned long)iref - (unsigned long)ei;
1694
1695        type = extent_ref_type(parent, owner);
1696        size = btrfs_extent_inline_ref_size(type);
1697
1698        btrfs_extend_item(root, path, size);
1699
1700        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1701        refs = btrfs_extent_refs(leaf, ei);
1702        refs += refs_to_add;
1703        btrfs_set_extent_refs(leaf, ei, refs);
1704        if (extent_op)
1705                __run_delayed_extent_op(extent_op, leaf, ei);
1706
1707        ptr = (unsigned long)ei + item_offset;
1708        end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1709        if (ptr < end - size)
1710                memmove_extent_buffer(leaf, ptr + size, ptr,
1711                                      end - size - ptr);
1712
1713        iref = (struct btrfs_extent_inline_ref *)ptr;
1714        btrfs_set_extent_inline_ref_type(leaf, iref, type);
1715        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1716                struct btrfs_extent_data_ref *dref;
1717                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1718                btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1719                btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1720                btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1721                btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1722        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1723                struct btrfs_shared_data_ref *sref;
1724                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1725                btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1726                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1727        } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1728                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1729        } else {
1730                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1731        }
1732        btrfs_mark_buffer_dirty(leaf);
1733}
1734
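/*
 * Look for the back ref of an extent, first as an inline ref inside the
 * extent item and then as a standalone back ref item.  On inline hits
 * *ref_ret points at the inline ref; otherwise it is left NULL and the
 * path points at the keyed item.
 */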
1735static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1736                                 struct btrfs_root *root,
1737                                 struct btrfs_path *path,
1738                                 struct btrfs_extent_inline_ref **ref_ret,
1739                                 u64 bytenr, u64 num_bytes, u64 parent,
1740                                 u64 root_objectid, u64 owner, u64 offset)
1741{
1742        int ret;
1743
1744        ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1745                                           bytenr, num_bytes, parent,
1746                                           root_objectid, owner, offset, 0);
1747        if (ret != -ENOENT)
1748                return ret;
1749
1750        btrfs_release_path(path);
1751        *ref_ret = NULL;
1752
1753        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1754                ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1755                                            root_objectid);
1756        } else {
1757                ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1758                                             root_objectid, owner, offset);
1759        }
1760        return ret;
1761}
1762
1763/*
1764 * helper to update/remove an inline back ref
1765 */
1766static noinline_for_stack
1767void update_inline_extent_backref(struct btrfs_root *root,
1768                                  struct btrfs_path *path,
1769                                  struct btrfs_extent_inline_ref *iref,
1770                                  int refs_to_mod,
1771                                  struct btrfs_delayed_extent_op *extent_op,
1772                                  int *last_ref)
1773{
1774        struct extent_buffer *leaf;
1775        struct btrfs_extent_item *ei;
1776        struct btrfs_extent_data_ref *dref = NULL;
1777        struct btrfs_shared_data_ref *sref = NULL;
1778        unsigned long ptr;
1779        unsigned long end;
1780        u32 item_size;
1781        int size;
1782        int type;
1783        u64 refs;
1784
1785        leaf = path->nodes[0];
1786        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1787        refs = btrfs_extent_refs(leaf, ei);
1788        WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1789        refs += refs_to_mod;
1790        btrfs_set_extent_refs(leaf, ei, refs);
1791        if (extent_op)
1792                __run_delayed_extent_op(extent_op, leaf, ei);
1793
1794        type = btrfs_extent_inline_ref_type(leaf, iref);
1795
1796        if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1797                dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1798                refs = btrfs_extent_data_ref_count(leaf, dref);
1799        } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1800                sref = (struct btrfs_shared_data_ref *)(iref + 1);
1801                refs = btrfs_shared_data_ref_count(leaf, sref);
1802        } else {
1803                refs = 1;
1804                BUG_ON(refs_to_mod != -1);
1805        }
1806
1807        BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1808        refs += refs_to_mod;
1809
1810        if (refs > 0) {
1811                if (type == BTRFS_EXTENT_DATA_REF_KEY)
1812                        btrfs_set_extent_data_ref_count(leaf, dref, refs);
1813                else
1814                        btrfs_set_shared_data_ref_count(leaf, sref, refs);
1815        } else {
1816                *last_ref = 1;
1817                size = btrfs_extent_inline_ref_size(type);
1818                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1819                ptr = (unsigned long)iref;
1820                end = (unsigned long)ei + item_size;
1821                if (ptr + size < end)
1822                        memmove_extent_buffer(leaf, ptr, ptr + size,
1823                                              end - ptr - size);
1824                item_size -= size;
1825                btrfs_truncate_item(root, path, item_size, 1);
1826        }
1827        btrfs_mark_buffer_dirty(leaf);
1828}
1829
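/*
 * Add refs_to_add references to an extent, preferring the inline form:
 * update a matching inline ref, insert a new one when there is room, or
 * return -EAGAIN so the caller falls back to a standalone back ref item.
 */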
1830static noinline_for_stack
1831int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1832                                 struct btrfs_root *root,
1833                                 struct btrfs_path *path,
1834                                 u64 bytenr, u64 num_bytes, u64 parent,
1835                                 u64 root_objectid, u64 owner,
1836                                 u64 offset, int refs_to_add,
1837                                 struct btrfs_delayed_extent_op *extent_op)
1838{
1839        struct btrfs_extent_inline_ref *iref;
1840        int ret;
1841
1842        ret = lookup_inline_extent_backref(trans, root, path, &iref,
1843                                           bytenr, num_bytes, parent,
1844                                           root_objectid, owner, offset, 1);
1845        if (ret == 0) {
1846                BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1847                update_inline_extent_backref(root, path, iref,
1848                                             refs_to_add, extent_op, NULL);
1849        } else if (ret == -ENOENT) {
1850                setup_inline_extent_backref(root, path, iref, parent,
1851                                            root_objectid, owner, offset,
1852                                            refs_to_add, extent_op);
1853                ret = 0;
1854        }
1855        return ret;
1856}
1857
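/*
 * Insert a standalone back ref item, dispatching on whether the owner
 * is a tree block or a data extent.
 */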
1858static int insert_extent_backref(struct btrfs_trans_handle *trans,
1859                                 struct btrfs_root *root,
1860                                 struct btrfs_path *path,
1861                                 u64 bytenr, u64 parent, u64 root_objectid,
1862                                 u64 owner, u64 offset, int refs_to_add)
1863{
1864        int ret;
1865        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1866                BUG_ON(refs_to_add != 1);
1867                ret = insert_tree_block_ref(trans, root, path, bytenr,
1868                                            parent, root_objectid);
1869        } else {
1870                ret = insert_extent_data_ref(trans, root, path, bytenr,
1871                                             parent, root_objectid,
1872                                             owner, offset, refs_to_add);
1873        }
1874        return ret;
1875}
1876
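/*
 * Drop refs_to_drop references, from the inline ref when iref is given,
 * otherwise from the standalone back ref item the path points at.
 * *last_ref is set when the ref is removed entirely.
 */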
1877static int remove_extent_backref(struct btrfs_trans_handle *trans,
1878                                 struct btrfs_root *root,
1879                                 struct btrfs_path *path,
1880                                 struct btrfs_extent_inline_ref *iref,
1881                                 int refs_to_drop, int is_data, int *last_ref)
1882{
1883        int ret = 0;
1884
1885        BUG_ON(!is_data && refs_to_drop != 1);
1886        if (iref) {
1887                update_inline_extent_backref(root, path, iref,
1888                                             -refs_to_drop, NULL, last_ref);
1889        } else if (is_data) {
1890                ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
1891                                             last_ref);
1892        } else {
1893                *last_ref = 1;
1894                ret = btrfs_del_item(trans, root, path);
1895        }
1896        return ret;
1897}
1898
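/*
 * blkdev_issue_discard() takes 512-byte sectors, so the byte offsets
 * are converted with a shift by 9.
 */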
1899static int btrfs_issue_discard(struct block_device *bdev,
1900                                u64 start, u64 len)
1901{
1902        return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1903}
1904
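/*
 * Map the logical range onto its physical stripes and discard each
 * stripe whose device supports it.  EOPNOTSUPP from a device is
 * ignored, and *actual_bytes reports how much was really discarded.
 */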
1905static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1906                                u64 num_bytes, u64 *actual_bytes)
1907{
1908        int ret;
1909        u64 discarded_bytes = 0;
1910        struct btrfs_bio *bbio = NULL;
1911
1912
1913        /* Tell the block device(s) that the sectors can be discarded */
1914        ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1915                              bytenr, &num_bytes, &bbio, 0);
1916        /* Error condition is -ENOMEM */
1917        if (!ret) {
1918                struct btrfs_bio_stripe *stripe = bbio->stripes;
1919                int i;
1920
1921
1922                for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1923                        if (!stripe->dev->can_discard)
1924                                continue;
1925
1926                        ret = btrfs_issue_discard(stripe->dev->bdev,
1927                                                  stripe->physical,
1928                                                  stripe->length);
1929                        if (!ret)
1930                                discarded_bytes += stripe->length;
1931                        else if (ret != -EOPNOTSUPP)
1932                                break; /* Logic errors, -ENOMEM or -EIO */
1933
1934                        /*
1935                         * If we get back EOPNOTSUPP for some reason, just
1936                         * ignore the return value so we don't screw up
1937                         * callers of discard_extent.
1938                         */
1939                        ret = 0;
1940                }
1941                kfree(bbio);
1942        }
1943
1944        if (actual_bytes)
1945                *actual_bytes = discarded_bytes;
1946
1948        if (ret == -EOPNOTSUPP)
1949                ret = 0;
1950        return ret;
1951}
1952
1953/* Can return -ENOMEM */
1954int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1955                         struct btrfs_root *root,
1956                         u64 bytenr, u64 num_bytes, u64 parent,
1957                         u64 root_objectid, u64 owner, u64 offset,
1958                         int no_quota)
1959{
1960        int ret;
1961        struct btrfs_fs_info *fs_info = root->fs_info;
1962
1963        BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1964               root_objectid == BTRFS_TREE_LOG_OBJECTID);
1965
1966        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1967                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1968                                        num_bytes,
1969                                        parent, root_objectid, (int)owner,
1970                                        BTRFS_ADD_DELAYED_REF, NULL, no_quota);
1971        } else {
1972                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1973                                        num_bytes,
1974                                        parent, root_objectid, owner, offset,
1975                                        BTRFS_ADD_DELAYED_REF, NULL, no_quota);
1976        }
1977        return ret;
1978}
1979
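/*
 * Do the ref count update for btrfs_inc_extent_ref(): try an inline
 * back ref first and, on -EAGAIN, bump the extent item's ref count and
 * insert a standalone back ref instead, recording qgroup operations
 * along the way unless no_quota is set.
 */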
1980static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1981                                  struct btrfs_root *root,
1982                                  u64 bytenr, u64 num_bytes,
1983                                  u64 parent, u64 root_objectid,
1984                                  u64 owner, u64 offset, int refs_to_add,
1985                                  int no_quota,
1986                                  struct btrfs_delayed_extent_op *extent_op)
1987{
1988        struct btrfs_fs_info *fs_info = root->fs_info;
1989        struct btrfs_path *path;
1990        struct extent_buffer *leaf;
1991        struct btrfs_extent_item *item;
1992        struct btrfs_key key;
1993        u64 refs;
1994        int ret;
1995        enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;
1996
1997        path = btrfs_alloc_path();
1998        if (!path)
1999                return -ENOMEM;
2000
2001        if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
2002                no_quota = 1;
2003
2004        path->reada = 1;
2005        path->leave_spinning = 1;
2006        /* This will set up the path even if it fails to insert the back ref */
2007        ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
2008                                           bytenr, num_bytes, parent,
2009                                           root_objectid, owner, offset,
2010                                           refs_to_add, extent_op);
2011        if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))
2012                goto out;
2013        /*
2014         * Ok, we were able to insert an inline extent ref and it appears to
2015         * be a new reference, so deal with the qgroup accounting.
2016         */
2017        if (!ret && !no_quota) {
2018                ASSERT(root->fs_info->quota_enabled);
2019                leaf = path->nodes[0];
2020                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2021                item = btrfs_item_ptr(leaf, path->slots[0],
2022                                      struct btrfs_extent_item);
2023                if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add)
2024                        type = BTRFS_QGROUP_OPER_ADD_SHARED;
2025                btrfs_release_path(path);
2026
2027                ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
2028                                              bytenr, num_bytes, type, 0);
2029                goto out;
2030        }
2031
2032        /*
2033         * Ok, we had -EAGAIN which means we didn't have space to insert an
2034         * inline extent ref, so just update the reference count and add a
2035         * normal backref.
2036         */
2037        leaf = path->nodes[0];
2038        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2039        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2040        refs = btrfs_extent_refs(leaf, item);
2041        if (refs)
2042                type = BTRFS_QGROUP_OPER_ADD_SHARED;
2043        btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2044        if (extent_op)
2045                __run_delayed_extent_op(extent_op, leaf, item);
2046
2047        btrfs_mark_buffer_dirty(leaf);
2048        btrfs_release_path(path);
2049
2050        if (!no_quota) {
2051                ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
2052                                              bytenr, num_bytes, type, 0);
2053                if (ret)
2054                        goto out;
2055        }
2056
2057        path->reada = 1;
2058        path->leave_spinning = 1;
2059        /* now insert the actual backref */
2060        ret = insert_extent_backref(trans, root->fs_info->extent_root,
2061                                    path, bytenr, parent, root_objectid,
2062                                    owner, offset, refs_to_add);
2063        if (ret)
2064                btrfs_abort_transaction(trans, root, ret);
2065out:
2066        btrfs_free_path(path);
2067        return ret;
2068}
2069
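/*
 * Apply one delayed ref to a data extent: insert the reserved extent
 * item for a brand new allocation, or add/drop back refs on an existing
 * one, depending on node->action.
 */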
2070static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2071                                struct btrfs_root *root,
2072                                struct btrfs_delayed_ref_node *node,
2073                                struct btrfs_delayed_extent_op *extent_op,
2074                                int insert_reserved)
2075{
2076        int ret = 0;
2077        struct btrfs_delayed_data_ref *ref;
2078        struct btrfs_key ins;
2079        u64 parent = 0;
2080        u64 ref_root = 0;
2081        u64 flags = 0;
2082
2083        ins.objectid = node->bytenr;
2084        ins.offset = node->num_bytes;
2085        ins.type = BTRFS_EXTENT_ITEM_KEY;
2086
2087        ref = btrfs_delayed_node_to_data_ref(node);
2088        trace_run_delayed_data_ref(node, ref, node->action);
2089
2090        if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2091                parent = ref->parent;
2092        ref_root = ref->root;
2093
2094        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2095                if (extent_op)
2096                        flags |= extent_op->flags_to_set;
2097                ret = alloc_reserved_file_extent(trans, root,
2098                                                 parent, ref_root, flags,
2099                                                 ref->objectid, ref->offset,
2100                                                 &ins, node->ref_mod);
2101        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2102                ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2103                                             node->num_bytes, parent,
2104                                             ref_root, ref->objectid,
2105                                             ref->offset, node->ref_mod,
2106                                             node->no_quota, extent_op);
2107        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2108                ret = __btrfs_free_extent(trans, root, node->bytenr,
2109                                          node->num_bytes, parent,
2110                                          ref_root, ref->objectid,
2111                                          ref->offset, node->ref_mod,
2112                                          extent_op, node->no_quota);
2113        } else {
2114                BUG();
2115        }
2116        return ret;
2117}
2118
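/*
 * Fold a pending extent op into the extent item: set the requested
 * flags and, for tree blocks, store the key recorded in the op.
 */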
2119static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2120                                    struct extent_buffer *leaf,
2121                                    struct btrfs_extent_item *ei)
2122{
2123        u64 flags = btrfs_extent_flags(leaf, ei);
2124        if (extent_op->update_flags) {
2125                flags |= extent_op->flags_to_set;
2126                btrfs_set_extent_flags(leaf, ei, flags);
2127        }
2128
2129        if (extent_op->update_key) {
2130                struct btrfs_tree_block_info *bi;
2131                BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2132                bi = (struct btrfs_tree_block_info *)(ei + 1);
2133                btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2134        }
2135}
2136
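/*
 * Find the extent item a delayed extent op applies to, trying the
 * skinny metadata key first and falling back to the old fat key, then
 * apply the op with __run_delayed_extent_op().
 */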
2137static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2138                                 struct btrfs_root *root,
2139                                 struct btrfs_delayed_ref_node *node,
2140                                 struct btrfs_delayed_extent_op *extent_op)
2141{
2142        struct btrfs_key key;
2143        struct btrfs_path *path;
2144        struct btrfs_extent_item *ei;
2145        struct extent_buffer *leaf;
2146        u32 item_size;
2147        int ret;
2148        int err = 0;
2149        int metadata = !extent_op->is_data;
2150
2151        if (trans->aborted)
2152                return 0;
2153
2154        if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2155                metadata = 0;
2156
2157        path = btrfs_alloc_path();
2158        if (!path)
2159                return -ENOMEM;
2160
2161        key.objectid = node->bytenr;
2162
2163        if (metadata) {
2164                key.type = BTRFS_METADATA_ITEM_KEY;
2165                key.offset = extent_op->level;
2166        } else {
2167                key.type = BTRFS_EXTENT_ITEM_KEY;
2168                key.offset = node->num_bytes;
2169        }
2170
2171again:
2172        path->reada = 1;
2173        path->leave_spinning = 1;
2174        ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2175                                path, 0, 1);
2176        if (ret < 0) {
2177                err = ret;
2178                goto out;
2179        }
2180        if (ret > 0) {
2181                if (metadata) {
2182                        if (path->slots[0] > 0) {
2183                                path->slots[0]--;
2184                                btrfs_item_key_to_cpu(path->nodes[0], &key,
2185                                                      path->slots[0]);
2186                                if (key.objectid == node->bytenr &&
2187                                    key.type == BTRFS_EXTENT_ITEM_KEY &&
2188                                    key.offset == node->num_bytes)
2189                                        ret = 0;
2190                        }
2191                        if (ret > 0) {
2192                                btrfs_release_path(path);
2193                                metadata = 0;
2194
2195                                key.objectid = node->bytenr;
2196                                key.offset = node->num_bytes;
2197                                key.type = BTRFS_EXTENT_ITEM_KEY;
2198                                goto again;
2199                        }
2200                } else {
2201                        err = -EIO;
2202                        goto out;
2203                }
2204        }
2205
2206        leaf = path->nodes[0];
2207        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2208#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2209        if (item_size < sizeof(*ei)) {
2210                ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2211                                             path, (u64)-1, 0);
2212                if (ret < 0) {
2213                        err = ret;
2214                        goto out;
2215                }
2216                leaf = path->nodes[0];
2217                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2218        }
2219#endif
2220        BUG_ON(item_size < sizeof(*ei));
2221        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2222        __run_delayed_extent_op(extent_op, leaf, ei);
2223
2224        btrfs_mark_buffer_dirty(leaf);
2225out:
2226        btrfs_free_path(path);
2227        return err;
2228}
2229
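/*
 * Tree block counterpart of run_delayed_data_ref(): allocate the
 * reserved block, add a back ref, or free the extent, depending on
 * node->action.
 */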
2230static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2231                                struct btrfs_root *root,
2232                                struct btrfs_delayed_ref_node *node,
2233                                struct btrfs_delayed_extent_op *extent_op,
2234                                int insert_reserved)
2235{
2236        int ret = 0;
2237        struct btrfs_delayed_tree_ref *ref;
2238        struct btrfs_key ins;
2239        u64 parent = 0;
2240        u64 ref_root = 0;
2241        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
2242                                                 SKINNY_METADATA);
2243
2244        ref = btrfs_delayed_node_to_tree_ref(node);
2245        trace_run_delayed_tree_ref(node, ref, node->action);
2246
2247        if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2248                parent = ref->parent;
2249        ref_root = ref->root;
2250
2251        ins.objectid = node->bytenr;
2252        if (skinny_metadata) {
2253                ins.offset = ref->level;
2254                ins.type = BTRFS_METADATA_ITEM_KEY;
2255        } else {
2256                ins.offset = node->num_bytes;
2257                ins.type = BTRFS_EXTENT_ITEM_KEY;
2258        }
2259
2260        BUG_ON(node->ref_mod != 1);
2261        if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2262                BUG_ON(!extent_op || !extent_op->update_flags);
2263                ret = alloc_reserved_tree_block(trans, root,
2264                                                parent, ref_root,
2265                                                extent_op->flags_to_set,
2266                                                &extent_op->key,
2267                                                ref->level, &ins,
2268                                                node->no_quota);
2269        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2270                ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2271                                             node->num_bytes, parent, ref_root,
2272                                             ref->level, 0, 1, node->no_quota,
2273                                             extent_op);
2274        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2275                ret = __btrfs_free_extent(trans, root, node->bytenr,
2276                                          node->num_bytes, parent, ref_root,
2277                                          ref->level, 0, 1, extent_op,
2278                                          node->no_quota);
2279        } else {
2280                BUG();
2281        }
2282        return ret;
2283}
2284
2285/* helper function to actually process a single delayed ref entry */
2286static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2287                               struct btrfs_root *root,
2288                               struct btrfs_delayed_ref_node *node,
2289                               struct btrfs_delayed_extent_op *extent_op,
2290                               int insert_reserved)
2291{
2292        int ret = 0;
2293
2294        if (trans->aborted) {
2295                if (insert_reserved)
2296                        btrfs_pin_extent(root, node->bytenr,
2297                                         node->num_bytes, 1);
2298                return 0;
2299        }
2300
2301        if (btrfs_delayed_ref_is_head(node)) {
2302                struct btrfs_delayed_ref_head *head;
2303                /*
2304                 * We've hit the end of the chain and we were supposed
2305                 * to insert this extent into the tree.  But it got
2306                 * deleted before we ever needed to insert it, so all
2307                 * we have to do is clean up the accounting.
2308                 */
2309                BUG_ON(extent_op);
2310                head = btrfs_delayed_node_to_head(node);
2311                trace_run_delayed_ref_head(node, head, node->action);
2312
2313                if (insert_reserved) {
2314                        btrfs_pin_extent(root, node->bytenr,
2315                                         node->num_bytes, 1);
2316                        if (head->is_data) {
2317                                ret = btrfs_del_csums(trans, root,
2318                                                      node->bytenr,
2319                                                      node->num_bytes);
2320                        }
2321                }
2322                return ret;
2323        }
2324
2325        if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2326            node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2327                ret = run_delayed_tree_ref(trans, root, node, extent_op,
2328                                           insert_reserved);
2329        else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2330                 node->type == BTRFS_SHARED_DATA_REF_KEY)
2331                ret = run_delayed_data_ref(trans, root, node, extent_op,
2332                                           insert_reserved);
2333        else
2334                BUG();
2335        return ret;
2336}
2337
2338static noinline struct btrfs_delayed_ref_node *
2339select_delayed_ref(struct btrfs_delayed_ref_head *head)
2340{
2341        struct rb_node *node;
2342        struct btrfs_delayed_ref_node *ref, *last = NULL;
2343
2344        /*
2345         * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2346         * This prevents the ref count from going down to zero while
2347         * there are still pending delayed refs.
2348         */
2349        node = rb_first(&head->ref_root);
2350        while (node) {
2351                ref = rb_entry(node, struct btrfs_delayed_ref_node,
2352                                rb_node);
2353                if (ref->action == BTRFS_ADD_DELAYED_REF)
2354                        return ref;
2355                else if (last == NULL)
2356                        last = ref;
2357                node = rb_next(node);
2358        }
2359        return last;
2360}
2361
2362/*
2363 * Returns 0 on success or if called with an already aborted transaction.
2364 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2365 */
2366static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2367                                             struct btrfs_root *root,
2368                                             unsigned long nr)
2369{
2370        struct btrfs_delayed_ref_root *delayed_refs;
2371        struct btrfs_delayed_ref_node *ref;
2372        struct btrfs_delayed_ref_head *locked_ref = NULL;
2373        struct btrfs_delayed_extent_op *extent_op;
2374        struct btrfs_fs_info *fs_info = root->fs_info;
2375        ktime_t start = ktime_get();
2376        int ret;
2377        unsigned long count = 0;
2378        unsigned long actual_count = 0;
2379        int must_insert_reserved = 0;
2380
2381        delayed_refs = &trans->transaction->delayed_refs;
2382        while (1) {
2383                if (!locked_ref) {
2384                        if (count >= nr)
2385                                break;
2386
2387                        spin_lock(&delayed_refs->lock);
2388                        locked_ref = btrfs_select_ref_head(trans);
2389                        if (!locked_ref) {
2390                                spin_unlock(&delayed_refs->lock);
2391                                break;
2392                        }
2393
2394                        /* grab the lock that says we are going to process
2395                         * all the refs for this head */
2396                        ret = btrfs_delayed_ref_lock(trans, locked_ref);
2397                        spin_unlock(&delayed_refs->lock);
2398                        /*
2399                         * we may have dropped the spin lock to get the head
2400                         * mutex lock, and that might have given someone else
2401                         * time to free the head.  If that's true, it has been
2402                         * removed from our list and we can move on.
2403                         */
2404                        if (ret == -EAGAIN) {
2405                                locked_ref = NULL;
2406                                count++;
2407                                continue;
2408                        }
2409                }
2410
2411                /*
2412                 * We need to try to merge add/drops of the same ref since we
2413                 * can run into issues with relocate dropping the implicit ref
2414                 * and then it being added back again before the drop can
2415                 * finish.  If we merged anything we need to re-loop so we can
2416                 * get a good ref.
2417                 */
2418                spin_lock(&locked_ref->lock);
2419                btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2420                                         locked_ref);
2421
2422                /*
2423                 * locked_ref is the head node, so we have to go one
2424                 * node back for any delayed ref updates
2425                 */
2426                ref = select_delayed_ref(locked_ref);
2427
2428                if (ref && ref->seq &&
2429                    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2430                        spin_unlock(&locked_ref->lock);
2431                        btrfs_delayed_ref_unlock(locked_ref);
2432                        spin_lock(&delayed_refs->lock);
2433                        locked_ref->processing = 0;
2434                        delayed_refs->num_heads_ready++;
2435                        spin_unlock(&delayed_refs->lock);
2436                        locked_ref = NULL;
2437                        cond_resched();
2438                        count++;
2439                        continue;
2440                }
2441
2442                /*
2443                 * record the must insert reserved flag before we
2444                 * drop the spin lock.
2445                 */
2446                must_insert_reserved = locked_ref->must_insert_reserved;
2447                locked_ref->must_insert_reserved = 0;
2448
2449                extent_op = locked_ref->extent_op;
2450                locked_ref->extent_op = NULL;
2451
2452                if (!ref) {
2453
2455                        /* All delayed refs have been processed. Go ahead
2456                         * and send the head node to run_one_delayed_ref,
2457                         * so that any accounting fixes can happen.
2458                         */
2459                        ref = &locked_ref->node;
2460
2461                        if (extent_op && must_insert_reserved) {
2462                                btrfs_free_delayed_extent_op(extent_op);
2463                                extent_op = NULL;
2464                        }
2465
2466                        if (extent_op) {
2467                                spin_unlock(&locked_ref->lock);
2468                                ret = run_delayed_extent_op(trans, root,
2469                                                            ref, extent_op);
2470                                btrfs_free_delayed_extent_op(extent_op);
2471
2472                                if (ret) {
2473                                        /*
2474                                         * Need to reset must_insert_reserved if
2475                                         * there was an error so the abort stuff
2476                                         * can cleanup the reserved space
2477                                         * properly.
2478                                         */
2479                                        if (must_insert_reserved)
2480                                                locked_ref->must_insert_reserved = 1;
2481                                        locked_ref->processing = 0;
2482                                        btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2483                                        btrfs_delayed_ref_unlock(locked_ref);
2484                                        return ret;
2485                                }
2486                                continue;
2487                        }
2488
2489                        /*
2490                         * Need to drop our head ref lock and re-acquire the
2491                         * delayed ref lock and then re-check to make sure
2492                         * nobody got added.
2493                         */
2494                        spin_unlock(&locked_ref->lock);
2495                        spin_lock(&delayed_refs->lock);
2496                        spin_lock(&locked_ref->lock);
2497                        if (rb_first(&locked_ref->ref_root) ||
2498                            locked_ref->extent_op) {
2499                                spin_unlock(&locked_ref->lock);
2500                                spin_unlock(&delayed_refs->lock);
2501                                continue;
2502                        }
2503                        ref->in_tree = 0;
2504                        delayed_refs->num_heads--;
2505                        rb_erase(&locked_ref->href_node,
2506                                 &delayed_refs->href_root);
2507                        spin_unlock(&delayed_refs->lock);
2508                } else {
2509                        actual_count++;
2510                        ref->in_tree = 0;
2511                        rb_erase(&ref->rb_node, &locked_ref->ref_root);
2512                }
2513                atomic_dec(&delayed_refs->num_entries);
2514
2515                if (!btrfs_delayed_ref_is_head(ref)) {
2516                        /*
2517                         * when we play the delayed ref, also correct the
2518                         * ref_mod on head
2519                         */
2520                        switch (ref->action) {
2521                        case BTRFS_ADD_DELAYED_REF:
2522                        case BTRFS_ADD_DELAYED_EXTENT:
2523                                locked_ref->node.ref_mod -= ref->ref_mod;
2524                                break;
2525                        case BTRFS_DROP_DELAYED_REF:
2526                                locked_ref->node.ref_mod += ref->ref_mod;
2527                                break;
2528                        default:
2529                                WARN_ON(1);
2530                        }
2531                }
2532                spin_unlock(&locked_ref->lock);
2533
2534                ret = run_one_delayed_ref(trans, root, ref, extent_op,
2535                                          must_insert_reserved);
2536
2537                btrfs_free_delayed_extent_op(extent_op);
2538                if (ret) {
2539                        locked_ref->processing = 0;
2540                        btrfs_delayed_ref_unlock(locked_ref);
2541                        btrfs_put_delayed_ref(ref);
2542                        btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
2543                        return ret;
2544                }
2545
2546                /*
2547                 * If this node is a head, that means all the refs in this head
2548                 * have been dealt with, and we will pick the next head to deal
2549                 * with, so we must unlock the head and drop it from the cluster
2550                 * list before we release it.
2551                 */
2552                if (btrfs_delayed_ref_is_head(ref)) {
2553                        btrfs_delayed_ref_unlock(locked_ref);
2554                        locked_ref = NULL;
2555                }
2556                btrfs_put_delayed_ref(ref);
2557                count++;
2558                cond_resched();
2559        }
2560
2561        /*
2562         * We don't want to include ref heads, since we can have empty ref
2563         * heads and those would drastically skew the runtime down: for them
2564         * we only do accounting, no actual extent tree updates.
2565         */
2566        if (actual_count > 0) {
2567                u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2568                u64 avg;
2569
2570                /*
2571                 * We weigh the current average higher than our current runtime
2572                 * to avoid large swings in the average.
2573                 */
2574                spin_lock(&delayed_refs->lock);
2575                avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2576                avg = div64_u64(avg, 4);
2577                fs_info->avg_delayed_ref_runtime = avg;
2578                spin_unlock(&delayed_refs->lock);
2579        }
2580        return 0;
2581}
2582
2583#ifdef SCRAMBLE_DELAYED_REFS
2584/*
2585 * Normally delayed refs get processed in ascending bytenr order. This
2586 * correlates in most cases to the order added. To expose dependencies on this
2587 * order, we start to process the tree in the middle instead of the beginning.
2588 */
2589static u64 find_middle(struct rb_root *root)
2590{
2591        struct rb_node *n = root->rb_node;
2592        struct btrfs_delayed_ref_node *entry;
2593        int alt = 1;
2594        u64 middle;
2595        u64 first = 0, last = 0;
2596
2597        n = rb_first(root);
2598        if (n) {
2599                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2600                first = entry->bytenr;
2601        }
2602        n = rb_last(root);
2603        if (n) {
2604                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2605                last = entry->bytenr;
2606        }
2607        n = root->rb_node;
2608
2609        while (n) {
2610                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2611                WARN_ON(!entry->in_tree);
2612
2613                middle = entry->bytenr;
2614
2615                if (alt)
2616                        n = n->rb_left;
2617                else
2618                        n = n->rb_right;
2619
2620                alt = 1 - alt;
2621        }
2622        return middle;
2623}
2624#endif
2625
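/*
 * Rough estimate of how many extent tree leaves the given number of
 * delayed ref heads may touch: one extent item plus one inline ref per
 * head, plus the tree block info on non-skinny filesystems.
 */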
2626static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2627{
2628        u64 num_bytes;
2629
2630        num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2631                             sizeof(struct btrfs_extent_inline_ref));
2632        if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2633                num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2634
2635        /*
2636         * We don't ever fill up leaves all the way so multiply by 2 just to be
2637         * closer to what we're really going to want to use.
2638         */
2639        return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2640}
2641
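/*
 * Returns 1 when the global reservation looks too small to safely run
 * the delayed refs we have queued up, so the caller should throttle.
 */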
2642int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2643                                       struct btrfs_root *root)
2644{
2645        struct btrfs_block_rsv *global_rsv;
2646        u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2647        u64 num_bytes;
2648        int ret = 0;
2649
2650        num_bytes = btrfs_calc_trans_metadata_size(root, 1);
2651        num_heads = heads_to_leaves(root, num_heads);
2652        if (num_heads > 1)
2653                num_bytes += (num_heads - 1) * root->leafsize;
2654        num_bytes <<= 1;
2655        global_rsv = &root->fs_info->global_block_rsv;
2656
2657        /*
2658         * If we can't allocate any more chunks, let's make sure we have _lots_ of
2659         * wiggle room since running delayed refs can create more delayed refs.
2660         */
2661        if (global_rsv->space_info->full)
2662                num_bytes <<= 1;
2663
2664        spin_lock(&global_rsv->lock);
2665        if (global_rsv->reserved <= num_bytes)
2666                ret = 1;
2667        spin_unlock(&global_rsv->lock);
2668        return ret;
2669}
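    /*
     * Worked example (illustrative; assumes 16k nodes, for which
     * btrfs_calc_trans_metadata_size(root, 1) is roughly 256k): 1000 ready
     * heads map to about 2 leaves above, so
     *
     *        num_bytes = 256k + 1 * 16k = 272k, doubled to ~544k,
     *
     * and doubled again to ~1.1M if the space info is full.  The function
     * then returns 1 ("low on space") once the global reserve holds no more
     * than that.
     */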
2670
2671int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
2672                                       struct btrfs_root *root)
2673{
2674        struct btrfs_fs_info *fs_info = root->fs_info;
2675        u64 num_entries =
2676                atomic_read(&trans->transaction->delayed_refs.num_entries);
2677        u64 avg_runtime;
2678        u64 val;
2679
2680        smp_mb();
2681        avg_runtime = fs_info->avg_delayed_ref_runtime;
2682        val = num_entries * avg_runtime;
2683        if (val >= NSEC_PER_SEC)
2684                return 1;
2685        if (val >= NSEC_PER_SEC / 2)
2686                return 2;
2687
2688        return btrfs_check_space_for_delayed_refs(trans, root);
2689}
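    /*
     * Worked example (illustrative): with an average runtime of 100us per
     * ref, 10000 queued entries estimate to ~1s of work and we return 1
     * (throttle hard); 5000 entries (~0.5s) return 2 (throttle less
     * aggressively); below that we fall back to the global reserve check
     * above.
     */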
2690
2691struct async_delayed_refs {
2692        struct btrfs_root *root;
2693        int count;
2694        int error;
2695        int sync;
2696        struct completion wait;
2697        struct btrfs_work work;
2698};
2699
2700static void delayed_ref_async_start(struct btrfs_work *work)
2701{
2702        struct async_delayed_refs *async;
2703        struct btrfs_trans_handle *trans;
2704        int ret;
2705
2706        async = container_of(work, struct async_delayed_refs, work);
2707
2708        trans = btrfs_join_transaction(async->root);
2709        if (IS_ERR(trans)) {
2710                async->error = PTR_ERR(trans);
2711                goto done;
2712        }
2713
2714        /*
2715         * trans->sync means that when we call end_transaction, we won't
2716         * wait on delayed refs
2717         */
2718        trans->sync = true;
2719        ret = btrfs_run_delayed_refs(trans, async->root, async->count);
2720        if (ret)
2721                async->error = ret;
2722
2723        ret = btrfs_end_transaction(trans, async->root);
2724        if (ret && !async->error)
2725                async->error = ret;
2726done:
2727        if (async->sync)
2728                complete(&async->wait);
2729        else
2730                kfree(async);
2731}
2732
2733int btrfs_async_run_delayed_refs(struct btrfs_root *root,
2734                                 unsigned long count, int wait)
2735{
2736        struct async_delayed_refs *async;
2737        int ret;
2738
2739        async = kmalloc(sizeof(*async), GFP_NOFS);
2740        if (!async)
2741                return -ENOMEM;
2742
2743        async->root = root->fs_info->tree_root;
2744        async->count = count;
2745        async->error = 0;
2746        if (wait)
2747                async->sync = 1;
2748        else
2749                async->sync = 0;
2750        init_completion(&async->wait);
2751
2752        btrfs_init_work(&async->work, delayed_ref_async_start,
2753                        NULL, NULL);
2754
2755        btrfs_queue_work(root->fs_info->extent_workers, &async->work);
2756
2757        if (wait) {
2758                wait_for_completion(&async->wait);
2759                ret = async->error;
2760                kfree(async);
2761                return ret;
2762        }
2763        return 0;
2764}
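    /*
     * Usage sketch (hypothetical caller, not from this file): kick off
     * background processing of roughly 64 refs without waiting,
     *
     *        ret = btrfs_async_run_delayed_refs(root, 64, 0);
     *
     * or do the same work synchronously; with wait != 0 the helper blocks on
     * the completion and returns the worker's error code:
     *
     *        ret = btrfs_async_run_delayed_refs(root, 64, 1);
     */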
2765
2766/*
2767 * this starts processing the delayed reference count updates and
2768 * extent insertions we have queued up so far.  count can be
2769 * 0, which means to process everything in the tree at the start
2770 * of the run (but not newly added entries), or it can be some target
2771 * number you'd like to process.
2772 *
2773 * Returns 0 on success or if called with an aborted transaction
2774 * Returns <0 on error and aborts the transaction
2775 */
2776int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2777                           struct btrfs_root *root, unsigned long count)
2778{
2779        struct rb_node *node;
2780        struct btrfs_delayed_ref_root *delayed_refs;
2781        struct btrfs_delayed_ref_head *head;
2782        int ret;
2783        int run_all = count == (unsigned long)-1;
2784        int run_most = 0;
2785
2786        /* We'll clean this up in btrfs_cleanup_transaction */
2787        if (trans->aborted)
2788                return 0;
2789
2790        if (root == root->fs_info->extent_root)
2791                root = root->fs_info->tree_root;
2792
2793        delayed_refs = &trans->transaction->delayed_refs;
2794        if (count == 0) {
2795                count = atomic_read(&delayed_refs->num_entries) * 2;
2796                run_most = 1;
2797        }
2798
2799again:
2800#ifdef SCRAMBLE_DELAYED_REFS
2801        delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2802#endif
2803        ret = __btrfs_run_delayed_refs(trans, root, count);
2804        if (ret < 0) {
2805                btrfs_abort_transaction(trans, root, ret);
2806                return ret;
2807        }
2808
2809        if (run_all) {
2810                if (!list_empty(&trans->new_bgs))
2811                        btrfs_create_pending_block_groups(trans, root);
2812
2813                spin_lock(&delayed_refs->lock);
2814                node = rb_first(&delayed_refs->href_root);
2815                if (!node) {
2816                        spin_unlock(&delayed_refs->lock);
2817                        goto out;
2818                }
2819                count = (unsigned long)-1;
2820
2821                while (node) {
2822                        head = rb_entry(node, struct btrfs_delayed_ref_head,
2823                                        href_node);
2824                        if (btrfs_delayed_ref_is_head(&head->node)) {
2825                                struct btrfs_delayed_ref_node *ref;
2826
2827                                ref = &head->node;
2828                                atomic_inc(&ref->refs);
2829
2830                                spin_unlock(&delayed_refs->lock);
2831                                /*
2832                                 * Mutex was contended, block until it's
2833                                 * released and try again
2834                                 */
2835                                mutex_lock(&head->mutex);
2836                                mutex_unlock(&head->mutex);
2837
2838                                btrfs_put_delayed_ref(ref);
2839                                cond_resched();
2840                                goto again;
2841                        } else {
2842                                WARN_ON(1);
2843                        }
2844                        node = rb_next(node);
2845                }
2846                spin_unlock(&delayed_refs->lock);
2847                cond_resched();
2848                goto again;
2849        }
2850out:
2851        ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info);
2852        if (ret)
2853                return ret;
2854        assert_qgroups_uptodate(trans);
2855        return 0;
2856}
2857
2858int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2859                                struct btrfs_root *root,
2860                                u64 bytenr, u64 num_bytes, u64 flags,
2861                                int level, int is_data)
2862{
2863        struct btrfs_delayed_extent_op *extent_op;
2864        int ret;
2865
2866        extent_op = btrfs_alloc_delayed_extent_op();
2867        if (!extent_op)
2868                return -ENOMEM;
2869
2870        extent_op->flags_to_set = flags;
2871        extent_op->update_flags = 1;
2872        extent_op->update_key = 0;
2873        extent_op->is_data = is_data ? 1 : 0;
2874        extent_op->level = level;
2875
2876        ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2877                                          num_bytes, extent_op);
2878        if (ret)
2879                btrfs_free_delayed_extent_op(extent_op);
2880        return ret;
2881}
2882
2883static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2884                                      struct btrfs_root *root,
2885                                      struct btrfs_path *path,
2886                                      u64 objectid, u64 offset, u64 bytenr)
2887{
2888        struct btrfs_delayed_ref_head *head;
2889        struct btrfs_delayed_ref_node *ref;
2890        struct btrfs_delayed_data_ref *data_ref;
2891        struct btrfs_delayed_ref_root *delayed_refs;
2892        struct rb_node *node;
2893        int ret = 0;
2894
2895        delayed_refs = &trans->transaction->delayed_refs;
2896        spin_lock(&delayed_refs->lock);
2897        head = btrfs_find_delayed_ref_head(trans, bytenr);
2898        if (!head) {
2899                spin_unlock(&delayed_refs->lock);
2900                return 0;
2901        }
2902
2903        if (!mutex_trylock(&head->mutex)) {
2904                atomic_inc(&head->node.refs);
2905                spin_unlock(&delayed_refs->lock);
2906
2907                btrfs_release_path(path);
2908
2909                /*
2910                 * Mutex was contended, block until it's released and let
2911                 * caller try again
2912                 */
2913                mutex_lock(&head->mutex);
2914                mutex_unlock(&head->mutex);
2915                btrfs_put_delayed_ref(&head->node);
2916                return -EAGAIN;
2917        }
2918        spin_unlock(&delayed_refs->lock);
2919
2920        spin_lock(&head->lock);
2921        node = rb_first(&head->ref_root);
2922        while (node) {
2923                ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2924                node = rb_next(node);
2925
2926                /* If it's a shared ref we know a cross reference exists */
2927                if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2928                        ret = 1;
2929                        break;
2930                }
2931
2932                data_ref = btrfs_delayed_node_to_data_ref(ref);
2933
2934                /*
2935                 * If our ref doesn't match the one we're currently looking at
2936                 * then we have a cross reference.
2937                 */
2938                if (data_ref->root != root->root_key.objectid ||
2939                    data_ref->objectid != objectid ||
2940                    data_ref->offset != offset) {
2941                        ret = 1;
2942                        break;
2943                }
2944        }
2945        spin_unlock(&head->lock);
2946        mutex_unlock(&head->mutex);
2947        return ret;
2948}
2949
2950static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2951                                        struct btrfs_root *root,
2952                                        struct btrfs_path *path,
2953                                        u64 objectid, u64 offset, u64 bytenr)
2954{
2955        struct btrfs_root *extent_root = root->fs_info->extent_root;
2956        struct extent_buffer *leaf;
2957        struct btrfs_extent_data_ref *ref;
2958        struct btrfs_extent_inline_ref *iref;
2959        struct btrfs_extent_item *ei;
2960        struct btrfs_key key;
2961        u32 item_size;
2962        int ret;
2963
2964        key.objectid = bytenr;
2965        key.offset = (u64)-1;
2966        key.type = BTRFS_EXTENT_ITEM_KEY;
2967
2968        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2969        if (ret < 0)
2970                goto out;
2971        BUG_ON(ret == 0); /* Corruption */
2972
2973        ret = -ENOENT;
2974        if (path->slots[0] == 0)
2975                goto out;
2976
2977        path->slots[0]--;
2978        leaf = path->nodes[0];
2979        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2980
2981        if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2982                goto out;
2983
2984        ret = 1;
2985        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2986#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2987        if (item_size < sizeof(*ei)) {
2988                WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2989                goto out;
2990        }
2991#endif
2992        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2993
2994        if (item_size != sizeof(*ei) +
2995            btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2996                goto out;
2997
2998        if (btrfs_extent_generation(leaf, ei) <=
2999            btrfs_root_last_snapshot(&root->root_item))
3000                goto out;
3001
3002        iref = (struct btrfs_extent_inline_ref *)(ei + 1);
3003        if (btrfs_extent_inline_ref_type(leaf, iref) !=
3004            BTRFS_EXTENT_DATA_REF_KEY)
3005                goto out;
3006
3007        ref = (struct btrfs_extent_data_ref *)(&iref->offset);
3008        if (btrfs_extent_refs(leaf, ei) !=
3009            btrfs_extent_data_ref_count(leaf, ref) ||
3010            btrfs_extent_data_ref_root(leaf, ref) !=
3011            root->root_key.objectid ||
3012            btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
3013            btrfs_extent_data_ref_offset(leaf, ref) != offset)
3014                goto out;
3015
3016        ret = 0;
3017out:
3018        return ret;
3019}
3020
3021int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
3022                          struct btrfs_root *root,
3023                          u64 objectid, u64 offset, u64 bytenr)
3024{
3025        struct btrfs_path *path;
3026        int ret;
3027        int ret2;
3028
3029        path = btrfs_alloc_path();
3030        if (!path)
3031                return -ENOMEM;
3032
3033        do {
3034                ret = check_committed_ref(trans, root, path, objectid,
3035                                          offset, bytenr);
3036                if (ret && ret != -ENOENT)
3037                        goto out;
3038
3039                ret2 = check_delayed_ref(trans, root, path, objectid,
3040                                         offset, bytenr);
3041        } while (ret2 == -EAGAIN);
3042
3043        if (ret2 && ret2 != -ENOENT) {
3044                ret = ret2;
3045                goto out;
3046        }
3047
3048        if (ret != -ENOENT || ret2 != -ENOENT)
3049                ret = 0;
3050out:
3051        btrfs_free_path(path);
3052        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
3053                WARN_ON(ret > 0);
3054        return ret;
3055}
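    /*
     * Summary of the combined result above (derived from the two checks):
     * returns 1 if a cross reference exists in either the committed extent
     * tree or the delayed refs, 0 if no cross reference was found, and <0 on
     * error.
     */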
3056
3057static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3058                           struct btrfs_root *root,
3059                           struct extent_buffer *buf,
3060                           int full_backref, int inc, int no_quota)
3061{
3062        u64 bytenr;
3063        u64 num_bytes;
3064        u64 parent;
3065        u64 ref_root;
3066        u32 nritems;
3067        struct btrfs_key key;
3068        struct btrfs_file_extent_item *fi;
3069        int i;
3070        int level;
3071        int ret = 0;
3072        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
3073                            u64, u64, u64, u64, u64, u64, int);
3074
3075#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
3076        if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
3077                return 0;
3078#endif
3079        ref_root = btrfs_header_owner(buf);
3080        nritems = btrfs_header_nritems(buf);
3081        level = btrfs_header_level(buf);
3082
3083        if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
3084                return 0;
3085
3086        if (inc)
3087                process_func = btrfs_inc_extent_ref;
3088        else
3089                process_func = btrfs_free_extent;
3090
3091        if (full_backref)
3092                parent = buf->start;
3093        else
3094                parent = 0;
3095
3096        for (i = 0; i < nritems; i++) {
3097                if (level == 0) {
3098                        btrfs_item_key_to_cpu(buf, &key, i);
3099                        if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3100                                continue;
3101                        fi = btrfs_item_ptr(buf, i,
3102                                            struct btrfs_file_extent_item);
3103                        if (btrfs_file_extent_type(buf, fi) ==
3104                            BTRFS_FILE_EXTENT_INLINE)
3105                                continue;
3106                        bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3107                        if (bytenr == 0)
3108                                continue;
3109
3110                        num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3111                        key.offset -= btrfs_file_extent_offset(buf, fi);
3112                        ret = process_func(trans, root, bytenr, num_bytes,
3113                                           parent, ref_root, key.objectid,
3114                                           key.offset, no_quota);
3115                        if (ret)
3116                                goto fail;
3117                } else {
3118                        bytenr = btrfs_node_blockptr(buf, i);
3119                        num_bytes = btrfs_level_size(root, level - 1);
3120                        ret = process_func(trans, root, bytenr, num_bytes,
3121                                           parent, ref_root, level - 1, 0,
3122                                           no_quota);
3123                        if (ret)
3124                                goto fail;
3125                }
3126        }
3127        return 0;
3128fail:
3129        return ret;
3130}
3131
3132int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3133                  struct extent_buffer *buf, int full_backref, int no_quota)
3134{
3135        return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota);
3136}
3137
3138int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3139                  struct extent_buffer *buf, int full_backref, int no_quota)
3140{
3141        return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota);
3142}
3143
3144static int write_one_cache_group(struct btrfs_trans_handle *trans,
3145                                 struct btrfs_root *root,
3146                                 struct btrfs_path *path,
3147                                 struct btrfs_block_group_cache *cache)
3148{
3149        int ret;
3150        struct btrfs_root *extent_root = root->fs_info->extent_root;
3151        unsigned long bi;
3152        struct extent_buffer *leaf;
3153
3154        ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3155        if (ret < 0)
3156                goto fail;
3157        BUG_ON(ret); /* Corruption */
3158
3159        leaf = path->nodes[0];
3160        bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3161        write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3162        btrfs_mark_buffer_dirty(leaf);
3163        btrfs_release_path(path);
3164fail:
3165        if (ret) {
3166                btrfs_abort_transaction(trans, root, ret);
3167                return ret;
3168        }
3169        return 0;
3170
3171}
3172
3173static struct btrfs_block_group_cache *
3174next_block_group(struct btrfs_root *root,
3175                 struct btrfs_block_group_cache *cache)
3176{
3177        struct rb_node *node;
3178        spin_lock(&root->fs_info->block_group_cache_lock);
3179        node = rb_next(&cache->cache_node);
3180        btrfs_put_block_group(cache);
3181        if (node) {
3182                cache = rb_entry(node, struct btrfs_block_group_cache,
3183                                 cache_node);
3184                btrfs_get_block_group(cache);
3185        } else
3186                cache = NULL;
3187        spin_unlock(&root->fs_info->block_group_cache_lock);
3188        return cache;
3189}
3190
3191static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3192                            struct btrfs_trans_handle *trans,
3193                            struct btrfs_path *path)
3194{
3195        struct btrfs_root *root = block_group->fs_info->tree_root;
3196        struct inode *inode = NULL;
3197        u64 alloc_hint = 0;
3198        int dcs = BTRFS_DC_ERROR;
3199        int num_pages = 0;
3200        int retries = 0;
3201        int ret = 0;
3202
3203        /*
3204         * If this block group is smaller than 100 megs, don't bother caching the
3205         * block group.
3206         */
3207        if (block_group->key.offset < (100 * 1024 * 1024)) {
3208                spin_lock(&block_group->lock);
3209                block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3210                spin_unlock(&block_group->lock);
3211                return 0;
3212        }
3213
3214again:
3215        inode = lookup_free_space_inode(root, block_group, path);
3216        if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3217                ret = PTR_ERR(inode);
3218                btrfs_release_path(path);
3219                goto out;
3220        }
3221
3222        if (IS_ERR(inode)) {
3223                BUG_ON(retries);
3224                retries++;
3225
3226                if (block_group->ro)
3227                        goto out_free;
3228
3229                ret = create_free_space_inode(root, trans, block_group, path);
3230                if (ret)
3231                        goto out_free;
3232                goto again;
3233        }
3234
3235        /* We've already set up this transaction, go ahead and exit */
3236        if (block_group->cache_generation == trans->transid &&
3237            i_size_read(inode)) {
3238                dcs = BTRFS_DC_SETUP;
3239                goto out_put;
3240        }
3241
3242        /*
3243         * We want to set the generation to 0 so that if anything goes wrong
3244         * from here on out we know not to trust this cache the next time we
3245         * load it.
3246         */
3247        BTRFS_I(inode)->generation = 0;
3248        ret = btrfs_update_inode(trans, root, inode);
3249        WARN_ON(ret);
3250
3251        if (i_size_read(inode) > 0) {
3252                ret = btrfs_check_trunc_cache_free_space(root,
3253                                        &root->fs_info->global_block_rsv);
3254                if (ret)
3255                        goto out_put;
3256
3257                ret = btrfs_truncate_free_space_cache(root, trans, inode);
3258                if (ret)
3259                        goto out_put;
3260        }
3261
3262        spin_lock(&block_group->lock);
3263        if (block_group->cached != BTRFS_CACHE_FINISHED ||
3264            !btrfs_test_opt(root, SPACE_CACHE) ||
3265            block_group->delalloc_bytes) {
3266                /*
3267                 * don't bother trying to write stuff out _if_
3268                 * a) we're not cached, b) we were mounted with nospace_cache, or
3269                 * c) the block group still has pending delalloc bytes.
3270                 */
3271                dcs = BTRFS_DC_WRITTEN;
3272                spin_unlock(&block_group->lock);
3273                goto out_put;
3274        }
3275        spin_unlock(&block_group->lock);
3276
3277        /*
3278         * Try to preallocate enough space based on how big the block group is.
3279         * Keep in mind this has to include any pinned space which could end up
3280         * taking up quite a bit since it's not folded into the other space
3281         * cache.
3282         */
3283        num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
3284        if (!num_pages)
3285                num_pages = 1;
3286
3287        num_pages *= 16;
3288        num_pages *= PAGE_CACHE_SIZE;
3289
3290        ret = btrfs_check_data_free_space(inode, num_pages);
3291        if (ret)
3292                goto out_put;
3293
3294        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3295                                              num_pages, num_pages,
3296                                              &alloc_hint);
3297        if (!ret)
3298                dcs = BTRFS_DC_SETUP;
3299        btrfs_free_reserved_data_space(inode, num_pages);
3300
3301out_put:
3302        iput(inode);
3303out_free:
3304        btrfs_release_path(path);
3305out:
3306        spin_lock(&block_group->lock);
3307        if (!ret && dcs == BTRFS_DC_SETUP)
3308                block_group->cache_generation = trans->transid;
3309        block_group->disk_cache_state = dcs;
3310        spin_unlock(&block_group->lock);
3311
3312        return ret;
3313}
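    /*
     * Worked example for the sizing above (illustrative, assuming 4k pages):
     * a 1G block group gives div64_u64(1G, 256M) = 4, times 16 is 64 pages,
     * i.e. 256k preallocated for the free space cache.  Anything under 256M
     * rounds up to one unit, 16 pages (64k).
     */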
3314
3315int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3316                                   struct btrfs_root *root)
3317{
3318        struct btrfs_block_group_cache *cache;
3319        int err = 0;
3320        struct btrfs_path *path;
3321        u64 last = 0;
3322
3323        path = btrfs_alloc_path();
3324        if (!path)
3325                return -ENOMEM;
3326
3327again:
3328        while (1) {
3329                cache = btrfs_lookup_first_block_group(root->fs_info, last);
3330                while (cache) {
3331                        if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3332                                break;
3333                        cache = next_block_group(root, cache);
3334                }
3335                if (!cache) {
3336                        if (last == 0)
3337                                break;
3338                        last = 0;
3339                        continue;
3340                }
3341                err = cache_save_setup(cache, trans, path);
3342                last = cache->key.objectid + cache->key.offset;
3343                btrfs_put_block_group(cache);
3344        }
3345
3346        while (1) {
3347                if (last == 0) {
3348                        err = btrfs_run_delayed_refs(trans, root,
3349                                                     (unsigned long)-1);
3350                        if (err) /* File system offline */
3351                                goto out;
3352                }
3353
3354                cache = btrfs_lookup_first_block_group(root->fs_info, last);
3355                while (cache) {
3356                        if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
3357                                btrfs_put_block_group(cache);
3358                                goto again;
3359                        }
3360
3361                        if (cache->dirty)
3362                                break;
3363                        cache = next_block_group(root, cache);
3364                }
3365                if (!cache) {
3366                        if (last == 0)
3367                                break;
3368                        last = 0;
3369                        continue;
3370                }
3371
3372                if (cache->disk_cache_state == BTRFS_DC_SETUP)
3373                        cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
3374                cache->dirty = 0;
3375                last = cache->key.objectid + cache->key.offset;
3376
3377                err = write_one_cache_group(trans, root, path, cache);
3378                btrfs_put_block_group(cache);
3379                if (err) /* File system offline */
3380                        goto out;
3381        }
3382
3383        while (1) {
3384                /*
3385                 * I don't think this is needed since we're just marking our
3386                 * preallocated extent as written, but it can't hurt, so do it just
3387                 * in case.
3388                 */
3389                if (last == 0) {
3390                        err = btrfs_run_delayed_refs(trans, root,
3391                                                     (unsigned long)-1);
3392                        if (err) /* File system offline */
3393                                goto out;
3394                }
3395
3396                cache = btrfs_lookup_first_block_group(root->fs_info, last);
3397                while (cache) {
3398                        /*
3399                         * Really this shouldn't happen, but it could if we
3400                         * couldn't write the entire preallocated extent and
3401                         * splitting the extent resulted in a new block.
3402                         */
3403                        if (cache->dirty) {
3404                                btrfs_put_block_group(cache);
3405                                goto again;
3406                        }
3407                        if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3408                                break;
3409                        cache = next_block_group(root, cache);
3410                }
3411                if (!cache) {
3412                        if (last == 0)
3413                                break;
3414                        last = 0;
3415                        continue;
3416                }
3417
3418                err = btrfs_write_out_cache(root, trans, cache, path);
3419
3420                /*
3421                 * If we didn't have an error then the cache state is still
3422                 * NEED_WRITE, so we can set it to WRITTEN.
3423                 */
3424                if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3425                        cache->disk_cache_state = BTRFS_DC_WRITTEN;
3426                last = cache->key.objectid + cache->key.offset;
3427                btrfs_put_block_group(cache);
3428        }
3429out:
3430
3431        btrfs_free_path(path);
3432        return err;
3433}
3434
3435int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3436{
3437        struct btrfs_block_group_cache *block_group;
3438        int readonly = 0;
3439
3440        block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3441        if (!block_group || block_group->ro)
3442                readonly = 1;
3443        if (block_group)
3444                btrfs_put_block_group(block_group);
3445        return readonly;
3446}
3447
3448static const char *alloc_name(u64 flags)
3449{
3450        switch (flags) {
3451        case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
3452                return "mixed";
3453        case BTRFS_BLOCK_GROUP_METADATA:
3454                return "metadata";
3455        case BTRFS_BLOCK_GROUP_DATA:
3456                return "data";
3457        case BTRFS_BLOCK_GROUP_SYSTEM:
3458                return "system";
3459        default:
3460                WARN_ON(1);
3461                return "invalid-combination";
3462        }
3463}
3464
3465static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3466                             u64 total_bytes, u64 bytes_used,
3467                             struct btrfs_space_info **space_info)
3468{
3469        struct btrfs_space_info *found;
3470        int i;
3471        int factor;
3472        int ret;
3473
3474        if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3475                     BTRFS_BLOCK_GROUP_RAID10))
3476                factor = 2;
3477        else
3478                factor = 1;
3479
3480        found = __find_space_info(info, flags);
3481        if (found) {
3482                spin_lock(&found->lock);
3483                found->total_bytes += total_bytes;
3484                found->disk_total += total_bytes * factor;
3485                found->bytes_used += bytes_used;
3486                found->disk_used += bytes_used * factor;
3487                found->full = 0;
3488                spin_unlock(&found->lock);
3489                *space_info = found;
3490                return 0;
3491        }
3492        found = kzalloc(sizeof(*found), GFP_NOFS);
3493        if (!found)
3494                return -ENOMEM;
3495
3496        ret = percpu_counter_init(&found->total_bytes_pinned, 0);
3497        if (ret) {
3498                kfree(found);
3499                return ret;
3500        }
3501
3502        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3503                INIT_LIST_HEAD(&found->block_groups[i]);
3504        init_rwsem(&found->groups_sem);
3505        spin_lock_init(&found->lock);
3506        found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3507        found->total_bytes = total_bytes;
3508        found->disk_total = total_bytes * factor;
3509        found->bytes_used = bytes_used;
3510        found->disk_used = bytes_used * factor;
3511        found->bytes_pinned = 0;
3512        found->bytes_reserved = 0;
3513        found->bytes_readonly = 0;
3514        found->bytes_may_use = 0;
3515        found->full = 0;
3516        found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3517        found->chunk_alloc = 0;
3518        found->flush = 0;
3519        init_waitqueue_head(&found->wait);
3520
3521        ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
3522                                    info->space_info_kobj, "%s",
3523                                    alloc_name(found->flags));
3524        if (ret) {
3525                kfree(found);
3526                return ret;
3527        }
3528
3529        *space_info = found;
3530        list_add_rcu(&found->list, &info->space_info);
3531        if (flags & BTRFS_BLOCK_GROUP_DATA)
3532                info->data_sinfo = found;
3533
3534        return ret;
3535}
3536
3537static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3538{
3539        u64 extra_flags = chunk_to_extended(flags) &
3540                                BTRFS_EXTENDED_PROFILE_MASK;
3541
3542        write_seqlock(&fs_info->profiles_lock);
3543        if (flags & BTRFS_BLOCK_GROUP_DATA)
3544                fs_info->avail_data_alloc_bits |= extra_flags;
3545        if (flags & BTRFS_BLOCK_GROUP_METADATA)
3546                fs_info->avail_metadata_alloc_bits |= extra_flags;
3547        if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3548                fs_info->avail_system_alloc_bits |= extra_flags;
3549        write_sequnlock(&fs_info->profiles_lock);
3550}
3551
3552/*
3553 * returns target flags in extended format or 0 if restripe for this
3554 * chunk_type is not in progress
3555 *
3556 * should be called with either volume_mutex or balance_lock held
3557 */
3558static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3559{
3560        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3561        u64 target = 0;
3562
3563        if (!bctl)
3564                return 0;
3565
3566        if (flags & BTRFS_BLOCK_GROUP_DATA &&
3567            bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3568                target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3569        } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3570                   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3571                target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3572        } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3573                   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3574                target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3575        }
3576
3577        return target;
3578}
3579
3580/*
3581 * @flags: available profiles in extended format (see ctree.h)
3582 *
3583 * Returns reduced profile in chunk format.  If profile changing is in
3584 * progress (either running or paused) picks the target profile (if it's
3585 * already available), otherwise falls back to plain reducing.
3586 */
3587static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3588{
3589        /*
3590         * we add in the count of missing devices because we want
3591         * to make sure that any RAID levels on a degraded FS
3592         * continue to be honored.
3593         */
3594        u64 num_devices = root->fs_info->fs_devices->rw_devices +
3595                root->fs_info->fs_devices->missing_devices;
3596        u64 target;
3597        u64 tmp;
3598
3599        /*
3600         * see if restripe for this chunk_type is in progress; if so,
3601         * try to reduce to the target profile
3602         */
3603        spin_lock(&root->fs_info->balance_lock);
3604        target = get_restripe_target(root->fs_info, flags);
3605        if (target) {
3606                /* pick target profile only if it's already available */
3607                if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3608                        spin_unlock(&root->fs_info->balance_lock);
3609                        return extended_to_chunk(target);
3610                }
3611        }
3612        spin_unlock(&root->fs_info->balance_lock);
3613
3614        /* First, mask out the RAID levels which aren't possible */
3615        if (num_devices == 1)
3616                flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
3617                           BTRFS_BLOCK_GROUP_RAID5);
3618        if (num_devices < 3)
3619                flags &= ~BTRFS_BLOCK_GROUP_RAID6;
3620        if (num_devices < 4)
3621                flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3622
3623        tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3624                       BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
3625                       BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
3626        flags &= ~tmp;
3627
3628        if (tmp & BTRFS_BLOCK_GROUP_RAID6)
3629                tmp = BTRFS_BLOCK_GROUP_RAID6;
3630        else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
3631                tmp = BTRFS_BLOCK_GROUP_RAID5;
3632        else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
3633                tmp = BTRFS_BLOCK_GROUP_RAID10;
3634        else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
3635                tmp = BTRFS_BLOCK_GROUP_RAID1;
3636        else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
3637                tmp = BTRFS_BLOCK_GROUP_RAID0;
3638
3639        return extended_to_chunk(flags | tmp);
3640}
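    /*
     * Worked example (illustrative): on a 2-device filesystem with available
     * profiles DATA|RAID0|RAID1, RAID6 and RAID10 get masked out first (too
     * few devices), and the priority chain then prefers RAID1 over RAID0, so
     * the reduced result is DATA|RAID1 in chunk format.
     */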
3641
3642static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
3643{
3644        unsigned seq;
3645        u64 flags;
3646
3647        do {
3648                flags = orig_flags;
3649                seq = read_seqbegin(&root->fs_info->profiles_lock);
3650
3651                if (flags & BTRFS_BLOCK_GROUP_DATA)
3652                        flags |= root->fs_info->avail_data_alloc_bits;
3653                else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3654                        flags |= root->fs_info->avail_system_alloc_bits;
3655                else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3656                        flags |= root->fs_info->avail_metadata_alloc_bits;
3657        } while (read_seqretry(&root->fs_info->profiles_lock, seq));
3658
3659        return btrfs_reduce_alloc_profile(root, flags);
3660}
3661
3662u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3663{
3664        u64 flags;
3665        u64 ret;
3666
3667        if (data)
3668                flags = BTRFS_BLOCK_GROUP_DATA;
3669        else if (root == root->fs_info->chunk_root)
3670                flags = BTRFS_BLOCK_GROUP_SYSTEM;
3671        else
3672                flags = BTRFS_BLOCK_GROUP_METADATA;
3673
3674        ret = get_alloc_profile(root, flags);
3675        return ret;
3676}
3677
3678/*
3679 * This will check the space that the inode allocates from to make sure we have
3680 * enough space for bytes.
3681 */
3682int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3683{
3684        struct btrfs_space_info *data_sinfo;
3685        struct btrfs_root *root = BTRFS_I(inode)->root;
3686        struct btrfs_fs_info *fs_info = root->fs_info;
3687        u64 used;
3688        int ret = 0, committed = 0, alloc_chunk = 1;
3689
3690        /* make sure bytes are sectorsize aligned */
3691        bytes = ALIGN(bytes, root->sectorsize);
3692
3693        if (btrfs_is_free_space_inode(inode)) {
3694                committed = 1;
3695                ASSERT(current->journal_info);
3696        }
3697
3698        data_sinfo = fs_info->data_sinfo;
3699        if (!data_sinfo)
3700                goto alloc;
3701
3702again:
3703        /* make sure we have enough space to handle the data first */
3704        spin_lock(&data_sinfo->lock);
3705        used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3706                data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3707                data_sinfo->bytes_may_use;
3708
3709        if (used + bytes > data_sinfo->total_bytes) {
3710                struct btrfs_trans_handle *trans;
3711
3712                /*
3713                 * if we don't have enough free bytes in this space then we need
3714                 * to alloc a new chunk.
3715                 */
3716                if (!data_sinfo->full && alloc_chunk) {
3717                        u64 alloc_target;
3718
3719                        data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3720                        spin_unlock(&data_sinfo->lock);
3721alloc:
3722                        alloc_target = btrfs_get_alloc_profile(root, 1);
3723                        /*
3724                         * It is ugly that we don't do a nolock join
3725                         * transaction for the free space inode case here,
3726                         * but it is safe: we only do the data space
3727                         * reservation for the free space cache inside a
3728                         * transaction context, and the common join
3729                         * transaction just increases the use count of the
3730                         * current transaction handle and doesn't try to
3731                         * acquire the trans_lock of the fs.
3732                         */
3733                        trans = btrfs_join_transaction(root);
3734                        if (IS_ERR(trans))
3735                                return PTR_ERR(trans);
3736
3737                        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3738                                             alloc_target,
3739                                             CHUNK_ALLOC_NO_FORCE);
3740                        btrfs_end_transaction(trans, root);
3741                        if (ret < 0) {
3742                                if (ret != -ENOSPC)
3743                                        return ret;
3744                                else
3745                                        goto commit_trans;
3746                        }
3747
3748                        if (!data_sinfo)
3749                                data_sinfo = fs_info->data_sinfo;
3750
3751                        goto again;
3752                }
3753
3754                /*
3755                 * If we don't have enough pinned space to deal with this
3756                 * allocation don't bother committing the transaction.
3757                 */
3758                if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
3759                                           bytes) < 0)
3760                        committed = 1;
3761                spin_unlock(&data_sinfo->lock);
3762
3763                /* commit the current transaction and try again */
3764commit_trans:
3765                if (!committed &&
3766                    !atomic_read(&root->fs_info->open_ioctl_trans)) {
3767                        committed = 1;
3768
3769                        trans = btrfs_join_transaction(root);
3770                        if (IS_ERR(trans))
3771                                return PTR_ERR(trans);
3772                        ret = btrfs_commit_transaction(trans, root);
3773                        if (ret)
3774                                return ret;
3775                        goto again;
3776                }
3777
3778                trace_btrfs_space_reservation(root->fs_info,
3779                                              "space_info:enospc",
3780                                              data_sinfo->flags, bytes, 1);
3781                return -ENOSPC;
3782        }
3783        data_sinfo->bytes_may_use += bytes;
3784        trace_btrfs_space_reservation(root->fs_info, "space_info",
3785                                      data_sinfo->flags, bytes, 1);
3786        spin_unlock(&data_sinfo->lock);
3787
3788        return 0;
3789}
3790
3791/*
3792 * Called if we need to clear a data reservation for this inode.
3793 */
3794void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3795{
3796        struct btrfs_root *root = BTRFS_I(inode)->root;
3797        struct btrfs_space_info *data_sinfo;
3798
3799        /* make sure bytes are sectorsize aligned */
3800        bytes = ALIGN(bytes, root->sectorsize);
3801
3802        data_sinfo = root->fs_info->data_sinfo;
3803        spin_lock(&data_sinfo->lock);
3804        WARN_ON(data_sinfo->bytes_may_use < bytes);
3805        data_sinfo->bytes_may_use -= bytes;
3806        trace_btrfs_space_reservation(root->fs_info, "space_info",
3807                                      data_sinfo->flags, bytes, 0);
3808        spin_unlock(&data_sinfo->lock);
3809}
3810
3811static void force_metadata_allocation(struct btrfs_fs_info *info)
3812{
3813        struct list_head *head = &info->space_info;
3814        struct btrfs_space_info *found;
3815
3816        rcu_read_lock();
3817        list_for_each_entry_rcu(found, head, list) {
3818                if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3819                        found->force_alloc = CHUNK_ALLOC_FORCE;
3820        }
3821        rcu_read_unlock();
3822}
3823
3824static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
3825{
3826        return (global->size << 1);
3827}
3828
3829static int should_alloc_chunk(struct btrfs_root *root,
3830                              struct btrfs_space_info *sinfo, int force)
3831{
3832        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3833        u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3834        u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3835        u64 thresh;
3836
3837        if (force == CHUNK_ALLOC_FORCE)
3838                return 1;
3839
3840        /*
3841         * We need to take into account the global rsv because for all intents
3842         * and purposes it's used space.  Don't worry about locking the
3843         * global_rsv, it doesn't change except when the transaction commits.
3844         */
3845        if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
3846                num_allocated += calc_global_rsv_need_space(global_rsv);
3847
3848        /*
3849         * in limited mode, we want to have some free space up to
3850         * about 1% of the FS size.
3851         */
3852        if (force == CHUNK_ALLOC_LIMITED) {
3853                thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3854                thresh = max_t(u64, 64 * 1024 * 1024,
3855                               div_factor_fine(thresh, 1));
3856
3857                if (num_bytes - num_allocated < thresh)
3858                        return 1;
3859        }
3860
3861        if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
3862                return 0;
3863        return 1;
3864}
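    /*
     * Worked example (illustrative): in CHUNK_ALLOC_LIMITED mode on a 1T
     * filesystem the threshold is max(64M, 1% of 1T) = ~10G, so a new chunk
     * is allowed whenever this space info has less than ~10G free.
     * Otherwise a chunk is only allocated once roughly 80% of the space
     * (div_factor(num_bytes, 8)) has been allocated.
     */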
3865
3866static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3867{
3868        u64 num_dev;
3869
3870        if (type & (BTRFS_BLOCK_GROUP_RAID10 |
3871                    BTRFS_BLOCK_GROUP_RAID0 |
3872                    BTRFS_BLOCK_GROUP_RAID5 |
3873                    BTRFS_BLOCK_GROUP_RAID6))
3874                num_dev = root->fs_info->fs_devices->rw_devices;
3875        else if (type & BTRFS_BLOCK_GROUP_RAID1)
3876                num_dev = 2;
3877        else
3878                num_dev = 1;    /* DUP or single */
3879
3880        /* metadata for updating devices and the chunk tree */
3881        return btrfs_calc_trans_metadata_size(root, num_dev + 1);
3882}
3883
3884static void check_system_chunk(struct btrfs_trans_handle *trans,
3885                               struct btrfs_root *root, u64 type)
3886{
3887        struct btrfs_space_info *info;
3888        u64 left;
3889        u64 thresh;
3890
3891        info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3892        spin_lock(&info->lock);
3893        left = info->total_bytes - info->bytes_used - info->bytes_pinned -
3894                info->bytes_reserved - info->bytes_readonly;
3895        spin_unlock(&info->lock);
3896
3897        thresh = get_system_chunk_thresh(root, type);
3898        if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
3899                btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
3900                        left, thresh, type);
3901                dump_space_info(info, 0, 0);
3902        }
3903
3904        if (left < thresh) {
3905                u64 flags;
3906
3907                flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
3908                btrfs_alloc_chunk(trans, root, flags);
3909        }
3910}
3911
3912static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3913                          struct btrfs_root *extent_root, u64 flags, int force)
3914{
3915        struct btrfs_space_info *space_info;
3916        struct btrfs_fs_info *fs_info = extent_root->fs_info;
3917        int wait_for_alloc = 0;
3918        int ret = 0;
3919
3920        /* Don't re-enter if we're already allocating a chunk */
3921        if (trans->allocating_chunk)
3922                return -ENOSPC;
3923
3924        space_info = __find_space_info(extent_root->fs_info, flags);
3925        if (!space_info) {
3926                ret = update_space_info(extent_root->fs_info, flags,
3927                                        0, 0, &space_info);
3928                BUG_ON(ret); /* -ENOMEM */
3929        }
3930        BUG_ON(!space_info); /* Logic error */
3931
3932again:
3933        spin_lock(&space_info->lock);
3934        if (force < space_info->force_alloc)
3935                force = space_info->force_alloc;
3936        if (space_info->full) {
3937                if (should_alloc_chunk(extent_root, space_info, force))
3938                        ret = -ENOSPC;
3939                else
3940                        ret = 0;
3941                spin_unlock(&space_info->lock);
3942                return ret;
3943        }
3944
3945        if (!should_alloc_chunk(extent_root, space_info, force)) {
3946                spin_unlock(&space_info->lock);
3947                return 0;
3948        } else if (space_info->chunk_alloc) {
3949                wait_for_alloc = 1;
3950        } else {
3951                space_info->chunk_alloc = 1;
3952        }
3953
3954        spin_unlock(&space_info->lock);
3955
3956        mutex_lock(&fs_info->chunk_mutex);
3957
3958        /*
3959         * The chunk_mutex is held throughout the entirety of a chunk
3960         * allocation, so once we've acquired the chunk_mutex we know that the
3961         * other guy is done and we need to recheck and see if we should
3962         * allocate.
3963         */
3964        if (wait_for_alloc) {
3965                mutex_unlock(&fs_info->chunk_mutex);
3966                wait_for_alloc = 0;
3967                goto again;
3968        }
3969
3970        trans->allocating_chunk = true;
3971
3972        /*
3973         * If we have mixed data/metadata chunks we want to make sure we keep
3974         * allocating mixed chunks instead of individual chunks.
3975         */
3976        if (btrfs_mixed_space_info(space_info))
3977                flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3978
3979        /*
3980         * if we're doing a data chunk, go ahead and make sure that
3981         * we keep a reasonable number of metadata chunks allocated in the
3982         * FS as well.
3983         */
3984        if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3985                fs_info->data_chunk_allocations++;
3986                if (!(fs_info->data_chunk_allocations %
3987                      fs_info->metadata_ratio))
3988                        force_metadata_allocation(fs_info);
3989        }
3990
3991        /*
3992         * Check if we have enough space in SYSTEM chunk because we may need
3993         * to update devices.
3994         */
3995        check_system_chunk(trans, extent_root, flags);
3996
3997        ret = btrfs_alloc_chunk(trans, extent_root, flags);
3998        trans->allocating_chunk = false;
3999
4000        spin_lock(&space_info->lock);
4001        if (ret < 0 && ret != -ENOSPC)
4002                goto out;
4003        if (ret)
4004                space_info->full = 1;
4005        else
4006                ret = 1;
4007
4008        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4009out:
4010        space_info->chunk_alloc = 0;
4011        spin_unlock(&space_info->lock);
4012        mutex_unlock(&fs_info->chunk_mutex);
4013        return ret;
4014}
4015
4016static int can_overcommit(struct btrfs_root *root,
4017                          struct btrfs_space_info *space_info, u64 bytes,
4018                          enum btrfs_reserve_flush_enum flush)
4019{
4020        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4021        u64 profile = btrfs_get_alloc_profile(root, 0);
4022        u64 space_size;
4023        u64 avail;
4024        u64 used;
4025
4026        used = space_info->bytes_used + space_info->bytes_reserved +
4027                space_info->bytes_pinned + space_info->bytes_readonly;
4028
4029        /*
4030         * We only want to allow overcommitting if we have lots of actual space
4031         * free, but if we don't have enough space to handle the global reserve
4032         * space then we could end up having a real enospc problem when trying
4033         * to allocate a chunk or some other such important allocation.
4034         */
4035        spin_lock(&global_rsv->lock);
4036        space_size = calc_global_rsv_need_space(global_rsv);
4037        spin_unlock(&global_rsv->lock);
4038        if (used + space_size >= space_info->total_bytes)
4039                return 0;
4040
4041        used += space_info->bytes_may_use;
4042
4043        spin_lock(&root->fs_info->free_chunk_lock);
4044        avail = root->fs_info->free_chunk_space;
4045        spin_unlock(&root->fs_info->free_chunk_lock);
4046
4047        /*
4048         * If we have dup, raid1 or raid10 then only half of the free
4049         * space is actually useable.  For raid56, the space info used
4050         * doesn't include the parity drive, so we don't have to
4051         * change the math
4052         */
4053        if (profile & (BTRFS_BLOCK_GROUP_DUP |
4054                       BTRFS_BLOCK_GROUP_RAID1 |
4055                       BTRFS_BLOCK_GROUP_RAID10))
4056                avail >>= 1;
4057
4058        /*
4059         * If we aren't flushing all things, let us overcommit up to
4060         * 1/2 of the space. If we can flush everything, don't let us
4061         * overcommit too much: limit it to 1/8 of the space.
4062         */
4063        if (flush == BTRFS_RESERVE_FLUSH_ALL)
4064                avail >>= 3;
4065        else
4066                avail >>= 1;
4067
4068        if (used + bytes < space_info->total_bytes + avail)
4069                return 1;
4070        return 0;
4071}
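    /*
     * Worked example (illustrative, assuming the global reserve gate above
     * passes): a metadata space info with 10G total and 9G used including
     * bytes_may_use, plus 8G of unallocated device space on RAID1: avail is
     * halved to 4G and, with BTRFS_RESERVE_FLUSH_ALL, cut to 1/8, i.e. 512M.
     * A 256M reservation then succeeds because 9G + 256M < 10G + 512M.
     */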
4072
4073static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
4074                                         unsigned long nr_pages, int nr_items)
4075{
4076        struct super_block *sb = root->fs_info->sb;
4077
4078        if (down_read_trylock(&sb->s_umount)) {
4079                writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
4080                up_read(&sb->s_umount);
4081        } else {
4082                /*
4083                 * We needn't worry about the filesystem going from r/w to r/o even
4084                 * though we don't acquire the ->s_umount mutex, because the
4085                 * filesystem should guarantee that the delalloc inode list is empty
4086                 * after the filesystem becomes read-only (all dirty pages are
4087                 * written to disk).
4088                 */
4089                btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
4090                if (!current->journal_info)
4091                        btrfs_wait_ordered_roots(root->fs_info, nr_items);
4092        }
4093}
4094
4095static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
4096{
4097        u64 bytes;
4098        int nr;
4099
4100        bytes = btrfs_calc_trans_metadata_size(root, 1);
4101        nr = (int)div64_u64(to_reclaim, bytes);
4102        if (!nr)
4103                nr = 1;
4104        return nr;
4105}
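    /*
     * Worked example (illustrative, assuming 16k nodes where one item of
     * trans metadata is ~256k): asking to reclaim 1M yields nr = 4 items,
     * which shrink_delalloc() below turns back into 4 * EXTENT_SIZE_PER_ITEM
     * = 1M of delalloc to flush.
     */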
4106
4107#define EXTENT_SIZE_PER_ITEM    (256 * 1024)
4108
4109/*
4110 * shrink metadata reservation for delalloc
4111 */
4112static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
4113                            bool wait_ordered)
4114{
4115        struct btrfs_block_rsv *block_rsv;
4116        struct btrfs_space_info *space_info;
4117        struct btrfs_trans_handle *trans;
4118        u64 delalloc_bytes;
4119        u64 max_reclaim;
4120        long time_left;
4121        unsigned long nr_pages;
4122        int loops;
4123        int items;
4124        enum btrfs_reserve_flush_enum flush;
4125
4126        /* Calc the number of items we need to flush for this space reservation */
4127        items = calc_reclaim_items_nr(root, to_reclaim);
4128        to_reclaim = items * EXTENT_SIZE_PER_ITEM;
4129
4130        trans = (struct btrfs_trans_handle *)current->journal_info;
4131        block_rsv = &root->fs_info->delalloc_block_rsv;
4132        space_info = block_rsv->space_info;
4133
4134        delalloc_bytes = percpu_counter_sum_positive(
4135                                                &root->fs_info->delalloc_bytes);
4136        if (delalloc_bytes == 0) {
4137                if (trans)
4138                        return;
4139                if (wait_ordered)
4140                        btrfs_wait_ordered_roots(root->fs_info, items);
4141                return;
4142        }
4143
4144        loops = 0;
4145        while (delalloc_bytes && loops < 3) {
4146                max_reclaim = min(delalloc_bytes, to_reclaim);
4147                nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
4148                btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
4149                /*
4150                 * We need to wait for the async pages to actually start before
4151                 * we do anything.
4152                 */
4153                max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
4154                if (!max_reclaim)
4155                        goto skip_async;
4156
4157                if (max_reclaim <= nr_pages)
4158                        max_reclaim = 0;
4159                else
4160                        max_reclaim -= nr_pages;
4161
4162                wait_event(root->fs_info->async_submit_wait,
4163                           atomic_read(&root->fs_info->async_delalloc_pages) <=
4164                           (int)max_reclaim);
4165skip_async:
4166                if (!trans)
4167                        flush = BTRFS_RESERVE_FLUSH_ALL;
4168                else
4169                        flush = BTRFS_RESERVE_NO_FLUSH;
4170                spin_lock(&space_info->lock);
4171                if (can_overcommit(root, space_info, orig, flush)) {
4172                        spin_unlock(&space_info->lock);
4173                        break;
4174                }
4175                spin_unlock(&space_info->lock);
4176
4177                loops++;
4178                if (wait_ordered && !trans) {
4179                        btrfs_wait_ordered_roots(root->fs_info, items);
4180                } else {
4181                        time_left = schedule_timeout_killable(1);
4182                        if (time_left)
4183                                break;
4184                }
4185                delalloc_bytes = percpu_counter_sum_positive(
4186                                                &root->fs_info->delalloc_bytes);
4187        }
4188}
4189
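/*
 * Worked example: with items = 4 the target above becomes
 * 4 * EXTENT_SIZE_PER_ITEM = 1MiB of delalloc, i.e. 256 pages with
 * 4KiB pages.  Each pass writes back that many pages, waits for the
 * async work to drain, and rechecks can_overcommit(), giving up after
 * three loops.
 */
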
4190/**
4191 * may_commit_transaction - possibly commit the transaction if it's OK to
4192 * @root - the root we're allocating for
4193 * @bytes - the number of bytes we want to reserve
4194 * @force - force the commit
4195 *
4196 * This will check to make sure that committing the transaction will actually
4197 * get us somewhere and then commit the transaction if it does.  Otherwise it
4198 * will return -ENOSPC.
4199 */
4200static int may_commit_transaction(struct btrfs_root *root,
4201                                  struct btrfs_space_info *space_info,
4202                                  u64 bytes, int force)
4203{
4204        struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
4205        struct btrfs_trans_handle *trans;
4206
4207        trans = (struct btrfs_trans_handle *)current->journal_info;
4208        if (trans)
4209                return -EAGAIN;
4210
4211        if (force)
4212                goto commit;
4213
4214        /* See if there is enough pinned space to make this reservation */
4215        if (percpu_counter_compare(&space_info->total_bytes_pinned,
4216                                   bytes) >= 0)
4217                goto commit;
4218
4219        /*
4220         * See if there is some space in the delayed insertion reservation for
4221         * this reservation.
4222         */
4223        if (space_info != delayed_rsv->space_info)
4224                return -ENOSPC;
4225
4226        spin_lock(&delayed_rsv->lock);
4227        if (percpu_counter_compare(&space_info->total_bytes_pinned,
4228                                   bytes - delayed_rsv->size) >= 0) {
4229                spin_unlock(&delayed_rsv->lock);
4230                return -ENOSPC;
4231        }
4232        spin_unlock(&delayed_rsv->lock);
4233
4234commit:
4235        trans = btrfs_join_transaction(root);
4236        if (IS_ERR(trans))
4237                return -ENOSPC;
4238
4239        return btrfs_commit_transaction(trans, root);
4240}
4241
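/*
 * Worked example: if total_bytes_pinned is 3MiB and we want a 2MiB
 * reservation, a commit will return at least as much pinned space to
 * the free pool as we need, so it is worth the cost and we go ahead
 * with it.
 */
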
4242enum flush_state {
4243        FLUSH_DELAYED_ITEMS_NR  =       1,
4244        FLUSH_DELAYED_ITEMS     =       2,
4245        FLUSH_DELALLOC          =       3,
4246        FLUSH_DELALLOC_WAIT     =       4,
4247        ALLOC_CHUNK             =       5,
4248        COMMIT_TRANS            =       6,
4249};
4250
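/*
 * flush_space() below tries these states in ascending order of cost:
 * a bounded delayed-item run, a full delayed-item run, delalloc
 * writeback (optionally waiting on ordered extents), chunk allocation,
 * and finally a transaction commit as the last resort.
 */
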
4251static int flush_space(struct btrfs_root *root,
4252                       struct btrfs_space_info *space_info, u64 num_bytes,
4253                       u64 orig_bytes, int state)
4254{
4255        struct btrfs_trans_handle *trans;
4256        int nr;
4257        int ret = 0;
4258
4259        switch (state) {
4260        case FLUSH_DELAYED_ITEMS_NR:
4261        case FLUSH_DELAYED_ITEMS:
4262                if (state == FLUSH_DELAYED_ITEMS_NR)
4263                        nr = calc_reclaim_items_nr(root, num_bytes) * 2;
4264                else
4265                        nr = -1;
4266
4267                trans = btrfs_join_transaction(root);
4268                if (IS_ERR(trans)) {
4269                        ret = PTR_ERR(trans);
4270                        break;
4271                }
4272                ret = btrfs_run_delayed_items_nr(trans, root, nr);
4273                btrfs_end_transaction(trans, root);
4274                break;
4275        case FLUSH_DELALLOC:
4276        case FLUSH_DELALLOC_WAIT:
4277                shrink_delalloc(root, num_bytes * 2, orig_bytes,
4278                                state == FLUSH_DELALLOC_WAIT);
4279                break;
4280        case ALLOC_CHUNK:
4281                trans = btrfs_join_transaction(root);
4282                if (IS_ERR(trans)) {
4283                        ret = PTR_ERR(trans);
4284                        break;
4285                }
4286                ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4287                                     btrfs_get_alloc_profile(root, 0),
4288                                     CHUNK_ALLOC_NO_FORCE);
4289                btrfs_end_transaction(trans, root);
4290                if (ret == -ENOSPC)
4291                        ret = 0;
4292                break;
4293        case COMMIT_TRANS:
4294                ret = may_commit_transaction(root, space_info, orig_bytes, 0);
4295                break;
4296        default:
4297                ret = -ENOSPC;
4298                break;
4299        }
4300
4301        return ret;
4302}
4303
4304static inline u64
4305btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
4306                                 struct btrfs_space_info *space_info)
4307{
4308        u64 used;
4309        u64 expected;
4310        u64 to_reclaim;
4311
4312        to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
4313                                16 * 1024 * 1024);
4314        spin_lock(&space_info->lock);
4315        if (can_overcommit(root, space_info, to_reclaim,
4316                           BTRFS_RESERVE_FLUSH_ALL)) {
4317                to_reclaim = 0;
4318                goto out;
4319        }
4320
4321        used = space_info->bytes_used + space_info->bytes_reserved +
4322               space_info->bytes_pinned + space_info->bytes_readonly +
4323               space_info->bytes_may_use;
4324        if (can_overcommit(root, space_info, 1024 * 1024,
4325                           BTRFS_RESERVE_FLUSH_ALL))
4326                expected = div_factor_fine(space_info->total_bytes, 95);
4327        else
4328                expected = div_factor_fine(space_info->total_bytes, 90);
4329
4330        if (used > expected)
4331                to_reclaim = used - expected;
4332        else
4333                to_reclaim = 0;
4334        to_reclaim = min(to_reclaim, space_info->bytes_may_use +
4335                                     space_info->bytes_reserved);
4336out:
4337        spin_unlock(&space_info->lock);
4338
4339        return to_reclaim;
4340}
4341
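/*
 * Worked example: on an 8-CPU machine the default target above is
 * min(8MiB, 16MiB) = 8MiB.  If we cannot overcommit even 1MiB, the
 * target becomes whatever exceeds 90% of total_bytes; e.g. with
 * total_bytes = 10GiB and used = 9.5GiB we aim to reclaim 0.5GiB,
 * capped by bytes_may_use + bytes_reserved.
 */
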
4342static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
4343                                        struct btrfs_fs_info *fs_info, u64 used)
4344{
4345        return (used >= div_factor_fine(space_info->total_bytes, 98) &&
4346                !btrfs_fs_closing(fs_info) &&
4347                !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
4348}
4349
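/*
 * Worked example: with total_bytes = 10GiB, background reclaim kicks
 * in once used crosses div_factor_fine(10GiB, 98) = 9.8GiB, unless
 * the filesystem is closing or being remounted.
 */
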
4350static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
4351                                       struct btrfs_fs_info *fs_info)
4352{
4353        u64 used;
4354
4355        spin_lock(&space_info->lock);
4356        used = space_info->bytes_used + space_info->bytes_reserved +
4357               space_info->bytes_pinned + space_info->bytes_readonly +
4358               space_info->bytes_may_use;
4359        if (need_do_async_reclaim(space_info, fs_info, used)) {
4360                spin_unlock(&space_info->lock);
4361                return 1;
4362        }
4363        spin_unlock(&space_info->lock);
4364
4365        return 0;
4366}
4367
4368static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4369{
4370        struct btrfs_fs_info *fs_info;
4371        struct btrfs_space_info *space_info;
4372        u64 to_reclaim;
4373        int flush_state;
4374
4375        fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
4376        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4377
4378        to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
4379                                                      space_info);
4380        if (!to_reclaim)
4381                return;
4382
4383        flush_state = FLUSH_DELAYED_ITEMS_NR;
4384        do {
4385                flush_space(fs_info->fs_root, space_info, to_reclaim,
4386                            to_reclaim, flush_state);
4387                flush_state++;
4388                if (!btrfs_need_do_async_reclaim(space_info, fs_info))
4389                        return;
4390        } while (flush_state <= COMMIT_TRANS);
4391
4392        if (btrfs_need_do_async_reclaim(space_info, fs_info))
4393                queue_work(system_unbound_wq, work);
4394}
4395
4396void btrfs_init_async_reclaim_work(struct work_struct *work)
4397{
4398        INIT_WORK(work, btrfs_async_reclaim_metadata_space);
4399}
4400
4401/**
4402 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
4403 * @root - the root we're allocating for
4404 * @block_rsv - the block_rsv we're allocating for
4405 * @orig_bytes - the number of bytes we want
4406 * @flush - whether or not we can flush to make our reservation
4407 *
4408 * This will reserve orig_bytes number of bytes from the space info associated
4409 * with the block_rsv.  If there is not enough space it will make an attempt to
4410 * flush out space to make room.  It will do this by flushing delalloc if
4411 * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
4412 * then no attempt to regain reservations will be made and this will fail if
4413 * there is not enough space already.
4414 */
4415static int reserve_metadata_bytes(struct btrfs_root *root,
4416                                  struct btrfs_block_rsv *block_rsv,
4417                                  u64 orig_bytes,
4418                                  enum btrfs_reserve_flush_enum flush)
4419{
4420        struct btrfs_space_info *space_info = block_rsv->space_info;
4421        u64 used;
4422        u64 num_bytes = orig_bytes;
4423        int flush_state = FLUSH_DELAYED_ITEMS_NR;
4424        int ret = 0;
4425        bool flushing = false;
4426
4427again:
4428        ret = 0;
4429        spin_lock(&space_info->lock);
4430        /*
4431         * We only want to wait if somebody other than us is flushing and we
4432         * are actually allowed to flush all things.
4433         */
4434        while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
4435               space_info->flush) {
4436                spin_unlock(&space_info->lock);
4437                /*
4438                 * If we have a trans handle we can't wait because the flusher
4439                 * may have to commit the transaction, which would mean we would
4440                 * deadlock since we are waiting for the flusher to finish, but
4441                 * hold the current transaction open.
4442                 */
4443                if (current->journal_info)
4444                        return -EAGAIN;
4445                ret = wait_event_killable(space_info->wait, !space_info->flush);
4446                /* Must have been killed, return */
4447                if (ret)
4448                        return -EINTR;
4449
4450                spin_lock(&space_info->lock);
4451        }
4452
4453        ret = -ENOSPC;
4454        used = space_info->bytes_used + space_info->bytes_reserved +
4455                space_info->bytes_pinned + space_info->bytes_readonly +
4456                space_info->bytes_may_use;
4457
4458        /*
4459         * The idea here is that if we've not already over-reserved the block
4460         * group then we can go ahead and save our reservation first and then
4461         * start flushing if we need to.  Otherwise, if we've already
4462         * overcommitted, let's start flushing stuff first and then come back
4463         * and try to make our reservation.
4464         */
4465        if (used <= space_info->total_bytes) {
4466                if (used + orig_bytes <= space_info->total_bytes) {
4467                        space_info->bytes_may_use += orig_bytes;
4468                        trace_btrfs_space_reservation(root->fs_info,
4469                                "space_info", space_info->flags, orig_bytes, 1);
4470                        ret = 0;
4471                } else {
4472                        /*
4473                         * Ok set num_bytes to orig_bytes since we aren't
4474                         * overcommitted, this way we only try and reclaim what
4475                         * we need.
4476                         */
4477                        num_bytes = orig_bytes;
4478                }
4479        } else {
4480                /*
4481                 * Ok, we're overcommitted; set num_bytes to the overcommitted
4482                 * amount plus the amount of bytes that we need for this
4483                 * reservation.
4484                 */
4485                num_bytes = used - space_info->total_bytes +
4486                        (orig_bytes * 2);
4487        }
4488
4489        if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
4490                space_info->bytes_may_use += orig_bytes;
4491                trace_btrfs_space_reservation(root->fs_info, "space_info",
4492                                              space_info->flags, orig_bytes,
4493                                              1);
4494                ret = 0;
4495        }
4496
4497        /*
4498         * Couldn't make our reservation, save our place so while we're trying
4499         * to reclaim space we can actually use it instead of somebody else
4500         * stealing it from us.
4501         *
4502         * We make the other tasks wait for the flush only when we can flush
4503         * all things.
4504         */
4505        if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
4506                flushing = true;
4507                space_info->flush = 1;
4508        } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
4509                used += orig_bytes;
4510                if (need_do_async_reclaim(space_info, root->fs_info, used) &&
4511                    !work_busy(&root->fs_info->async_reclaim_work))
4512                        queue_work(system_unbound_wq,
4513                                   &root->fs_info->async_reclaim_work);
4514        }
4515        spin_unlock(&space_info->lock);
4516
4517        if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
4518                goto out;
4519
4520        ret = flush_space(root, space_info, num_bytes, orig_bytes,
4521                          flush_state);
4522        flush_state++;
4523
4524        /*
4525         * If we are FLUSH_LIMIT, we cannot flush delalloc, or a deadlock
4526         * would happen, so skip the delalloc flush states.
4527         */
4528        if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4529            (flush_state == FLUSH_DELALLOC ||
4530             flush_state == FLUSH_DELALLOC_WAIT))
4531                flush_state = ALLOC_CHUNK;
4532
4533        if (!ret)
4534                goto again;
4535        else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4536                 flush_state < COMMIT_TRANS)
4537                goto again;
4538        else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
4539                 flush_state <= COMMIT_TRANS)
4540                goto again;
4541
4542out:
4543        if (ret == -ENOSPC &&
4544            unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
4545                struct btrfs_block_rsv *global_rsv =
4546                        &root->fs_info->global_block_rsv;
4547
4548                if (block_rsv != global_rsv &&
4549                    !block_rsv_use_bytes(global_rsv, orig_bytes))
4550                        ret = 0;
4551        }
4552        if (ret == -ENOSPC)
4553                trace_btrfs_space_reservation(root->fs_info,
4554                                              "space_info:enospc",
4555                                              space_info->flags, orig_bytes, 1);
4556        if (flushing) {
4557                spin_lock(&space_info->lock);
4558                space_info->flush = 0;
4559                wake_up_all(&space_info->wait);
4560                spin_unlock(&space_info->lock);
4561        }
4562        return ret;
4563}
4564
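/*
 * Note on the retry loop above: a BTRFS_RESERVE_FLUSH_ALL caller walks
 * every flush state up to and including COMMIT_TRANS before failing,
 * a BTRFS_RESERVE_FLUSH_LIMIT caller skips the delalloc states and
 * stops before COMMIT_TRANS, and BTRFS_RESERVE_NO_FLUSH fails
 * immediately unless the space is already available.
 */
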
4565static struct btrfs_block_rsv *get_block_rsv(
4566                                        const struct btrfs_trans_handle *trans,
4567                                        const struct btrfs_root *root)
4568{
4569        struct btrfs_block_rsv *block_rsv = NULL;
4570
4571        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4572                block_rsv = trans->block_rsv;
4573
4574        if (root == root->fs_info->csum_root && trans->adding_csums)
4575                block_rsv = trans->block_rsv;
4576
4577        if (root == root->fs_info->uuid_root)
4578                block_rsv = trans->block_rsv;
4579
4580        if (!block_rsv)
4581                block_rsv = root->block_rsv;
4582
4583        if (!block_rsv)
4584                block_rsv = &root->fs_info->empty_block_rsv;
4585
4586        return block_rsv;
4587}
4588
4589static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
4590                               u64 num_bytes)
4591{
4592        int ret = -ENOSPC;
4593        spin_lock(&block_rsv->lock);
4594        if (block_rsv->reserved >= num_bytes) {
4595                block_rsv->reserved -= num_bytes;
4596                if (block_rsv->reserved < block_rsv->size)
4597                        block_rsv->full = 0;
4598                ret = 0;
4599        }
4600        spin_unlock(&block_rsv->lock);
4601        return ret;
4602}
4603
4604static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4605                                u64 num_bytes, int update_size)
4606{
4607        spin_lock(&block_rsv->lock);
4608        block_rsv->reserved += num_bytes;
4609        if (update_size)
4610                block_rsv->size += num_bytes;
4611        else if (block_rsv->reserved >= block_rsv->size)
4612                block_rsv->full = 1;
4613        spin_unlock(&block_rsv->lock);
4614}
4615
4616int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
4617                             struct btrfs_block_rsv *dest, u64 num_bytes,
4618                             int min_factor)
4619{
4620        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
4621        u64 min_bytes;
4622
4623        if (global_rsv->space_info != dest->space_info)
4624                return -ENOSPC;
4625
4626        spin_lock(&global_rsv->lock);
4627        min_bytes = div_factor(global_rsv->size, min_factor);
4628        if (global_rsv->reserved < min_bytes + num_bytes) {
4629                spin_unlock(&global_rsv->lock);
4630                return -ENOSPC;
4631        }
4632        global_rsv->reserved -= num_bytes;
4633        if (global_rsv->reserved < global_rsv->size)
4634                global_rsv->full = 0;
4635        spin_unlock(&global_rsv->lock);
4636
4637        block_rsv_add_bytes(dest, num_bytes, 1);
4638        return 0;
4639}
4640
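/*
 * Worked example: with a 512MiB global reserve and min_factor = 5,
 * min_bytes is div_factor(512MiB, 5) = 256MiB, so migrating 32MiB
 * succeeds only while the global reserve holds at least 288MiB; this
 * helper never drains it below half full.
 */
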
4641static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4642                                    struct btrfs_block_rsv *block_rsv,
4643                                    struct btrfs_block_rsv *dest, u64 num_bytes)
4644{
4645        struct btrfs_space_info *space_info = block_rsv->space_info;
4646
4647        spin_lock(&block_rsv->lock);
4648        if (num_bytes == (u64)-1)
4649                num_bytes = block_rsv->size;
4650        block_rsv->size -= num_bytes;
4651        if (block_rsv->reserved >= block_rsv->size) {
4652                num_bytes = block_rsv->reserved - block_rsv->size;
4653                block_rsv->reserved = block_rsv->size;
4654                block_rsv->full = 1;
4655        } else {
4656                num_bytes = 0;
4657        }
4658        spin_unlock(&block_rsv->lock);
4659
4660        if (num_bytes > 0) {
4661                if (dest) {
4662                        spin_lock(&dest->lock);
4663                        if (!dest->full) {
4664                                u64 bytes_to_add;
4665
4666                                bytes_to_add = dest->size - dest->reserved;
4667                                bytes_to_add = min(num_bytes, bytes_to_add);
4668                                dest->reserved += bytes_to_add;
4669                                if (dest->reserved >= dest->size)
4670                                        dest->full = 1;
4671                                num_bytes -= bytes_to_add;
4672                        }
4673                        spin_unlock(&dest->lock);
4674                }
4675                if (num_bytes) {
4676                        spin_lock(&space_info->lock);
4677                        space_info->bytes_may_use -= num_bytes;
4678                        trace_btrfs_space_reservation(fs_info, "space_info",
4679                                        space_info->flags, num_bytes, 0);
4680                        spin_unlock(&space_info->lock);
4681                }
4682        }
4683}
4684
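/*
 * Worked example: releasing 2MiB from a rsv with size 10MiB and 12MiB
 * reserved shrinks the size to 8MiB and frees the 4MiB excess.  If
 * the destination rsv is 1MiB short of full it absorbs 1MiB, and the
 * remaining 3MiB goes back to the space_info by decrementing
 * bytes_may_use.
 */
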
4685static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4686                                   struct btrfs_block_rsv *dst, u64 num_bytes)
4687{
4688        int ret;
4689
4690        ret = block_rsv_use_bytes(src, num_bytes);
4691        if (ret)
4692                return ret;
4693
4694        block_rsv_add_bytes(dst, num_bytes, 1);
4695        return 0;
4696}
4697
4698void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
4699{
4700        memset(rsv, 0, sizeof(*rsv));
4701        spin_lock_init(&rsv->lock);
4702        rsv->type = type;
4703}
4704
4705struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
4706                                              unsigned short type)
4707{
4708        struct btrfs_block_rsv *block_rsv;
4709        struct btrfs_fs_info *fs_info = root->fs_info;
4710
4711        block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
4712        if (!block_rsv)
4713                return NULL;
4714
4715        btrfs_init_block_rsv(block_rsv, type);
4716        block_rsv->space_info = __find_space_info(fs_info,
4717                                                  BTRFS_BLOCK_GROUP_METADATA);
4718        return block_rsv;
4719}
4720
4721void btrfs_free_block_rsv(struct btrfs_root *root,
4722                          struct btrfs_block_rsv *rsv)
4723{
4724        if (!rsv)
4725                return;
4726        btrfs_block_rsv_release(root, rsv, (u64)-1);
4727        kfree(rsv);
4728}
4729
4730int btrfs_block_rsv_add(struct btrfs_root *root,
4731                        struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4732                        enum btrfs_reserve_flush_enum flush)
4733{
4734        int ret;
4735
4736        if (num_bytes == 0)
4737                return 0;
4738
4739        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4740        if (!ret) {
4741                block_rsv_add_bytes(block_rsv, num_bytes, 1);
4742                return 0;
4743        }
4744
4745        return ret;
4746}
4747
4748int btrfs_block_rsv_check(struct btrfs_root *root,
4749                          struct btrfs_block_rsv *block_rsv, int min_factor)
4750{
4751        u64 num_bytes = 0;
4752        int ret = -ENOSPC;
4753
4754        if (!block_rsv)
4755                return 0;
4756
4757        spin_lock(&block_rsv->lock);
4758        num_bytes = div_factor(block_rsv->size, min_factor);
4759        if (block_rsv->reserved >= num_bytes)
4760                ret = 0;
4761        spin_unlock(&block_rsv->lock);
4762
4763        return ret;
4764}
4765
4766int btrfs_block_rsv_refill(struct btrfs_root *root,
4767                           struct btrfs_block_rsv *block_rsv, u64 min_reserved,
4768                           enum btrfs_reserve_flush_enum flush)
4769{
4770        u64 num_bytes = 0;
4771        int ret = -ENOSPC;
4772
4773        if (!block_rsv)
4774                return 0;
4775
4776        spin_lock(&block_rsv->lock);
4777        num_bytes = min_reserved;
4778        if (block_rsv->reserved >= num_bytes)
4779                ret = 0;
4780        else
4781                num_bytes -= block_rsv->reserved;
4782        spin_unlock(&block_rsv->lock);
4783
4784        if (!ret)
4785                return 0;
4786
4787        ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4788        if (!ret) {
4789                block_rsv_add_bytes(block_rsv, num_bytes, 0);
4790                return 0;
4791        }
4792
4793        return ret;
4794}
4795
4796int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4797                            struct btrfs_block_rsv *dst_rsv,
4798                            u64 num_bytes)
4799{
4800        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4801}
4802
4803void btrfs_block_rsv_release(struct btrfs_root *root,
4804                             struct btrfs_block_rsv *block_rsv,
4805                             u64 num_bytes)
4806{
4807        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4808        if (global_rsv == block_rsv ||
4809            block_rsv->space_info != global_rsv->space_info)
4810                global_rsv = NULL;
4811        block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4812                                num_bytes);
4813}
4814
4815/*
4816 * helper to calculate size of global block reservation.
4817 * the desired value is sum of space used by extent tree,
4818 * checksum tree and root tree
4819 */
4820static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
4821{
4822        struct btrfs_space_info *sinfo;
4823        u64 num_bytes;
4824        u64 meta_used;
4825        u64 data_used;
4826        int csum_size = btrfs_super_csum_size(fs_info->super_copy);
4827
4828        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
4829        spin_lock(&sinfo->lock);
4830        data_used = sinfo->bytes_used;
4831        spin_unlock(&sinfo->lock);
4832
4833        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4834        spin_lock(&sinfo->lock);
4835        if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
4836                data_used = 0;
4837        meta_used = sinfo->bytes_used;
4838        spin_unlock(&sinfo->lock);
4839
4840        num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
4841                    csum_size * 2;
4842        num_bytes += div64_u64(data_used + meta_used, 50);
4843
4844        if (num_bytes * 3 > meta_used)
4845                num_bytes = div64_u64(meta_used, 3);
4846
4847        return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
4848}
4849
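/*
 * Worked example: with 100GiB of data in 4KiB blocks and 4-byte csums,
 * the checksum term is (100GiB / 4KiB) * 4 * 2 = 200MiB, and the
 * 1/50th term on 110GiB of data+metadata adds another 2.2GiB.  The
 * result is capped at a third of the metadata in use and rounded up
 * to a multiple of 1024 leaves.
 */
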
4850static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4851{
4852        struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
4853        struct btrfs_space_info *sinfo = block_rsv->space_info;
4854        u64 num_bytes;
4855
4856        num_bytes = calc_global_metadata_size(fs_info);
4857
4858        spin_lock(&sinfo->lock);
4859        spin_lock(&block_rsv->lock);
4860
4861        block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
4862
4863        num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
4864                    sinfo->bytes_reserved + sinfo->bytes_readonly +
4865                    sinfo->bytes_may_use;
4866
4867        if (sinfo->total_bytes > num_bytes) {
4868                num_bytes = sinfo->total_bytes - num_bytes;
4869                block_rsv->reserved += num_bytes;
4870                sinfo->bytes_may_use += num_bytes;
4871                trace_btrfs_space_reservation(fs_info, "space_info",
4872                                      sinfo->flags, num_bytes, 1);
4873        }
4874
4875        if (block_rsv->reserved >= block_rsv->size) {
4876                num_bytes = block_rsv->reserved - block_rsv->size;
4877                sinfo->bytes_may_use -= num_bytes;
4878                trace_btrfs_space_reservation(fs_info, "space_info",
4879                                      sinfo->flags, num_bytes, 0);
4880                block_rsv->reserved = block_rsv->size;
4881                block_rsv->full = 1;
4882        }
4883
4884        spin_unlock(&block_rsv->lock);
4885        spin_unlock(&sinfo->lock);
4886}
4887
4888static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4889{
4890        struct btrfs_space_info *space_info;
4891
4892        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4893        fs_info->chunk_block_rsv.space_info = space_info;
4894
4895        space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4896        fs_info->global_block_rsv.space_info = space_info;
4897        fs_info->delalloc_block_rsv.space_info = space_info;
4898        fs_info->trans_block_rsv.space_info = space_info;
4899        fs_info->empty_block_rsv.space_info = space_info;
4900        fs_info->delayed_block_rsv.space_info = space_info;
4901
4902        fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
4903        fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
4904        fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
4905        fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
4906        if (fs_info->quota_root)
4907                fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
4908        fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
4909
4910        update_global_block_rsv(fs_info);
4911}
4912
4913static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4914{
4915        block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4916                                (u64)-1);
4917        WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4918        WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4919        WARN_ON(fs_info->trans_block_rsv.size > 0);
4920        WARN_ON(fs_info->trans_block_rsv.reserved > 0);
4921        WARN_ON(fs_info->chunk_block_rsv.size > 0);
4922        WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
4923        WARN_ON(fs_info->delayed_block_rsv.size > 0);
4924        WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
4925}
4926
4927void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4928                                  struct btrfs_root *root)
4929{
4930        if (!trans->block_rsv)
4931                return;
4932
4933        if (!trans->bytes_reserved)
4934                return;
4935
4936        trace_btrfs_space_reservation(root->fs_info, "transaction",
4937                                      trans->transid, trans->bytes_reserved, 0);
4938        btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4939        trans->bytes_reserved = 0;
4940}
4941
4942/* Can only return 0 or -ENOSPC */
4943int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4944                                  struct inode *inode)
4945{
4946        struct btrfs_root *root = BTRFS_I(inode)->root;
4947        struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4948        struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
4949
4950        /*
4951         * We need to hold space in order to delete our orphan item once we've
4952         * added it, so this takes the reservation so we can release it later
4953         * when we are truly done with the orphan item.
4954         */
4955        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4956        trace_btrfs_space_reservation(root->fs_info, "orphan",
4957                                      btrfs_ino(inode), num_bytes, 1);
4958        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4959}
4960
4961void btrfs_orphan_release_metadata(struct inode *inode)
4962{
4963        struct btrfs_root *root = BTRFS_I(inode)->root;
4964        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4965        trace_btrfs_space_reservation(root->fs_info, "orphan",
4966                                      btrfs_ino(inode), num_bytes, 0);
4967        btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4968}
4969
4970/*
4971 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
4972 * root: the root of the parent directory
4973 * rsv: block reservation
4974 * items: the number of items that we need to reserve
4975 * qgroup_reserved: used to return the reserved size in qgroup
4976 *
4977 * This function is used to reserve the space for snapshot/subvolume
4978 * creation and deletion. Those operations are different with the
4979 * creation and deletion. Those operations differ from the common
4980 * file/directory operations: they change two fs/file trees
4981 * and the root tree, and the number of items that the qgroup reserves
4982 * differs from the free space reservation. So we cannot use
4983 * the space reservation mechanism in start_transaction().
4984int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
4985                                     struct btrfs_block_rsv *rsv,
4986                                     int items,
4987                                     u64 *qgroup_reserved,
4988                                     bool use_global_rsv)
4989{
4990        u64 num_bytes;
4991        int ret;
4992        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4993
4994        if (root->fs_info->quota_enabled) {
4995                /* One for parent inode, two for dir entries */
4996                num_bytes = 3 * root->leafsize;
4997                ret = btrfs_qgroup_reserve(root, num_bytes);
4998                if (ret)
4999                        return ret;
5000        } else {
5001                num_bytes = 0;
5002        }
5003
5004        *qgroup_reserved = num_bytes;
5005
5006        num_bytes = btrfs_calc_trans_metadata_size(root, items);
5007        rsv->space_info = __find_space_info(root->fs_info,
5008                                            BTRFS_BLOCK_GROUP_METADATA);
5009        ret = btrfs_block_rsv_add(root, rsv, num_bytes,
5010                                  BTRFS_RESERVE_FLUSH_ALL);
5011
5012        if (ret == -ENOSPC && use_global_rsv)
5013                ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
5014
5015        if (ret) {
5016                if (*qgroup_reserved)
5017                        btrfs_qgroup_free(root, *qgroup_reserved);
5018        }
5019
5020        return ret;
5021}
5022
5023void btrfs_subvolume_release_metadata(struct btrfs_root *root,
5024                                      struct btrfs_block_rsv *rsv,
5025                                      u64 qgroup_reserved)
5026{
5027        btrfs_block_rsv_release(root, rsv, (u64)-1);
5028        if (qgroup_reserved)
5029                btrfs_qgroup_free(root, qgroup_reserved);
5030}
5031
5032/**
5033 * drop_outstanding_extent - drop an outstanding extent
5034 * @inode: the inode we're dropping the extent for
5035 *
5036 * This is called when we are freeing up an outstanding extent, either
5037 * after an error or after an extent is written.  This will return the number of
5038 * reserved extents that need to be freed.  This must be called with
5039 * BTRFS_I(inode)->lock held.
5040 */
5041static unsigned drop_outstanding_extent(struct inode *inode)
5042{
5043        unsigned drop_inode_space = 0;
5044        unsigned dropped_extents = 0;
5045
5046        BUG_ON(!BTRFS_I(inode)->outstanding_extents);
5047        BTRFS_I(inode)->outstanding_extents--;
5048
5049        if (BTRFS_I(inode)->outstanding_extents == 0 &&
5050            test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5051                               &BTRFS_I(inode)->runtime_flags))
5052                drop_inode_space = 1;
5053
5054        /*
5055         * If we have at least as many outstanding extents as we have
5056         * reserved then we need to leave the reserved extents count alone.
5057         */
5058        if (BTRFS_I(inode)->outstanding_extents >=
5059            BTRFS_I(inode)->reserved_extents)
5060                return drop_inode_space;
5061
5062        dropped_extents = BTRFS_I(inode)->reserved_extents -
5063                BTRFS_I(inode)->outstanding_extents;
5064        BTRFS_I(inode)->reserved_extents -= dropped_extents;
5065        return dropped_extents + drop_inode_space;
5066}
5067
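/*
 * Worked example: if an inode holds 5 reserved extents and its
 * outstanding count drops to 3, this returns 2 and the caller frees
 * the metadata reserved for those two extents.  When the last
 * outstanding extent goes away, the extra inode-update reservation is
 * counted in as well.
 */
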
5068/**
5069 * calc_csum_metadata_size - return the amount of metadata space that must be
5070 *      reserved/free'd for the given bytes.
5071 * @inode: the inode we're manipulating
5072 * @num_bytes: the number of bytes in question
5073 * @reserve: 1 if we are reserving space, 0 if we are freeing space
5074 *
5075 * This adjusts the number of csum_bytes in the inode and then returns the
5076 * correct amount of metadata that must either be reserved or freed.  We
5077 * calculate how many checksums we can fit into one leaf and then divide the
5078 * number of bytes that will need to be checksummed by this value to figure out
5079 * how many checksums will be required.  If we are adding bytes then the number
5080 * may go up and we will return the number of additional bytes that must be
5081 * reserved.  If it is going down we will return the number of bytes that must
5082 * be freed.
5083 *
5084 * This must be called with BTRFS_I(inode)->lock held.
5085 */
5086static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
5087                                   int reserve)
5088{
5089        struct btrfs_root *root = BTRFS_I(inode)->root;
5090        u64 csum_size;
5091        int num_csums_per_leaf;
5092        int num_csums;
5093        int old_csums;
5094
5095        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
5096            BTRFS_I(inode)->csum_bytes == 0)
5097                return 0;
5098
5099        old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
5100        if (reserve)
5101                BTRFS_I(inode)->csum_bytes += num_bytes;
5102        else
5103                BTRFS_I(inode)->csum_bytes -= num_bytes;
5104        csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
5105        num_csums_per_leaf = (int)div64_u64(csum_size,
5106                                            sizeof(struct btrfs_csum_item) +
5107                                            sizeof(struct btrfs_disk_key));
5108        num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
5109        num_csums = num_csums + num_csums_per_leaf - 1;
5110        num_csums = num_csums / num_csums_per_leaf;
5111
5112        old_csums = old_csums + num_csums_per_leaf - 1;
5113        old_csums = old_csums / num_csums_per_leaf;
5114
5115        /* No change, no need to reserve more */
5116        if (old_csums == num_csums)
5117                return 0;
5118
5119        if (reserve)
5120                return btrfs_calc_trans_metadata_size(root,
5121                                                      num_csums - old_csums);
5122
5123        return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
5124}
5125
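/*
 * Worked example: with a 4KiB sectorsize, 1MiB of csum_bytes covers
 * 256 sectors.  If (hypothetically) 100 checksums fit per leaf, that
 * is 3 leaves; reserving another 1MiB makes 512 sectors, i.e. 6
 * leaves, so space for the 3 extra leaves is what gets reserved via
 * btrfs_calc_trans_metadata_size().
 */
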
5126int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5127{
5128        struct btrfs_root *root = BTRFS_I(inode)->root;
5129        struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
5130        u64 to_reserve = 0;
5131        u64 csum_bytes;
5132        unsigned nr_extents = 0;
5133        int extra_reserve = 0;
5134        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
5135        int ret = 0;
5136        bool delalloc_lock = true;
5137        u64 to_free = 0;
5138        unsigned dropped;
5139
5140        /* If we are a free space inode we must not flush, since we will be in
5141         * the middle of a transaction commit.  We also don't need the delalloc
5142         * mutex since we won't race with anybody.  We need this mostly to make
5143         * lockdep shut its filthy mouth.
5144         */
5145        if (btrfs_is_free_space_inode(inode)) {
5146                flush = BTRFS_RESERVE_NO_FLUSH;
5147                delalloc_lock = false;
5148        }
5149
5150        if (flush != BTRFS_RESERVE_NO_FLUSH &&
5151            btrfs_transaction_in_commit(root->fs_info))
5152                schedule_timeout(1);
5153
5154        if (delalloc_lock)
5155                mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
5156
5157        num_bytes = ALIGN(num_bytes, root->sectorsize);
5158
5159        spin_lock(&BTRFS_I(inode)->lock);
5160        BTRFS_I(inode)->outstanding_extents++;
5161
5162        if (BTRFS_I(inode)->outstanding_extents >
5163            BTRFS_I(inode)->reserved_extents)
5164                nr_extents = BTRFS_I(inode)->outstanding_extents -
5165                        BTRFS_I(inode)->reserved_extents;
5166
5167        /*
5168         * Add an item to reserve for updating the inode when we complete the
5169         * delalloc io.
5170         */
5171        if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5172                      &BTRFS_I(inode)->runtime_flags)) {
5173                nr_extents++;
5174                extra_reserve = 1;
5175        }
5176
5177        to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
5178        to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
5179        csum_bytes = BTRFS_I(inode)->csum_bytes;
5180        spin_unlock(&BTRFS_I(inode)->lock);
5181
5182        if (root->fs_info->quota_enabled) {
5183                ret = btrfs_qgroup_reserve(root, num_bytes +
5184                                           nr_extents * root->leafsize);
5185                if (ret)
5186                        goto out_fail;
5187        }
5188
5189        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
5190        if (unlikely(ret)) {
5191                if (root->fs_info->quota_enabled)
5192                        btrfs_qgroup_free(root, num_bytes +
5193                                                nr_extents * root->leafsize);
5194                goto out_fail;
5195        }
5196
5197        spin_lock(&BTRFS_I(inode)->lock);
5198        if (extra_reserve) {
5199                set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
5200                        &BTRFS_I(inode)->runtime_flags);
5201                nr_extents--;
5202        }
5203        BTRFS_I(inode)->reserved_extents += nr_extents;
5204        spin_unlock(&BTRFS_I(inode)->lock);
5205
5206        if (delalloc_lock)
5207                mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
5208
5209        if (to_reserve)
5210                trace_btrfs_space_reservation(root->fs_info, "delalloc",
5211                                              btrfs_ino(inode), to_reserve, 1);
5212        block_rsv_add_bytes(block_rsv, to_reserve, 1);
5213
5214        return 0;
5215
5216out_fail:
5217        spin_lock(&BTRFS_I(inode)->lock);
5218        dropped = drop_outstanding_extent(inode);
5219        /*
5220         * If the inode's csum_bytes is the same as the original
5221         * csum_bytes then we know we haven't raced with any free()ers
5222         * so we can just reduce our inode's csum bytes and carry on.
5223         */
5224        if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
5225                calc_csum_metadata_size(inode, num_bytes, 0);
5226        } else {
5227                u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
5228                u64 bytes;
5229
5230                /*
5231                 * This is tricky, but first we need to figure out how much we
5232                 * free'd from any free-ers that occurred during this
5233                 * reservation, so we reset ->csum_bytes to the csum_bytes
5234                 * before we dropped our lock, and then call the free for the
5235                 * number of bytes that were freed while we were trying our
5236                 * reservation.
5237                 */
5238                bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
5239                BTRFS_I(inode)->csum_bytes = csum_bytes;
5240                to_free = calc_csum_metadata_size(inode, bytes, 0);
5241
5242
5243                /*
5244                 * Now we need to see how much we would have freed had we not
5245                 * been making this reservation and our ->csum_bytes were not
5246                 * artificially inflated.
5247                 */
5248                BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
5249                bytes = csum_bytes - orig_csum_bytes;
5250                bytes = calc_csum_metadata_size(inode, bytes, 0);
5251
5252                /*
5253                 * Now reset ->csum_bytes to what it should be.  If bytes is
5254                 * more than to_free then we would have free'd more space had we
5255                 * not had an artificially high ->csum_bytes, so we need to free
5256                 * the remainder.  If bytes is the same or less then we don't
5257                 * need to do anything, the other free-ers did the correct
5258                 * thing.
5259                 */
5260                BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
5261                if (bytes > to_free)
5262                        to_free = bytes - to_free;
5263                else
5264                        to_free = 0;
5265        }
5266        spin_unlock(&BTRFS_I(inode)->lock);
5267        if (dropped)
5268                to_free += btrfs_calc_trans_metadata_size(root, dropped);
5269
5270        if (to_free) {
5271                btrfs_block_rsv_release(root, block_rsv, to_free);
5272                trace_btrfs_space_reservation(root->fs_info, "delalloc",
5273                                              btrfs_ino(inode), to_free, 0);
5274        }
5275        if (delalloc_lock)
5276                mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
5277        return ret;
5278}
5279
5280/**
5281 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
5282 * @inode: the inode to release the reservation for
5283 * @num_bytes: the number of bytes we're releasing
5284 *
5285 * This will release the metadata reservation for an inode.  This can be called
5286 * once we complete IO for a given set of bytes to release their metadata
5287 * reservations.
5288 */
5289void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
5290{
5291        struct btrfs_root *root = BTRFS_I(inode)->root;
5292        u64 to_free = 0;
5293        unsigned dropped;
5294
5295        num_bytes = ALIGN(num_bytes, root->sectorsize);
5296        spin_lock(&BTRFS_I(inode)->lock);
5297        dropped = drop_outstanding_extent(inode);
5298
5299        if (num_bytes)
5300                to_free = calc_csum_metadata_size(inode, num_bytes, 0);
5301        spin_unlock(&BTRFS_I(inode)->lock);
5302        if (dropped > 0)
5303                to_free += btrfs_calc_trans_metadata_size(root, dropped);
5304
5305        trace_btrfs_space_reservation(root->fs_info, "delalloc",
5306                                      btrfs_ino(inode), to_free, 0);
5307        if (root->fs_info->quota_enabled) {
5308                btrfs_qgroup_free(root, num_bytes +
5309                                        dropped * root->leafsize);
5310        }
5311
5312        btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
5313                                to_free);
5314}
5315
5316/**
5317 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
5318 * @inode: inode we're writing to
5319 * @num_bytes: the number of bytes we want to allocate
5320 *
5321 * This will do the following things
5322 *
5323 * o reserve space in the data space info for num_bytes
5324 * o reserve space in the metadata space info based on number of outstanding
5325 *   extents and how much csums will be needed
5326 * o add to the inode's ->delalloc_bytes
5327 * o add it to the fs_info's delalloc inodes list.
5328 *
5329 * This will return 0 for success and -ENOSPC if there is no space left.
5330 */
5331int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
5332{
5333        int ret;
5334
5335        ret = btrfs_check_data_free_space(inode, num_bytes);
5336        if (ret)
5337                return ret;
5338
5339        ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
5340        if (ret) {
5341                btrfs_free_reserved_data_space(inode, num_bytes);
5342                return ret;
5343        }
5344
5345        return 0;
5346}
5347
5348/**
5349 * btrfs_delalloc_release_space - release data and metadata space for delalloc
5350 * @inode: inode we're releasing space for
5351 * @num_bytes: the number of bytes we want to free up
5352 *
5353 * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
5354 * called in the case that we don't need the metadata AND data reservations
5355 * anymore, e.g. if there is an error or we insert an inline extent.
5356 *
5357 * This function will release the metadata space that was not used and will
5358 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
5359 * list if there are no delalloc bytes left.
5360 */
5361void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
5362{
5363        btrfs_delalloc_release_metadata(inode, num_bytes);
5364        btrfs_free_reserved_data_space(inode, num_bytes);
5365}
5366
5367static int update_block_group(struct btrfs_root *root,
5368                              u64 bytenr, u64 num_bytes, int alloc)
5369{
5370        struct btrfs_block_group_cache *cache = NULL;
5371        struct btrfs_fs_info *info = root->fs_info;
5372        u64 total = num_bytes;
5373        u64 old_val;
5374        u64 byte_in_group;
5375        int factor;
5376
5377        /* block accounting for super block */
5378        spin_lock(&info->delalloc_root_lock);
5379        old_val = btrfs_super_bytes_used(info->super_copy);
5380        if (alloc)
5381                old_val += num_bytes;
5382        else
5383                old_val -= num_bytes;
5384        btrfs_set_super_bytes_used(info->super_copy, old_val);
5385        spin_unlock(&info->delalloc_root_lock);
5386
5387        while (total) {
5388                cache = btrfs_lookup_block_group(info, bytenr);
5389                if (!cache)
5390                        return -ENOENT;
5391                if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
5392                                    BTRFS_BLOCK_GROUP_RAID1 |
5393                                    BTRFS_BLOCK_GROUP_RAID10))
5394                        factor = 2;
5395                else
5396                        factor = 1;
5397                /*
5398                 * If this block group has free space cache written out, we
5399                 * need to make sure to load it if we are removing space.  This
5400                 * is because we need the unpinning stage to actually add the
5401                 * space back to the block group, otherwise we will leak space.
5402                 */
5403                if (!alloc && cache->cached == BTRFS_CACHE_NO)
5404                        cache_block_group(cache, 1);
5405
5406                byte_in_group = bytenr - cache->key.objectid;
5407                WARN_ON(byte_in_group > cache->key.offset);
5408
5409                spin_lock(&cache->space_info->lock);
5410                spin_lock(&cache->lock);
5411
5412                if (btrfs_test_opt(root, SPACE_CACHE) &&
5413                    cache->disk_cache_state < BTRFS_DC_CLEAR)
5414                        cache->disk_cache_state = BTRFS_DC_CLEAR;
5415
5416                cache->dirty = 1;
5417                old_val = btrfs_block_group_used(&cache->item);
5418                num_bytes = min(total, cache->key.offset - byte_in_group);
5419                if (alloc) {
5420                        old_val += num_bytes;
5421                        btrfs_set_block_group_used(&cache->item, old_val);
5422                        cache->reserved -= num_bytes;
5423                        cache->space_info->bytes_reserved -= num_bytes;
5424                        cache->space_info->bytes_used += num_bytes;
5425                        cache->space_info->disk_used += num_bytes * factor;
5426                        spin_unlock(&cache->lock);
5427                        spin_unlock(&cache->space_info->lock);
5428                } else {
5429                        old_val -= num_bytes;
5430                        btrfs_set_block_group_used(&cache->item, old_val);
5431                        cache->pinned += num_bytes;
5432                        cache->space_info->bytes_pinned += num_bytes;
5433                        cache->space_info->bytes_used -= num_bytes;
5434                        cache->space_info->disk_used -= num_bytes * factor;
5435                        spin_unlock(&cache->lock);
5436                        spin_unlock(&cache->space_info->lock);
5437
5438                        set_extent_dirty(info->pinned_extents,
5439                                         bytenr, bytenr + num_bytes - 1,
5440                                         GFP_NOFS | __GFP_NOFAIL);
5441                }
5442                btrfs_put_block_group(cache);
5443                total -= num_bytes;
5444                bytenr += num_bytes;
5445        }
5446        return 0;
5447}
5448
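/*
 * Worked example: freeing 1MiB from a raid1 block group uses
 * factor = 2, so disk_used drops by 2MiB while bytes_used drops by
 * 1MiB; the range is only pinned here and does not return to the free
 * pool until the transaction commits and it is unpinned.
 */
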
5449static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
5450{
5451        struct btrfs_block_group_cache *cache;
5452        u64 bytenr;
5453
5454        spin_lock(&root->fs_info->block_group_cache_lock);
5455        bytenr = root->fs_info->first_logical_byte;
5456        spin_unlock(&root->fs_info->block_group_cache_lock);
5457
5458        if (bytenr < (u64)-1)
5459                return bytenr;
5460
5461        cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
5462        if (!cache)
5463                return 0;
5464
5465        bytenr = cache->key.objectid;
5466        btrfs_put_block_group(cache);
5467
5468        return bytenr;
5469}
5470
5471static int pin_down_extent(struct btrfs_root *root,
5472                           struct btrfs_block_group_cache *cache,
5473                           u64 bytenr, u64 num_bytes, int reserved)
5474{
5475        spin_lock(&cache->space_info->lock);
5476        spin_lock(&cache->lock);
5477        cache->pinned += num_bytes;
5478        cache->space_info->bytes_pinned += num_bytes;
5479        if (reserved) {
5480                cache->reserved -= num_bytes;
5481                cache->space_info->bytes_reserved -= num_bytes;
5482        }
5483        spin_unlock(&cache->lock);
5484        spin_unlock(&cache->space_info->lock);
5485
5486        set_extent_dirty(root->fs_info->pinned_extents, bytenr,
5487                         bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
5488        if (reserved)
5489                trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);
5490        return 0;
5491}
5492
5493/*
5494 * this function must be called within a transaction
5495 */
5496int btrfs_pin_extent(struct btrfs_root *root,
5497                     u64 bytenr, u64 num_bytes, int reserved)
5498{
5499        struct btrfs_block_group_cache *cache;
5500
5501        cache = btrfs_lookup_block_group(root->fs_info, bytenr);
5502        BUG_ON(!cache); /* Logic error */
5503
5504        pin_down_extent(root, cache, bytenr, num_bytes, reserved);
5505
5506        btrfs_put_block_group(cache);
5507        return 0;
5508}
5509
5510/*
5511 * this function must be called within a transaction
5512 */
5513int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
5514                                    u64 bytenr, u64 num_bytes)
5515{
5516        struct btrfs_block_group_cache *cache;
5517        int ret;
5518
5519        cache = btrfs_lookup_block_group(root->fs_info, bytenr);
5520        if (!cache)
5521                return -EINVAL;
5522
5523        /*
5524         * pull in the free space cache (if any) so that our pin
5525         * removes the free space from the cache.  We have load_only set
5526         * to one because the slow path that reads in the free extents
5527         * already checks the pinned extents.
5528         */
5529        cache_block_group(cache, 1);
5530
5531        pin_down_extent(root, cache, bytenr, num_bytes, 0);
5532
5533        /* remove us from the free space cache (if we're there at all) */
5534        ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
5535        btrfs_put_block_group(cache);
5536        return ret;
5537}
5538
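    /*
     * Keep a logged extent out of the allocator's hands during log replay:
     * remove it from the free space cache if caching has already passed it,
     * or mark it excluded so the caching thread skips it.
     */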
5539static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
5540{
5541        int ret;
5542        struct btrfs_block_group_cache *block_group;
5543        struct btrfs_caching_control *caching_ctl;
5544
5545        block_group = btrfs_lookup_block_group(root->fs_info, start);
5546        if (!block_group)
5547                return -EINVAL;
5548
5549        cache_block_group(block_group, 0);
5550        caching_ctl = get_caching_control(block_group);
5551
5552        if (!caching_ctl) {
5553                /* Logic error */
5554                BUG_ON(!block_group_cache_done(block_group));
5555                ret = btrfs_remove_free_space(block_group, start, num_bytes);
5556        } else {
5557                mutex_lock(&caching_ctl->mutex);
5558
5559                if (start >= caching_ctl->progress) {
5560                        ret = add_excluded_extent(root, start, num_bytes);
5561                } else if (start + num_bytes <= caching_ctl->progress) {
5562                        ret = btrfs_remove_free_space(block_group,
5563                                                      start, num_bytes);
5564                } else {
5565                        num_bytes = caching_ctl->progress - start;
5566                        ret = btrfs_remove_free_space(block_group,
5567                                                      start, num_bytes);
5568                        if (ret)
5569                                goto out_lock;
5570
5571                        num_bytes = (start + num_bytes) -
5572                                caching_ctl->progress;
5573                        start = caching_ctl->progress;
5574                        ret = add_excluded_extent(root, start, num_bytes);
5575                }
5576out_lock:
5577                mutex_unlock(&caching_ctl->mutex);
5578                put_caching_control(caching_ctl);
5579        }
5580        btrfs_put_block_group(block_group);
5581        return ret;
5582}
5583
5584int btrfs_exclude_logged_extents(struct btrfs_root *log,
5585                                 struct extent_buffer *eb)
5586{
5587        struct btrfs_file_extent_item *item;
5588        struct btrfs_key key;
5589        int found_type;
5590        int i;
5591
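            /*
             * Without mixed block groups, data and metadata never share a
             * block group, so logged file extents cannot collide with tree
             * block allocations during replay.
             */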
5592        if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
5593                return 0;
5594
5595        for (i = 0; i < btrfs_header_nritems(eb); i++) {
5596                btrfs_item_key_to_cpu(eb, &key, i);
5597                if (key.type != BTRFS_EXTENT_DATA_KEY)
5598                        continue;
5599                item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
5600                found_type = btrfs_file_extent_type(eb, item);
5601                if (found_type == BTRFS_FILE_EXTENT_INLINE)
5602                        continue;
5603                if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
5604                        continue;
5605                key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
5606                key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
5607                __exclude_logged_extent(log, key.objectid, key.offset);
5608        }
5609
5610        return 0;
5611}
5612
5613/**
5614 * btrfs_update_reserved_bytes - update the block_group and space info counters
5615 * @cache:      The cache we are manipulating
5616 * @num_bytes:  The number of bytes in question
5617 * @reserve:    One of the reservation enums
5618 * @delalloc:   Whether the blocks are allocated for a delalloc write
5619 *
5620 * This is called by the allocator when it reserves space, or by somebody who is
5621 * freeing space that was never actually used on disk.  For example, if you
5622 * reserve some space for a new leaf in transaction A and before transaction A
5623 * commits you free that leaf, you call this with reserve set to 0 in order to
5624 * clear the reservation.
5625 *
5626 * Metadata reservations should be made with RESERVE_ALLOC so we do the proper
5627 * ENOSPC accounting.  For data we handle the reservation through clearing the
5628 * delalloc bits in the io_tree.  We have to do this since we could end up
5629 * allocating less disk space for the amount of data we have reserved in the
5630 * case of compression.
5631 *
5632 * If this is a reservation and the block group has become read only, we
5633 * cannot make the reservation and return -EAGAIN; otherwise this function
5634 * always succeeds.
5635 */
5636static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
5637                                       u64 num_bytes, int reserve, int delalloc)
5638{
5639        struct btrfs_space_info *space_info = cache->space_info;
5640        int ret = 0;
5641
5642        spin_lock(&space_info->lock);
5643        spin_lock(&cache->lock);
5644        if (reserve != RESERVE_FREE) {
5645                if (cache->ro) {
5646                        ret = -EAGAIN;
5647                } else {
5648                        cache->reserved += num_bytes;
5649                        space_info->bytes_reserved += num_bytes;
5650                        if (reserve == RESERVE_ALLOC) {
5651                                trace_btrfs_space_reservation(cache->fs_info,
5652                                                "space_info", space_info->flags,
5653                                                num_bytes, 0);
5654                                space_info->bytes_may_use -= num_bytes;
5655                        }
5656
5657                        if (delalloc)
5658                                cache->delalloc_bytes += num_bytes;
5659                }
5660        } else {
5661                if (cache->ro)
5662                        space_info->bytes_readonly += num_bytes;
5663                cache->reserved -= num_bytes;
5664                space_info->bytes_reserved -= num_bytes;
5665
5666                if (delalloc)
5667                        cache->delalloc_bytes -= num_bytes;
5668        }
5669        spin_unlock(&cache->lock);
5670        spin_unlock(&space_info->lock);
5671        return ret;
5672}
5673
5674void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5675                                struct btrfs_root *root)
5676{
5677        struct btrfs_fs_info *fs_info = root->fs_info;
5678        struct btrfs_caching_control *next;
5679        struct btrfs_caching_control *caching_ctl;
5680        struct btrfs_block_group_cache *cache;
5681
5682        down_write(&fs_info->commit_root_sem);
5683
5684        list_for_each_entry_safe(caching_ctl, next,
5685                                 &fs_info->caching_block_groups, list) {
5686                cache = caching_ctl->block_group;
5687                if (block_group_cache_done(cache)) {
5688                        cache->last_byte_to_unpin = (u64)-1;
5689                        list_del_init(&caching_ctl->list);
5690                        put_caching_control(caching_ctl);
5691                } else {
5692                        cache->last_byte_to_unpin = caching_ctl->progress;
5693                }
5694        }
5695
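            /*
             * Flip the active pinned_extents tree: frees recorded from now
             * on land in the other tree, while the one we retire here is
             * drained by btrfs_finish_extent_commit.
             */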
5696        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5697                fs_info->pinned_extents = &fs_info->freed_extents[1];
5698        else
5699                fs_info->pinned_extents = &fs_info->freed_extents[0];
5700
5701        up_write(&fs_info->commit_root_sem);
5702
5703        update_global_block_rsv(fs_info);
5704}
5705
5706static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
5707{
5708        struct btrfs_fs_info *fs_info = root->fs_info;
5709        struct btrfs_block_group_cache *cache = NULL;
5710        struct btrfs_space_info *space_info;
5711        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5712        u64 len;
5713        bool readonly;
5714
5715        while (start <= end) {
5716                readonly = false;
5717                if (!cache ||
5718                    start >= cache->key.objectid + cache->key.offset) {
5719                        if (cache)
5720                                btrfs_put_block_group(cache);
5721                        cache = btrfs_lookup_block_group(fs_info, start);
5722                        BUG_ON(!cache); /* Logic error */
5723                }
5724
5725                len = cache->key.objectid + cache->key.offset - start;
5726                len = min(len, end + 1 - start);
5727
5728                if (start < cache->last_byte_to_unpin) {
5729                        len = min(len, cache->last_byte_to_unpin - start);
5730                        btrfs_add_free_space(cache, start, len);
5731                }
5732
5733                start += len;
5734                space_info = cache->space_info;
5735
5736                spin_lock(&space_info->lock);
5737                spin_lock(&cache->lock);
5738                cache->pinned -= len;
5739                space_info->bytes_pinned -= len;
5740                percpu_counter_add(&space_info->total_bytes_pinned, -len);
5741                if (cache->ro) {
5742                        space_info->bytes_readonly += len;
5743                        readonly = true;
5744                }
5745                spin_unlock(&cache->lock);
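                    /*
                     * Space we just unpinned refills the global block
                     * reserve first; bytes routed there are counted as
                     * may_use again.
                     */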
5746                if (!readonly && global_rsv->space_info == space_info) {
5747                        spin_lock(&global_rsv->lock);
5748                        if (!global_rsv->full) {
5749                                len = min(len, global_rsv->size -
5750                                          global_rsv->reserved);
5751                                global_rsv->reserved += len;
5752                                space_info->bytes_may_use += len;
5753                                if (global_rsv->reserved >= global_rsv->size)
5754                                        global_rsv->full = 1;
5755                        }
5756                        spin_unlock(&global_rsv->lock);
5757                }
5758                spin_unlock(&space_info->lock);
5759        }
5760
5761        if (cache)
5762                btrfs_put_block_group(cache);
5763        return 0;
5764}
5765
5766int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5767                               struct btrfs_root *root)
5768{
5769        struct btrfs_fs_info *fs_info = root->fs_info;
5770        struct extent_io_tree *unpin;
5771        u64 start;
5772        u64 end;
5773        int ret;
5774
5775        if (trans->aborted)
5776                return 0;
5777
5778        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5779                unpin = &fs_info->freed_extents[1];
5780        else
5781                unpin = &fs_info->freed_extents[0];
5782
5783        while (1) {
5784                ret = find_first_extent_bit(unpin, 0, &start, &end,
5785                                            EXTENT_DIRTY, NULL);
5786                if (ret)
5787                        break;
5788
5789                if (btrfs_test_opt(root, DISCARD))
5790                        ret = btrfs_discard_extent(root, start,
5791                                                   end + 1 - start, NULL);
5792
5793                clear_extent_dirty(unpin, start, end, GFP_NOFS);
5794                unpin_extent_range(root, start, end);
5795                cond_resched();
5796        }
5797
5798        return 0;
5799}
5800
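    /*
     * Adjust the per-space_info count of pinned bytes.  Owners below
     * BTRFS_FIRST_FREE_OBJECTID are tree blocks: system for the chunk tree,
     * metadata for everything else; anything above is file data.
     */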
5801static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
5802                             u64 owner, u64 root_objectid)
5803{
5804        struct btrfs_space_info *space_info;
5805        u64 flags;
5806
5807        if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5808                if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
5809                        flags = BTRFS_BLOCK_GROUP_SYSTEM;
5810                else
5811                        flags = BTRFS_BLOCK_GROUP_METADATA;
5812        } else {
5813                flags = BTRFS_BLOCK_GROUP_DATA;
5814        }
5815
5816        space_info = __find_space_info(fs_info, flags);
5817        BUG_ON(!space_info); /* Logic bug */
5818        percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
5819}
5820
5821
5822static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5823                                struct btrfs_root *root,
5824                                u64 bytenr, u64 num_bytes, u64 parent,
5825                                u64 root_objectid, u64 owner_objectid,
5826                                u64 owner_offset, int refs_to_drop,
5827                                struct btrfs_delayed_extent_op *extent_op,
5828                                int no_quota)
5829{
5830        struct btrfs_key key;
5831        struct btrfs_path *path;
5832        struct btrfs_fs_info *info = root->fs_info;
5833        struct btrfs_root *extent_root = info->extent_root;
5834        struct extent_buffer *leaf;
5835        struct btrfs_extent_item *ei;
5836        struct btrfs_extent_inline_ref *iref;
5837        int ret;
5838        int is_data;
5839        int extent_slot = 0;
5840        int found_extent = 0;
5841        int num_to_del = 1;
5842        u32 item_size;
5843        u64 refs;
5844        int last_ref = 0;
5845        enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;
5846        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
5847                                                 SKINNY_METADATA);
5848
5849        if (!info->quota_enabled || !is_fstree(root_objectid))
5850                no_quota = 1;
5851
5852        path = btrfs_alloc_path();
5853        if (!path)
5854                return -ENOMEM;
5855
5856        path->reada = 1;
5857        path->leave_spinning = 1;
5858
5859        is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
5860        BUG_ON(!is_data && refs_to_drop != 1);
5861
5862        if (is_data)
5863                skinny_metadata = 0;
5864
5865        ret = lookup_extent_backref(trans, extent_root, path, &iref,
5866                                    bytenr, num_bytes, parent,
5867                                    root_objectid, owner_objectid,
5868                                    owner_offset);
5869        if (ret == 0) {
5870                extent_slot = path->slots[0];
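                    /*
                     * Scan back a few slots for the extent item matching
                     * the backref we just found; if it is not nearby, fall
                     * back to the full search below.
                     */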
5871                while (extent_slot >= 0) {
5872                        btrfs_item_key_to_cpu(path->nodes[0], &key,
5873                                              extent_slot);
5874                        if (key.objectid != bytenr)
5875                                break;
5876                        if (key.type == BTRFS_EXTENT_ITEM_KEY &&
5877                            key.offset == num_bytes) {
5878                                found_extent = 1;
5879                                break;
5880                        }
5881                        if (key.type == BTRFS_METADATA_ITEM_KEY &&
5882                            key.offset == owner_objectid) {
5883                                found_extent = 1;
5884                                break;
5885                        }
5886                        if (path->slots[0] - extent_slot > 5)
5887                                break;
5888                        extent_slot--;
5889                }
5890#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5891                item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
5892                if (found_extent && item_size < sizeof(*ei))
5893                        found_extent = 0;
5894#endif
5895                if (!found_extent) {
5896                        BUG_ON(iref);
5897                        ret = remove_extent_backref(trans, extent_root, path,
5898                                                    NULL, refs_to_drop,
5899                                                    is_data, &last_ref);
5900                        if (ret) {
5901                                btrfs_abort_transaction(trans, extent_root, ret);
5902                                goto out;
5903                        }
5904                        btrfs_release_path(path);
5905                        path->leave_spinning = 1;
5906
5907                        key.objectid = bytenr;
5908                        key.type = BTRFS_EXTENT_ITEM_KEY;
5909                        key.offset = num_bytes;
5910
5911                        if (!is_data && skinny_metadata) {
5912                                key.type = BTRFS_METADATA_ITEM_KEY;
5913                                key.offset = owner_objectid;
5914                        }
5915
5916                        ret = btrfs_search_slot(trans, extent_root,
5917                                                &key, path, -1, 1);
5918                        if (ret > 0 && skinny_metadata && path->slots[0]) {
5919                                /*
5920                                 * Couldn't find our skinny metadata item,
5921                                 * see if we have ye olde extent item.
5922                                 */
5923                                path->slots[0]--;
5924                                btrfs_item_key_to_cpu(path->nodes[0], &key,
5925                                                      path->slots[0]);
5926                                if (key.objectid == bytenr &&
5927                                    key.type == BTRFS_EXTENT_ITEM_KEY &&
5928                                    key.offset == num_bytes)
5929                                        ret = 0;
5930                        }
5931
5932                        if (ret > 0 && skinny_metadata) {
5933                                skinny_metadata = false;
5934                                key.objectid = bytenr;
5935                                key.type = BTRFS_EXTENT_ITEM_KEY;
5936                                key.offset = num_bytes;
5937                                btrfs_release_path(path);
5938                                ret = btrfs_search_slot(trans, extent_root,
5939                                                        &key, path, -1, 1);
5940                        }
5941
5942                        if (ret) {
5943                                btrfs_err(info, "umm, got %d back from search, was looking for %llu",
5944                                        ret, bytenr);
5945                                if (ret > 0)
5946                                        btrfs_print_leaf(extent_root,
5947                                                         path->nodes[0]);
5948                        }
5949                        if (ret < 0) {
5950                                btrfs_abort_transaction(trans, extent_root, ret);
5951                                goto out;
5952                        }
5953                        extent_slot = path->slots[0];
5954                }
5955        } else if (WARN_ON(ret == -ENOENT)) {
5956                btrfs_print_leaf(extent_root, path->nodes[0]);
5957                btrfs_err(info,
5958                        "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
5959                        bytenr, parent, root_objectid, owner_objectid,
5960                        owner_offset);
5961                btrfs_abort_transaction(trans, extent_root, ret);
5962                goto out;
5963        } else {
5964                btrfs_abort_transaction(trans, extent_root, ret);
5965                goto out;
5966        }
5967
5968        leaf = path->nodes[0];
5969        item_size = btrfs_item_size_nr(leaf, extent_slot);
5970#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5971        if (item_size < sizeof(*ei)) {
5972                BUG_ON(found_extent || extent_slot != path->slots[0]);
5973                ret = convert_extent_item_v0(trans, extent_root, path,
5974                                             owner_objectid, 0);
5975                if (ret < 0) {
5976                        btrfs_abort_transaction(trans, extent_root, ret);
5977                        goto out;
5978                }
5979
5980                btrfs_release_path(path);
5981                path->leave_spinning = 1;
5982
5983                key.objectid = bytenr;
5984                key.type = BTRFS_EXTENT_ITEM_KEY;
5985                key.offset = num_bytes;
5986
5987                ret = btrfs_search_slot(trans, extent_root, &key, path,
5988                                        -1, 1);
5989                if (ret) {
5990                        btrfs_err(info, "umm, got %d back from search, was looking for %llu",
5991                                ret, bytenr);
5992                        btrfs_print_leaf(extent_root, path->nodes[0]);
5993                }
5994                if (ret < 0) {
5995                        btrfs_abort_transaction(trans, extent_root, ret);
5996                        goto out;
5997                }
5998
5999                extent_slot = path->slots[0];
6000                leaf = path->nodes[0];
6001                item_size = btrfs_item_size_nr(leaf, extent_slot);
6002        }
6003#endif
6004        BUG_ON(item_size < sizeof(*ei));
6005        ei = btrfs_item_ptr(leaf, extent_slot,
6006                            struct btrfs_extent_item);
6007        if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
6008            key.type == BTRFS_EXTENT_ITEM_KEY) {
6009                struct btrfs_tree_block_info *bi;
6010                BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
6011                bi = (struct btrfs_tree_block_info *)(ei + 1);
6012                WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
6013        }
6014
6015        refs = btrfs_extent_refs(leaf, ei);
6016        if (refs < refs_to_drop) {
6017                btrfs_err(info, "trying to drop %d refs but we only have %Lu "
6018                          "for bytenr %Lu", refs_to_drop, refs, bytenr);
6019                ret = -EINVAL;
6020                btrfs_abort_transaction(trans, extent_root, ret);
6021                goto out;
6022        }
6023        refs -= refs_to_drop;
6024
6025        if (refs > 0) {
6026                type = BTRFS_QGROUP_OPER_SUB_SHARED;
6027                if (extent_op)
6028                        __run_delayed_extent_op(extent_op, leaf, ei);
6029                /*
6030                 * In the case of an inline back ref, the reference count
6031                 * will be updated by remove_extent_backref
6032                 */
6033                if (iref) {
6034                        BUG_ON(!found_extent);
6035                } else {
6036                        btrfs_set_extent_refs(leaf, ei, refs);
6037                        btrfs_mark_buffer_dirty(leaf);
6038                }
6039                if (found_extent) {
6040                        ret = remove_extent_backref(trans, extent_root, path,
6041                                                    iref, refs_to_drop,
6042                                                    is_data, &last_ref);
6043                        if (ret) {
6044                                btrfs_abort_transaction(trans, extent_root, ret);
6045                                goto out;
6046                        }
6047                }
6048                add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
6049                                 root_objectid);
6050        } else {
6051                if (found_extent) {
6052                        BUG_ON(is_data && refs_to_drop !=
6053                               extent_data_ref_count(root, path, iref));
6054                        if (iref) {
6055                                BUG_ON(path->slots[0] != extent_slot);
6056                        } else {
6057                                BUG_ON(path->slots[0] != extent_slot + 1);
6058                                path->slots[0] = extent_slot;
6059                                num_to_del = 2;
6060                        }
6061                }
6062
6063                last_ref = 1;
6064                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
6065                                      num_to_del);
6066                if (ret) {
6067                        btrfs_abort_transaction(trans, extent_root, ret);
6068                        goto out;
6069                }
6070                btrfs_release_path(path);
6071
6072                if (is_data) {
6073                        ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
6074                        if (ret) {
6075                                btrfs_abort_transaction(trans, extent_root, ret);
6076                                goto out;
6077                        }
6078                }
6079
6080                ret = update_block_group(root, bytenr, num_bytes, 0);
6081                if (ret) {
6082                        btrfs_abort_transaction(trans, extent_root, ret);
6083                        goto out;
6084                }
6085        }
6086        btrfs_release_path(path);
6087
6088        /* Deal with the quota accounting */
6089        if (!ret && last_ref && !no_quota) {
6090                int mod_seq = 0;
6091
6092                if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
6093                    type == BTRFS_QGROUP_OPER_SUB_SHARED)
6094                        mod_seq = 1;
6095
6096                ret = btrfs_qgroup_record_ref(trans, info, root_objectid,
6097                                              bytenr, num_bytes, type,
6098                                              mod_seq);
6099        }
6100out:
6101        btrfs_free_path(path);
6102        return ret;
6103}
6104
6105/*
6106 * when we free a block, it is possible (and likely) that we free the last
6107 * delayed ref for that extent as well.  This searches the delayed ref tree
6108 * for a given extent, and if there are no other delayed refs to be
6109 * processed, removes the ref head from the tree.
6110 */
6111static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
6112                                      struct btrfs_root *root, u64 bytenr)
6113{
6114        struct btrfs_delayed_ref_head *head;
6115        struct btrfs_delayed_ref_root *delayed_refs;
6116        int ret = 0;
6117
6118        delayed_refs = &trans->transaction->delayed_refs;
6119        spin_lock(&delayed_refs->lock);
6120        head = btrfs_find_delayed_ref_head(trans, bytenr);
6121        if (!head)
6122                goto out_delayed_unlock;
6123
6124        spin_lock(&head->lock);
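            /* there are still queued delayed refs on this head, leave it alone */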
6125        if (rb_first(&head->ref_root))
6126                goto out;
6127
6128        if (head->extent_op) {
6129                if (!head->must_insert_reserved)
6130                        goto out;
6131                btrfs_free_delayed_extent_op(head->extent_op);
6132                head->extent_op = NULL;
6133        }
6134
6135        /*
6136         * waiting for the lock here would deadlock.  If someone else has it
6137         * locked, they are already in the process of dropping it anyway.
6138         */
6139        if (!mutex_trylock(&head->mutex))
6140                goto out;
6141
6142        /*
6143         * at this point we have a head with no other entries.  Go
6144         * ahead and process it.
6145         */
6146        head->node.in_tree = 0;
6147        rb_erase(&head->href_node, &delayed_refs->href_root);
6148
6149        atomic_dec(&delayed_refs->num_entries);
6150
6151        /*
6152         * we don't take a ref on the node because we're removing it from the
6153         * tree, so we just steal the ref the tree was holding.
6154         */
6155        delayed_refs->num_heads--;
6156        if (head->processing == 0)
6157                delayed_refs->num_heads_ready--;
6158        head->processing = 0;
6159        spin_unlock(&head->lock);
6160        spin_unlock(&delayed_refs->lock);
6161
6162        BUG_ON(head->extent_op);
6163        if (head->must_insert_reserved)
6164                ret = 1;
6165
6166        mutex_unlock(&head->mutex);
6167        btrfs_put_delayed_ref(&head->node);
6168        return ret;
6169out:
6170        spin_unlock(&head->lock);
6171
6172out_delayed_unlock:
6173        spin_unlock(&delayed_refs->lock);
6174        return 0;
6175}
6176
6177void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
6178                           struct btrfs_root *root,
6179                           struct extent_buffer *buf,
6180                           u64 parent, int last_ref)
6181{
6182        struct btrfs_block_group_cache *cache = NULL;
6183        int pin = 1;
6184        int ret;
6185
6186        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6187                ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6188                                        buf->start, buf->len,
6189                                        parent, root->root_key.objectid,
6190                                        btrfs_header_level(buf),
6191                                        BTRFS_DROP_DELAYED_REF, NULL, 0);
6192                BUG_ON(ret); /* -ENOMEM */
6193        }
6194
6195        if (!last_ref)
6196                return;
6197
6198        cache = btrfs_lookup_block_group(root->fs_info, buf->start);
6199
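            /*
             * A block born in this transaction that was never written out
             * can have its space reused immediately; anything else must
             * stay pinned until the transaction commits.
             */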
6200        if (btrfs_header_generation(buf) == trans->transid) {
6201                if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6202                        ret = check_ref_cleanup(trans, root, buf->start);
6203                        if (!ret)
6204                                goto out;
6205                }
6206
6207                if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
6208                        pin_down_extent(root, cache, buf->start, buf->len, 1);
6209                        goto out;
6210                }
6211
6212                WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
6213
6214                btrfs_add_free_space(cache, buf->start, buf->len);
6215                btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
6216                trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
6217                pin = 0;
6218        }
6219out:
6220        if (pin)
6221                add_pinned_bytes(root->fs_info, buf->len,
6222                                 btrfs_header_level(buf),
6223                                 root->root_key.objectid);
6224
6225        /*
6226         * We're deleting the buffer, so clear the corrupt flag since it
6227         * no longer matters.
6228         */
6229        clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
6230        btrfs_put_block_group(cache);
6231}
6232
6233/* Can return -ENOMEM */
6234int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6235                      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
6236                      u64 owner, u64 offset, int no_quota)
6237{
6238        int ret;
6239        struct btrfs_fs_info *fs_info = root->fs_info;
6240
6241#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
6242        if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
6243                return 0;
6244#endif
6245        add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
6246
6247        /*
6248         * tree log blocks never actually go into the extent allocation
6249         * tree, just update pinning info and exit early.
6250         */
6251        if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
6252                WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
6253                /* unlocks the pinned mutex */
6254                btrfs_pin_extent(root, bytenr, num_bytes, 1);
6255                ret = 0;
6256        } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
6257                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
6258                                        num_bytes,
6259                                        parent, root_objectid, (int)owner,
6260                                        BTRFS_DROP_DELAYED_REF, NULL, no_quota);
6261        } else {
6262                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
6263                                                num_bytes,
6264                                                parent, root_objectid, owner,
6265                                                offset, BTRFS_DROP_DELAYED_REF,
6266                                                NULL, no_quota);
6267        }
6268        return ret;
6269}
6270
6271static u64 stripe_align(struct btrfs_root *root,
6272                        struct btrfs_block_group_cache *cache,
6273                        u64 val, u64 num_bytes)
6274{
6275        u64 ret = ALIGN(val, root->stripesize);
6276        return ret;
6277}
6278
6279/*
6280 * when we wait for progress in the block group caching, it's because
6281 * our allocation attempt failed at least once.  So, we must sleep
6282 * and let some progress happen before we try again.
6283 *
6284 * This function will sleep at least once waiting for new free space to
6285 * show up, and then it will check the block group free space numbers
6286 * for our min num_bytes.  Another option is to have it go ahead
6287 * and look in the rbtree for a free extent of a given size, but this
6288 * is a good start.
6289 *
6290 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
6291 * any of the information in this block group.
6292 */
6293static noinline void
6294wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
6295                                u64 num_bytes)
6296{
6297        struct btrfs_caching_control *caching_ctl;
6298
6299        caching_ctl = get_caching_control(cache);
6300        if (!caching_ctl)
6301                return;
6302
6303        wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
6304                   (cache->free_space_ctl->free_space >= num_bytes));
6305
6306        put_caching_control(caching_ctl);
6307}
6308
6309static noinline int
6310wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
6311{
6312        struct btrfs_caching_control *caching_ctl;
6313        int ret = 0;
6314
6315        caching_ctl = get_caching_control(cache);
6316        if (!caching_ctl)
6317                return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
6318
6319        wait_event(caching_ctl->wait, block_group_cache_done(cache));
6320        if (cache->cached == BTRFS_CACHE_ERROR)
6321                ret = -EIO;
6322        put_caching_control(caching_ctl);
6323        return ret;
6324}
6325
6326int __get_raid_index(u64 flags)
6327{
6328        if (flags & BTRFS_BLOCK_GROUP_RAID10)
6329                return BTRFS_RAID_RAID10;
6330        else if (flags & BTRFS_BLOCK_GROUP_RAID1)
6331                return BTRFS_RAID_RAID1;
6332        else if (flags & BTRFS_BLOCK_GROUP_DUP)
6333                return BTRFS_RAID_DUP;
6334        else if (flags & BTRFS_BLOCK_GROUP_RAID0)
6335                return BTRFS_RAID_RAID0;
6336        else if (flags & BTRFS_BLOCK_GROUP_RAID5)
6337                return BTRFS_RAID_RAID5;
6338        else if (flags & BTRFS_BLOCK_GROUP_RAID6)
6339                return BTRFS_RAID_RAID6;
6340
6341        return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
6342}
6343
6344int get_block_group_index(struct btrfs_block_group_cache *cache)
6345{
6346        return __get_raid_index(cache->flags);
6347}
6348
6349static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
6350        [BTRFS_RAID_RAID10]     = "raid10",
6351        [BTRFS_RAID_RAID1]      = "raid1",
6352        [BTRFS_RAID_DUP]        = "dup",
6353        [BTRFS_RAID_RAID0]      = "raid0",
6354        [BTRFS_RAID_SINGLE]     = "single",
6355        [BTRFS_RAID_RAID5]      = "raid5",
6356        [BTRFS_RAID_RAID6]      = "raid6",
6357};
6358
6359static const char *get_raid_name(enum btrfs_raid_types type)
6360{
6361        if (type >= BTRFS_NR_RAID_TYPES)
6362                return NULL;
6363
6364        return btrfs_raid_type_names[type];
6365}
6366
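    /*
     * Stages of the find_free_extent() loop, tried in order with
     * progressively more expensive fallbacks.
     */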
6367enum btrfs_loop_type {
6368        LOOP_CACHING_NOWAIT = 0,
6369        LOOP_CACHING_WAIT = 1,
6370        LOOP_ALLOC_CHUNK = 2,
6371        LOOP_NO_EMPTY_SIZE = 3,
6372};
6373
6374static inline void
6375btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
6376                       int delalloc)
6377{
6378        if (delalloc)
6379                down_read(&cache->data_rwsem);
6380}
6381
6382static inline void
6383btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
6384                       int delalloc)
6385{
6386        btrfs_get_block_group(cache);
6387        if (delalloc)
6388                down_read(&cache->data_rwsem);
6389}
6390
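    /*
     * Grab the block group backing a cluster, returning with the cluster's
     * refill_lock held.  We may not block on the group's data_rwsem while
     * holding the refill_lock, so if the trylock fails we drop the
     * spinlock, take the rwsem, and retry, rechecking that the cluster
     * still points at the same group.
     */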
6391static struct btrfs_block_group_cache *
6392btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
6393                   struct btrfs_free_cluster *cluster,
6394                   int delalloc)
6395{
6396        struct btrfs_block_group_cache *used_bg;
6397        bool locked = false;
6398again:
6399        spin_lock(&cluster->refill_lock);
6400        if (locked) {
6401                if (used_bg == cluster->block_group)
6402                        return used_bg;
6403
6404                up_read(&used_bg->data_rwsem);
6405                btrfs_put_block_group(used_bg);
6406        }
6407
6408        used_bg = cluster->block_group;
6409        if (!used_bg)
6410                return NULL;
6411
6412        if (used_bg == block_group)
6413                return used_bg;
6414
6415        btrfs_get_block_group(used_bg);
6416
6417        if (!delalloc)
6418                return used_bg;
6419
6420        if (down_read_trylock(&used_bg->data_rwsem))
6421                return used_bg;
6422
6423        spin_unlock(&cluster->refill_lock);
6424        down_read(&used_bg->data_rwsem);
6425        locked = true;
6426        goto again;
6427}
6428
6429static inline void
6430btrfs_release_block_group(struct btrfs_block_group_cache *cache,
6431                         int delalloc)
6432{
6433        if (delalloc)
6434                up_read(&cache->data_rwsem);
6435        btrfs_put_block_group(cache);
6436}
6437
6438/*
6439 * walks the btree of allocated extents and finds a hole of a given size.
6440 * The key ins is changed to record the hole:
6441 * ins->objectid == start position
6442 * ins->flags = BTRFS_EXTENT_ITEM_KEY
6443 * ins->offset == the size of the hole.
6444 * Any available blocks before search_start are skipped.
6445 *
6446 * If there is no suitable free space, we record the size of the largest
6447 * free space extent we found instead.
6448 */
6449static noinline int find_free_extent(struct btrfs_root *orig_root,
6450                                     u64 num_bytes, u64 empty_size,
6451                                     u64 hint_byte, struct btrfs_key *ins,
6452                                     u64 flags, int delalloc)
6453{
6454        int ret = 0;
6455        struct btrfs_root *root = orig_root->fs_info->extent_root;
6456        struct btrfs_free_cluster *last_ptr = NULL;
6457        struct btrfs_block_group_cache *block_group = NULL;
6458        u64 search_start = 0;
6459        u64 max_extent_size = 0;
6460        int empty_cluster = 2 * 1024 * 1024;
6461        struct btrfs_space_info *space_info;
6462        int loop = 0;
6463        int index = __get_raid_index(flags);
6464        int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
6465                RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
6466        bool failed_cluster_refill = false;
6467        bool failed_alloc = false;
6468        bool use_cluster = true;
6469        bool have_caching_bg = false;
6470
6471        WARN_ON(num_bytes < root->sectorsize);
6472        btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
6473        ins->objectid = 0;
6474        ins->offset = 0;
6475
6476        trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
6477
6478        space_info = __find_space_info(root->fs_info, flags);
6479        if (!space_info) {
6480                btrfs_err(root->fs_info, "No space info for %llu", flags);
6481                return -ENOSPC;
6482        }
6483
6484        /*
6485         * If the space info is for both data and metadata it means we have a
6486         * small filesystem and we can't use the clustering stuff.
6487         */
6488        if (btrfs_mixed_space_info(space_info))
6489                use_cluster = false;
6490
6491        if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
6492                last_ptr = &root->fs_info->meta_alloc_cluster;
6493                if (!btrfs_test_opt(root, SSD))
6494                        empty_cluster = 64 * 1024;
6495        }
6496
6497        if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
6498            btrfs_test_opt(root, SSD)) {
6499                last_ptr = &root->fs_info->data_alloc_cluster;
6500        }
6501
6502        if (last_ptr) {
6503                spin_lock(&last_ptr->lock);
6504                if (last_ptr->block_group)
6505                        hint_byte = last_ptr->window_start;
6506                spin_unlock(&last_ptr->lock);
6507        }
6508
6509        search_start = max(search_start, first_logical_byte(root, 0));
6510        search_start = max(search_start, hint_byte);
6511
6512        if (!last_ptr)
6513                empty_cluster = 0;
6514
6515        if (search_start == hint_byte) {
6516                block_group = btrfs_lookup_block_group(root->fs_info,
6517                                                       search_start);
6518                /*
6519                 * we don't want to use the block group if it doesn't match our
6520                 * allocation bits, or if it's not cached.
6521                 *
6522                 * However if we are re-searching with an ideal block group
6523                 * picked out then we don't care that the block group is cached.
6524                 */
6525                if (block_group && block_group_bits(block_group, flags) &&
6526                    block_group->cached != BTRFS_CACHE_NO) {
6527                        down_read(&space_info->groups_sem);
6528                        if (list_empty(&block_group->list) ||
6529                            block_group->ro) {
6530                                /*
6531                                 * someone is removing this block group,
6532                                 * we can't jump into the have_block_group
6533                                 * target because our list pointers are not
6534                                 * valid
6535                                 */
6536                                btrfs_put_block_group(block_group);
6537                                up_read(&space_info->groups_sem);
6538                        } else {
6539                                index = get_block_group_index(block_group);
6540                                btrfs_lock_block_group(block_group, delalloc);
6541                                goto have_block_group;
6542                        }
6543                } else if (block_group) {
6544                        btrfs_put_block_group(block_group);
6545                }
6546        }
6547search:
6548        have_caching_bg = false;
6549        down_read(&space_info->groups_sem);
6550        list_for_each_entry(block_group, &space_info->block_groups[index],
6551                            list) {
6552                u64 offset;
6553                int cached;
6554
6555                btrfs_grab_block_group(block_group, delalloc);
6556                search_start = block_group->key.objectid;
6557
6558                /*
6559                 * this can happen if we end up cycling through all the
6560                 * raid types, but we want to make sure we only allocate
6561                 * for the proper type.
6562                 */
6563                if (!block_group_bits(block_group, flags)) {
6564                        u64 extra = BTRFS_BLOCK_GROUP_DUP |
6565                                    BTRFS_BLOCK_GROUP_RAID1 |
6566                                    BTRFS_BLOCK_GROUP_RAID5 |
6567                                    BTRFS_BLOCK_GROUP_RAID6 |
6568                                    BTRFS_BLOCK_GROUP_RAID10;
6569
6570                        /*
6571                         * if they asked for extra copies and this block group
6572                         * doesn't provide them, bail.  This does allow us to
6573                         * fill raid0 from raid1.
6574                         */
6575                        if ((flags & extra) && !(block_group->flags & extra))
6576                                goto loop;
6577                }
6578
6579have_block_group:
6580                cached = block_group_cache_done(block_group);
6581                if (unlikely(!cached)) {
6582                        ret = cache_block_group(block_group, 0);
6583                        BUG_ON(ret < 0);
6584                        ret = 0;
6585                }
6586
6587                if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
6588                        goto loop;
6589                if (unlikely(block_group->ro))
6590                        goto loop;
6591
6592                /*
6593                 * OK, we want to try to use the cluster allocator, so
6594                 * let's look there
6595                 */
6596                if (last_ptr) {
6597                        struct btrfs_block_group_cache *used_block_group;
6598                        unsigned long aligned_cluster;
6599                        /*
6600                         * the refill lock keeps out other
6601                         * people trying to start a new cluster
6602                         */
6603                        used_block_group = btrfs_lock_cluster(block_group,
6604                                                              last_ptr,
6605                                                              delalloc);
6606                        if (!used_block_group)
6607                                goto refill_cluster;
6608
6609                        if (used_block_group != block_group &&
6610                            (used_block_group->ro ||
6611                             !block_group_bits(used_block_group, flags)))
6612                                goto release_cluster;
6613
6614                        offset = btrfs_alloc_from_cluster(used_block_group,
6615                                                last_ptr,
6616                                                num_bytes,
6617                                                used_block_group->key.objectid,
6618                                                &max_extent_size);
6619                        if (offset) {
6620                                /* we have a block, we're done */
6621                                spin_unlock(&last_ptr->refill_lock);
6622                                trace_btrfs_reserve_extent_cluster(root,
6623                                                used_block_group,
6624                                                search_start, num_bytes);
6625                                if (used_block_group != block_group) {
6626                                        btrfs_release_block_group(block_group,
6627                                                                  delalloc);
6628                                        block_group = used_block_group;
6629                                }
6630                                goto checks;
6631                        }
6632
6633                        WARN_ON(last_ptr->block_group != used_block_group);
6634release_cluster:
6635                        /* If we are on LOOP_NO_EMPTY_SIZE, we can't
6636                         * set up a new cluster, so let's just skip it
6637                         * and let the allocator find whatever block
6638                         * it can find.  If we reach this point, we
6639                         * will have tried the cluster allocator
6640                         * plenty of times and not have found
6641                         * anything, so we are likely way too
6642                         * fragmented for the clustering stuff to find
6643                         * anything.
6644                         *
6645                         * However, if the cluster is taken from the
6646                         * current block group, release the cluster
6647                         * first, so that we stand a better chance of
6648                         * succeeding in the unclustered
6649                         * allocation.  */
6650                        if (loop >= LOOP_NO_EMPTY_SIZE &&
6651                            used_block_group != block_group) {
6652                                spin_unlock(&last_ptr->refill_lock);
6653                                btrfs_release_block_group(used_block_group,
6654                                                          delalloc);
6655                                goto unclustered_alloc;
6656                        }
6657
6658                        /*
6659                         * this cluster didn't work out, free it and
6660                         * start over
6661                         */
6662                        btrfs_return_cluster_to_free_space(NULL, last_ptr);
6663
6664                        if (used_block_group != block_group)
6665                                btrfs_release_block_group(used_block_group,
6666                                                          delalloc);
6667refill_cluster:
6668                        if (loop >= LOOP_NO_EMPTY_SIZE) {
6669                                spin_unlock(&last_ptr->refill_lock);
6670                                goto unclustered_alloc;
6671                        }
6672
6673                        aligned_cluster = max_t(unsigned long,
6674                                                empty_cluster + empty_size,
6675                                              block_group->full_stripe_len);
6676
6677                        /* allocate a cluster in this block group */
6678                        ret = btrfs_find_space_cluster(root, block_group,
6679                                                       last_ptr, search_start,
6680                                                       num_bytes,
6681                                                       aligned_cluster);
6682                        if (ret == 0) {
6683                                /*
6684                                 * now pull our allocation out of this
6685                                 * cluster
6686                                 */
6687                                offset = btrfs_alloc_from_cluster(block_group,
6688                                                        last_ptr,
6689                                                        num_bytes,
6690                                                        search_start,
6691                                                        &max_extent_size);
6692                                if (offset) {
6693                                        /* we found one, proceed */
6694                                        spin_unlock(&last_ptr->refill_lock);
6695                                        trace_btrfs_reserve_extent_cluster(root,
6696                                                block_group, search_start,
6697                                                num_bytes);
6698                                        goto checks;
6699                                }
6700                        } else if (!cached && loop > LOOP_CACHING_NOWAIT
6701                                   && !failed_cluster_refill) {
6702                                spin_unlock(&last_ptr->refill_lock);
6703
6704                                failed_cluster_refill = true;
6705                                wait_block_group_cache_progress(block_group,
6706                                       num_bytes + empty_cluster + empty_size);
6707                                goto have_block_group;
6708                        }
6709
6710                        /*
6711                         * at this point we either didn't find a cluster
6712                         * or we weren't able to allocate a block from our
6713                         * cluster.  Free the cluster we've been trying
6714                         * to use, and go to the next block group
6715                         */
6716                        btrfs_return_cluster_to_free_space(NULL, last_ptr);
6717                        spin_unlock(&last_ptr->refill_lock);
6718                        goto loop;
6719                }
6720
6721unclustered_alloc:
6722                spin_lock(&block_group->free_space_ctl->tree_lock);
6723                if (cached &&
6724                    block_group->free_space_ctl->free_space <
6725                    num_bytes + empty_cluster + empty_size) {
6726                        if (block_group->free_space_ctl->free_space >
6727                            max_extent_size)
6728                                max_extent_size =
6729                                        block_group->free_space_ctl->free_space;
6730                        spin_unlock(&block_group->free_space_ctl->tree_lock);
6731                        goto loop;
6732                }
6733                spin_unlock(&block_group->free_space_ctl->tree_lock);
6734
6735                offset = btrfs_find_space_for_alloc(block_group, search_start,
6736                                                    num_bytes, empty_size,
6737                                                    &max_extent_size);
6738                /*
6739                 * If we didn't find a chunk, and we haven't failed on this
6740                 * block group before, and this block group is in the middle of
6741                 * caching and we are ok with waiting, then go ahead and wait
6742                 * for progress to be made, and set failed_alloc to true.
6743                 *
6744                 * If failed_alloc is true then we've already waited on this
6745                 * block group once and should move on to the next block group.
6746                 */
6747                if (!offset && !failed_alloc && !cached &&
6748                    loop > LOOP_CACHING_NOWAIT) {
6749                        wait_block_group_cache_progress(block_group,
6750                                                num_bytes + empty_size);
6751                        failed_alloc = true;
6752                        goto have_block_group;
6753                } else if (!offset) {
6754                        if (!cached)
6755                                have_caching_bg = true;
6756                        goto loop;
6757                }
6758checks:
6759                search_start = stripe_align(root, block_group,
6760                                            offset, num_bytes);
6761
6762                /* move on to the next group */
6763                if (search_start + num_bytes >
6764                    block_group->key.objectid + block_group->key.offset) {
6765                        btrfs_add_free_space(block_group, offset, num_bytes);
6766                        goto loop;
6767                }
6768
6769                if (offset < search_start)
6770                        btrfs_add_free_space(block_group, offset,
6771                                             search_start - offset);
6772                BUG_ON(offset > search_start);
6773
6774                ret = btrfs_update_reserved_bytes(block_group, num_bytes,
6775                                                  alloc_type, delalloc);
6776                if (ret == -EAGAIN) {
6777                        btrfs_add_free_space(block_group, offset, num_bytes);
6778                        goto loop;
6779                }
6780
6781                /* we are all good, let's return */
6782                ins->objectid = search_start;
6783                ins->offset = num_bytes;
6784
6785                trace_btrfs_reserve_extent(orig_root, block_group,
6786                                           search_start, num_bytes);
6787                btrfs_release_block_group(block_group, delalloc);
6788                break;
6789loop:
6790                failed_cluster_refill = false;
6791                failed_alloc = false;
6792                BUG_ON(index != get_block_group_index(block_group));
6793                btrfs_release_block_group(block_group, delalloc);
6794        }
6795        up_read(&space_info->groups_sem);
6796
6797        if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
6798                goto search;
6799
6800        if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
6801                goto search;
6802
6803        /*
6804         * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
6805         *                      caching kthreads as we move along
6806         * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
6807         * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
6808         * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
6809         *                      again
6810         */
6811        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
6812                index = 0;
6813                loop++;
6814                if (loop == LOOP_ALLOC_CHUNK) {
6815                        struct btrfs_trans_handle *trans;
6816                        int exist = 0;
6817
6818                        trans = current->journal_info;
6819                        if (trans)
6820                                exist = 1;
6821                        else
6822                                trans = btrfs_join_transaction(root);
6823
6824                        if (IS_ERR(trans)) {
6825                                ret = PTR_ERR(trans);
6826                                goto out;
6827                        }
6828
6829                        ret = do_chunk_alloc(trans, root, flags,
6830                                             CHUNK_ALLOC_FORCE);
6831                        /*
6832                         * Do not bail out on ENOSPC since we can still
6833                         * make progress on the LOOP_NO_EMPTY_SIZE pass.
6834                         */
6835                        if (ret < 0 && ret != -ENOSPC)
6836                                btrfs_abort_transaction(trans,
6837                                                        root, ret);
6838                        else
6839                                ret = 0;
6840                        if (!exist)
6841                                btrfs_end_transaction(trans, root);
6842                        if (ret)
6843                                goto out;
6844                }
6845
6846                if (loop == LOOP_NO_EMPTY_SIZE) {
6847                        empty_size = 0;
6848                        empty_cluster = 0;
6849                }
6850
6851                goto search;
6852        } else if (!ins->objectid) {
6853                ret = -ENOSPC;
6854        } else if (ins->objectid) {
6855                ret = 0;
6856        }
6857out:
6858        if (ret == -ENOSPC)
6859                ins->offset = max_extent_size;
6860        return ret;
6861}
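
/*
 * Editor's sketch (not from the kernel source): a minimal userspace model
 * of the loop escalation documented above.  The try_alloc() callback and
 * its success condition are hypothetical; the point is only to show how
 * each failed pass over the block groups relaxes the allocator's
 * requirements before it finally gives up with ENOSPC.
 */
#include <stdbool.h>
#include <stdio.h>

enum { MODEL_CACHING_NOWAIT, MODEL_CACHING_WAIT, MODEL_ALLOC_CHUNK,
       MODEL_NO_EMPTY_SIZE };

/* pretend allocation only succeeds once clustering demands are dropped */
static bool try_alloc(unsigned long long empty_size,
		      unsigned long long empty_cluster)
{
	return empty_size == 0 && empty_cluster == 0;
}

static int model_find_free_extent(void)
{
	unsigned long long empty_size = 4096, empty_cluster = 65536;
	int loop;

	for (loop = MODEL_CACHING_NOWAIT; loop <= MODEL_NO_EMPTY_SIZE; loop++) {
		if (loop == MODEL_ALLOC_CHUNK)
			printf("would force a chunk allocation here\n");
		if (loop == MODEL_NO_EMPTY_SIZE)
			empty_size = empty_cluster = 0;
		if (try_alloc(empty_size, empty_cluster))
			return 0;	/* found space on this pass */
	}
	return -28;			/* ENOSPC */
}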
6862
6863static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
6864                            int dump_block_groups)
6865{
6866        struct btrfs_block_group_cache *cache;
6867        int index = 0;
6868
6869        spin_lock(&info->lock);
6870        printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
6871               info->flags,
6872               info->total_bytes - info->bytes_used - info->bytes_pinned -
6873               info->bytes_reserved - info->bytes_readonly,
6874               (info->full) ? "" : "not ");
6875        printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
6876               "reserved=%llu, may_use=%llu, readonly=%llu\n",
6877               info->total_bytes, info->bytes_used, info->bytes_pinned,
6878               info->bytes_reserved, info->bytes_may_use,
6879               info->bytes_readonly);
6880        spin_unlock(&info->lock);
6881
6882        if (!dump_block_groups)
6883                return;
6884
6885        down_read(&info->groups_sem);
6886again:
6887        list_for_each_entry(cache, &info->block_groups[index], list) {
6888                spin_lock(&cache->lock);
6889                printk(KERN_INFO "BTRFS: "
6890                           "block group %llu has %llu bytes, "
6891                           "%llu used %llu pinned %llu reserved %s\n",
6892                       cache->key.objectid, cache->key.offset,
6893                       btrfs_block_group_used(&cache->item), cache->pinned,
6894                       cache->reserved, cache->ro ? "[readonly]" : "");
6895                btrfs_dump_free_space(cache, bytes);
6896                spin_unlock(&cache->lock);
6897        }
6898        if (++index < BTRFS_NR_RAID_TYPES)
6899                goto again;
6900        up_read(&info->groups_sem);
6901}
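
/*
 * Editor's note (illustrative helper, not kernel code): the "free" figure
 * printed by dump_space_info() is derived rather than stored -- every byte
 * already accounted against the space_info is subtracted from its total:
 */
static unsigned long long space_info_free(unsigned long long total,
					  unsigned long long used,
					  unsigned long long pinned,
					  unsigned long long reserved,
					  unsigned long long readonly)
{
	return total - used - pinned - reserved - readonly;
}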
6902
6903int btrfs_reserve_extent(struct btrfs_root *root,
6904                         u64 num_bytes, u64 min_alloc_size,
6905                         u64 empty_size, u64 hint_byte,
6906                         struct btrfs_key *ins, int is_data, int delalloc)
6907{
6908        bool final_tried = false;
6909        u64 flags;
6910        int ret;
6911
6912        flags = btrfs_get_alloc_profile(root, is_data);
6913again:
6914        WARN_ON(num_bytes < root->sectorsize);
6915        ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
6916                               flags, delalloc);
6917
6918        if (ret == -ENOSPC) {
6919                if (!final_tried && ins->offset) {
6920                        num_bytes = min(num_bytes >> 1, ins->offset);
6921                        num_bytes = round_down(num_bytes, root->sectorsize);
6922                        num_bytes = max(num_bytes, min_alloc_size);
6923                        if (num_bytes == min_alloc_size)
6924                                final_tried = true;
6925                        goto again;
6926                } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
6927                        struct btrfs_space_info *sinfo;
6928
6929                        sinfo = __find_space_info(root->fs_info, flags);
6930                        btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
6931                                flags, num_bytes);
6932                        if (sinfo)
6933                                dump_space_info(sinfo, num_bytes, 1);
6934                }
6935        }
6936
6937        return ret;
6938}
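
/*
 * Editor's sketch (standalone restatement, hypothetical helper name): the
 * retry arithmetic used by btrfs_reserve_extent() above.  On ENOSPC the
 * search leaves the largest free extent it saw in ins->offset, so the next
 * request is half the old size capped at that value, rounded down to the
 * sector size (assumed to be a power of two) and clamped to
 * min_alloc_size; the final attempt is made exactly at min_alloc_size.
 */
#include <stdint.h>

static uint64_t next_alloc_size(uint64_t num_bytes, uint64_t max_extent_size,
				uint64_t sectorsize, uint64_t min_alloc_size,
				int *final_tried)
{
	uint64_t n = num_bytes >> 1;

	if (n > max_extent_size)
		n = max_extent_size;
	n &= ~(sectorsize - 1);		/* round_down() to a sector boundary */
	if (n < min_alloc_size)
		n = min_alloc_size;
	if (n == min_alloc_size)
		*final_tried = 1;
	return n;
}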
6939
6940static int __btrfs_free_reserved_extent(struct btrfs_root *root,
6941                                        u64 start, u64 len,
6942                                        int pin, int delalloc)
6943{
6944        struct btrfs_block_group_cache *cache;
6945        int ret = 0;
6946
6947        cache = btrfs_lookup_block_group(root->fs_info, start);
6948        if (!cache) {
6949                btrfs_err(root->fs_info, "Unable to find block group for %llu",
6950                        start);
6951                return -ENOSPC;
6952        }
6953
6954        if (btrfs_test_opt(root, DISCARD))
6955                ret = btrfs_discard_extent(root, start, len, NULL);
6956
6957        if (pin)
6958                pin_down_extent(root, cache, start, len, 1);
6959        else {
6960                btrfs_add_free_space(cache, start, len);
6961                btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
6962        }
6963        btrfs_put_block_group(cache);
6964
6965        trace_btrfs_reserved_extent_free(root, start, len);
6966
6967        return ret;
6968}
6969
6970int btrfs_free_reserved_extent(struct btrfs_root *root,
6971                               u64 start, u64 len, int delalloc)
6972{
6973        return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
6974}
6975
6976int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
6977                                       u64 start, u64 len)
6978{
6979        return __btrfs_free_reserved_extent(root, start, len, 1, 0);
6980}
6981
6982static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6983                                      struct btrfs_root *root,
6984                                      u64 parent, u64 root_objectid,
6985                                      u64 flags, u64 owner, u64 offset,
6986                                      struct btrfs_key *ins, int ref_mod)
6987{
6988        int ret;
6989        struct btrfs_fs_info *fs_info = root->fs_info;
6990        struct btrfs_extent_item *extent_item;
6991        struct btrfs_extent_inline_ref *iref;
6992        struct btrfs_path *path;
6993        struct extent_buffer *leaf;
6994        int type;
6995        u32 size;
6996
6997        if (parent > 0)
6998                type = BTRFS_SHARED_DATA_REF_KEY;
6999        else
7000                type = BTRFS_EXTENT_DATA_REF_KEY;
7001
7002        size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
7003
7004        path = btrfs_alloc_path();
7005        if (!path)
7006                return -ENOMEM;
7007
7008        path->leave_spinning = 1;
7009        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7010                                      ins, size);
7011        if (ret) {
7012                btrfs_free_path(path);
7013                return ret;
7014        }
7015
7016        leaf = path->nodes[0];
7017        extent_item = btrfs_item_ptr(leaf, path->slots[0],
7018                                     struct btrfs_extent_item);
7019        btrfs_set_extent_refs(leaf, extent_item, ref_mod);
7020        btrfs_set_extent_generation(leaf, extent_item, trans->transid);
7021        btrfs_set_extent_flags(leaf, extent_item,
7022                               flags | BTRFS_EXTENT_FLAG_DATA);
7023
7024        iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7025        btrfs_set_extent_inline_ref_type(leaf, iref, type);
7026        if (parent > 0) {
7027                struct btrfs_shared_data_ref *ref;
7028                ref = (struct btrfs_shared_data_ref *)(iref + 1);
7029                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
7030                btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
7031        } else {
7032                struct btrfs_extent_data_ref *ref;
7033                ref = (struct btrfs_extent_data_ref *)(&iref->offset);
7034                btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
7035                btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
7036                btrfs_set_extent_data_ref_offset(leaf, ref, offset);
7037                btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
7038        }
7039
7040        btrfs_mark_buffer_dirty(path->nodes[0]);
7041        btrfs_free_path(path);
7042
7043        /* Always set parent to 0 here since it's exclusive anyway. */
7044        ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
7045                                      ins->objectid, ins->offset,
7046                                      BTRFS_QGROUP_OPER_ADD_EXCL, 0);
7047        if (ret)
7048                return ret;
7049
7050        ret = update_block_group(root, ins->objectid, ins->offset, 1);
7051        if (ret) { /* -ENOENT, logic error */
7052                btrfs_err(fs_info, "update block group failed for %llu %llu",
7053                        ins->objectid, ins->offset);
7054                BUG();
7055        }
7056        trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
7057        return ret;
7058}
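
/*
 * Editor's sketch (illustrative structs with made-up sizes, not the real
 * on-disk format): how the item size above is chosen.  A shared data ref
 * is keyed by the parent block and only carries a count, while a keyed
 * (non-shared) data ref must record root, objectid and offset as well, so
 * btrfs_extent_inline_ref_size() returns a larger value for it.
 */
#include <stddef.h>
#include <stdint.h>

struct toy_extent_item { uint64_t refs, generation, flags; };
struct toy_inline_ref  { uint8_t type; uint64_t offset; };
struct toy_shared_ref  { uint32_t count; };
struct toy_keyed_ref   { uint64_t root, objectid, offset; uint32_t count; };

static size_t toy_data_item_size(int shared)
{
	size_t size = sizeof(struct toy_extent_item) +
		      sizeof(struct toy_inline_ref);

	return size + (shared ? sizeof(struct toy_shared_ref)
			      : sizeof(struct toy_keyed_ref));
}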
7059
7060static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7061                                     struct btrfs_root *root,
7062                                     u64 parent, u64 root_objectid,
7063                                     u64 flags, struct btrfs_disk_key *key,
7064                                     int level, struct btrfs_key *ins,
7065                                     int no_quota)
7066{
7067        int ret;
7068        struct btrfs_fs_info *fs_info = root->fs_info;
7069        struct btrfs_extent_item *extent_item;
7070        struct btrfs_tree_block_info *block_info;
7071        struct btrfs_extent_inline_ref *iref;
7072        struct btrfs_path *path;
7073        struct extent_buffer *leaf;
7074        u32 size = sizeof(*extent_item) + sizeof(*iref);
7075        u64 num_bytes = ins->offset;
7076        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
7077                                                 SKINNY_METADATA);
7078
7079        if (!skinny_metadata)
7080                size += sizeof(*block_info);
7081
7082        path = btrfs_alloc_path();
7083        if (!path) {
7084                btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7085                                                   root->leafsize);
7086                return -ENOMEM;
7087        }
7088
7089        path->leave_spinning = 1;
7090        ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7091                                      ins, size);
7092        if (ret) {
7093                btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7094                                                   root->leafsize);
7095                btrfs_free_path(path);
7096                return ret;
7097        }
7098
7099        leaf = path->nodes[0];
7100        extent_item = btrfs_item_ptr(leaf, path->slots[0],
7101                                     struct btrfs_extent_item);
7102        btrfs_set_extent_refs(leaf, extent_item, 1);
7103        btrfs_set_extent_generation(leaf, extent_item, trans->transid);
7104        btrfs_set_extent_flags(leaf, extent_item,
7105                               flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
7106
7107        if (skinny_metadata) {
7108                iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7109                num_bytes = root->leafsize;
7110        } else {
7111                block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
7112                btrfs_set_tree_block_key(leaf, block_info, key);
7113                btrfs_set_tree_block_level(leaf, block_info, level);
7114                iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
7115        }
7116
7117        if (parent > 0) {
7118                BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
7119                btrfs_set_extent_inline_ref_type(leaf, iref,
7120                                                 BTRFS_SHARED_BLOCK_REF_KEY);
7121                btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
7122        } else {
7123                btrfs_set_extent_inline_ref_type(leaf, iref,
7124                                                 BTRFS_TREE_BLOCK_REF_KEY);
7125                btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
7126        }
7127
7128        btrfs_mark_buffer_dirty(leaf);
7129        btrfs_free_path(path);
7130
7131        if (!no_quota) {
7132                ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
7133                                              ins->objectid, num_bytes,
7134                                              BTRFS_QGROUP_OPER_ADD_EXCL, 0);
7135                if (ret)
7136                        return ret;
7137        }
7138
7139        ret = update_block_group(root, ins->objectid, root->leafsize, 1);
7140        if (ret) { /* -ENOENT, logic error */
7141                btrfs_err(fs_info, "update block group failed for %llu %llu",
7142                        ins->objectid, num_bytes);
7143                BUG();
7144        }
7145
7146        trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize);
7147        return ret;
7148}
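
/*
 * Editor's sketch (placeholder types, not the on-disk structures): the
 * size computation at the top of alloc_reserved_tree_block().  With the
 * SKINNY_METADATA incompat flag the block's level is carried in the item
 * key itself, so the separate btrfs_tree_block_info (key + level) is
 * omitted and every metadata extent item shrinks accordingly.
 */
#include <stddef.h>
#include <stdint.h>

struct tb_extent_item { uint64_t refs, generation, flags; };
struct tb_block_info  { uint8_t disk_key[17]; uint8_t level; };
struct tb_inline_ref  { uint8_t type; uint64_t offset; };

static size_t tb_item_size(int skinny_metadata)
{
	size_t size = sizeof(struct tb_extent_item) +
		      sizeof(struct tb_inline_ref);

	if (!skinny_metadata)
		size += sizeof(struct tb_block_info);
	return size;
}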
7149
7150int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7151                                     struct btrfs_root *root,
7152                                     u64 root_objectid, u64 owner,
7153                                     u64 offset, struct btrfs_key *ins)
7154{
7155        int ret;
7156
7157        BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
7158
7159        ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
7160                                         ins->offset, 0,
7161                                         root_objectid, owner, offset,
7162                                         BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
7163        return ret;
7164}
7165
7166/*
7167 * this is used by the tree logging recovery code.  It records that
7168 * an extent has been allocated and makes sure to clear the free
7169 * space cache bits as well
7170 */
7171int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
7172                                   struct btrfs_root *root,
7173                                   u64 root_objectid, u64 owner, u64 offset,
7174                                   struct btrfs_key *ins)
7175{
7176        int ret;
7177        struct btrfs_block_group_cache *block_group;
7178
7179        /*
7180         * Mixed block groups will exclude before processing the log so we only
7181         * need to do the exclude dance if this fs isn't mixed.
7182         */
7183        if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
7184                ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
7185                if (ret)
7186                        return ret;
7187        }
7188
7189        block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
7190        if (!block_group)
7191                return -EINVAL;
7192
7193        ret = btrfs_update_reserved_bytes(block_group, ins->offset,
7194                                          RESERVE_ALLOC_NO_ACCOUNT, 0);
7195        BUG_ON(ret); /* logic error */
7196        ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
7197                                         0, owner, offset, ins, 1);
7198        btrfs_put_block_group(block_group);
7199        return ret;
7200}
7201
7202static struct extent_buffer *
7203btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
7204                      u64 bytenr, u32 blocksize, int level)
7205{
7206        struct extent_buffer *buf;
7207
7208        buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
7209        if (!buf)
7210                return ERR_PTR(-ENOMEM);
7211        btrfs_set_header_generation(buf, trans->transid);
7212        btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
7213        btrfs_tree_lock(buf);
7214        clean_tree_block(trans, root, buf);
7215        clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
7216
7217        btrfs_set_lock_blocking(buf);
7218        btrfs_set_buffer_uptodate(buf);
7219
7220        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
7221                /*
7222                 * we allow two log transactions at a time, so use
7223                 * different EXTENT bits to differentiate dirty pages.
7224                 */
7225                if (root->log_transid % 2 == 0)
7226                        set_extent_dirty(&root->dirty_log_pages, buf->start,
7227                                        buf->start + buf->len - 1, GFP_NOFS);
7228                else
7229                        set_extent_new(&root->dirty_log_pages, buf->start,
7230                                        buf->start + buf->len - 1, GFP_NOFS);
7231        } else {
7232                set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
7233                         buf->start + buf->len - 1, GFP_NOFS);
7234        }
7235        trans->blocks_used++;
7236        /* this returns a buffer locked for blocking */
7237        return buf;
7238}
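
/*
 * Editor's sketch (hypothetical names): the parity trick used above for
 * log trees.  Two log transactions can be in flight at once, so blocks
 * dirtied by the even-numbered one are tracked with one extent bit and
 * those of the odd-numbered one with another, letting each log commit
 * write out only its own blocks.
 */
enum model_extent_bit { MODEL_EXTENT_DIRTY, MODEL_EXTENT_NEW };

static enum model_extent_bit log_tracking_bit(unsigned long long log_transid)
{
	return (log_transid % 2 == 0) ? MODEL_EXTENT_DIRTY : MODEL_EXTENT_NEW;
}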
7239
7240static struct btrfs_block_rsv *
7241use_block_rsv(struct btrfs_trans_handle *trans,
7242              struct btrfs_root *root, u32 blocksize)
7243{
7244        struct btrfs_block_rsv *block_rsv;
7245        struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
7246        int ret;
7247        bool global_updated = false;
7248
7249        block_rsv = get_block_rsv(trans, root);
7250
7251        if (unlikely(block_rsv->size == 0))
7252                goto try_reserve;
7253again:
7254        ret = block_rsv_use_bytes(block_rsv, blocksize);
7255        if (!ret)
7256                return block_rsv;
7257
7258        if (block_rsv->failfast)
7259                return ERR_PTR(ret);
7260
7261        if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
7262                global_updated = true;
7263                update_global_block_rsv(root->fs_info);
7264                goto again;
7265        }
7266
7267        if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
7268                static DEFINE_RATELIMIT_STATE(_rs,
7269                                DEFAULT_RATELIMIT_INTERVAL * 10,
7270                                /*DEFAULT_RATELIMIT_BURST*/ 1);
7271                if (__ratelimit(&_rs))
7272                        WARN(1, KERN_DEBUG
7273                                "BTRFS: block rsv returned %d\n", ret);
7274        }
7275try_reserve:
7276        ret = reserve_metadata_bytes(root, block_rsv, blocksize,
7277                                     BTRFS_RESERVE_NO_FLUSH);
7278        if (!ret)
7279                return block_rsv;
7280        /*
7281         * If we couldn't reserve metadata bytes, try to use some from
7282         * the global reserve, provided our reservation shares the global
7283         * reserve's space_info.
7284         */
7285        if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
7286            block_rsv->space_info == global_rsv->space_info) {
7287                ret = block_rsv_use_bytes(global_rsv, blocksize);
7288                if (!ret)
7289                        return global_rsv;
7290        }
7291        return ERR_PTR(ret);
7292}
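
/*
 * Editor's sketch (simplified userspace model, hypothetical struct): the
 * fallback order implemented by use_block_rsv().  The middle steps --
 * retrying once after update_global_block_rsv() and attempting a fresh
 * BTRFS_RESERVE_NO_FLUSH reservation -- are elided here; what remains is
 * the core "own reserve first, then steal from the global reserve when
 * both share a space_info" policy.
 */
#include <stdbool.h>
#include <stddef.h>

struct model_rsv { unsigned long long reserved; };

static bool model_rsv_use(struct model_rsv *rsv, unsigned long long bytes)
{
	if (rsv->reserved < bytes)
		return false;
	rsv->reserved -= bytes;
	return true;
}

static struct model_rsv *model_pick_rsv(struct model_rsv *own,
					struct model_rsv *global,
					unsigned long long blocksize,
					bool same_space_info)
{
	if (model_rsv_use(own, blocksize))
		return own;
	if (same_space_info && model_rsv_use(global, blocksize))
		return global;
	return NULL;		/* the real code returns ERR_PTR(ret) */
}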
7293
7294static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
7295                            struct btrfs_block_rsv *block_rsv, u32 blocksize)
7296{
7297        block_rsv_add_bytes(block_rsv, blocksize, 0);
7298        block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
7299}
7300
7301/*
7302 * finds a free extent and does all the dirty work required for
7303 * allocation.  The key for the extent is returned through ins, and the
7304 * tree buffer for the first block of the extent is the return value.
7305 *
7306 * returns a locked tree buffer, or an ERR_PTR on failure.
7307 */
7308struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
7309                                        struct btrfs_root *root, u32 blocksize,
7310                                        u64 parent, u64 root_objectid,
7311                                        struct btrfs_disk_key *key, int level,
7312                                        u64 hint, u64 empty_size)
7313{
7314        struct btrfs_key ins;
7315        struct btrfs_block_rsv *block_rsv;
7316        struct extent_buffer *buf;
7317        u64 flags = 0;
7318        int ret;
7319        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
7320                                                 SKINNY_METADATA);
7321
7322#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
7323        if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) {
7324                buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
7325                                            blocksize, level);
7326                if (!IS_ERR(buf))
7327                        root->alloc_bytenr += blocksize;
7328                return buf;
7329        }
7330#endif
7331        block_rsv = use_block_rsv(trans, root, blocksize);
7332        if (IS_ERR(block_rsv))
7333                return ERR_CAST(block_rsv);
7334
7335        ret = btrfs_reserve_extent(root, blocksize, blocksize,
7336                                   empty_size, hint, &ins, 0, 0);
7337        if (ret) {
7338                unuse_block_rsv(root->fs_info, block_rsv, blocksize);
7339                return ERR_PTR(ret);
7340        }
7341
7342        buf = btrfs_init_new_buffer(trans, root, ins.objectid,
7343                                    blocksize, level);
7344        BUG_ON(IS_ERR(buf)); /* -ENOMEM */
7345
7346        if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
7347                if (parent == 0)
7348                        parent = ins.objectid;
7349                flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
7350        } else
7351                BUG_ON(parent > 0);
7352
7353        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
7354                struct btrfs_delayed_extent_op *extent_op;
7355                extent_op = btrfs_alloc_delayed_extent_op();
7356                BUG_ON(!extent_op); /* -ENOMEM */
7357                if (key)
7358                        memcpy(&extent_op->key, key, sizeof(extent_op->key));
7359                else
7360                        memset(&extent_op->key, 0, sizeof(extent_op->key));
7361                extent_op->flags_to_set = flags;
7362                if (skinny_metadata)
7363                        extent_op->update_key = 0;
7364                else
7365                        extent_op->update_key = 1;
7366                extent_op->update_flags = 1;
7367                extent_op->is_data = 0;
7368                extent_op->level = level;
7369
7370                ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
7371                                        ins.objectid,
7372                                        ins.offset, parent, root_objectid,
7373                                        level, BTRFS_ADD_DELAYED_EXTENT,
7374                                        extent_op, 0);
7375                BUG_ON(ret); /* -ENOMEM */
7376        }
7377        return buf;
7378}
7379
7380struct walk_control {
7381        u64 refs[BTRFS_MAX_LEVEL];
7382        u64 flags[BTRFS_MAX_LEVEL];
7383        struct btrfs_key update_progress;
7384        int stage;
7385        int level;
7386        int shared_level;
7387        int update_ref;
7388        int keep_locks;
7389        int reada_slot;
7390        int reada_count;
7391        int for_reloc;
7392};
7393
7394#define DROP_REFERENCE  1
7395#define UPDATE_BACKREF  2
7396
7397static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
7398                                     struct btrfs_root *root,
7399                                     struct walk_control *wc,
7400                                     struct btrfs_path *path)
7401{
7402        u64 bytenr;
7403        u64 generation;
7404        u64 refs;
7405        u64 flags;
7406        u32 nritems;
7407        u32 blocksize;
7408        struct btrfs_key key;
7409        struct extent_buffer *eb;
7410        int ret;
7411        int slot;
7412        int nread = 0;
7413
7414        if (path->slots[wc->level] < wc->reada_slot) {
7415                wc->reada_count = wc->reada_count * 2 / 3;
7416                wc->reada_count = max(wc->reada_count, 2);
7417        } else {
7418                wc->reada_count = wc->reada_count * 3 / 2;
7419                wc->reada_count = min_t(int, wc->reada_count,
7420                                        BTRFS_NODEPTRS_PER_BLOCK(root));
7421        }
7422
7423        eb = path->nodes[wc->level];
7424        nritems = btrfs_header_nritems(eb);
7425        blocksize = btrfs_level_size(root, wc->level - 1);
7426
7427        for (slot = path->slots[wc->level]; slot < nritems; slot++) {
7428                if (nread >= wc->reada_count)
7429                        break;
7430
7431                cond_resched();
7432                bytenr = btrfs_node_blockptr(eb, slot);
7433                generation = btrfs_node_ptr_generation(eb, slot);
7434
7435                if (slot == path->slots[wc->level])
7436                        goto reada;
7437
7438                if (wc->stage == UPDATE_BACKREF &&
7439                    generation <= root->root_key.offset)
7440                        continue;
7441
7442                /* We don't lock the tree block, it's OK to be racy here */
7443                ret = btrfs_lookup_extent_info(trans, root, bytenr,
7444                                               wc->level - 1, 1, &refs,
7445                                               &flags);
7446                /* We don't care about errors in readahead. */
7447                if (ret < 0)
7448                        continue;
7449                BUG_ON(refs == 0);
7450
7451                if (wc->stage == DROP_REFERENCE) {
7452                        if (refs == 1)
7453                                goto reada;
7454
7455                        if (wc->level == 1 &&
7456                            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7457                                continue;
7458                        if (!wc->update_ref ||
7459                            generation <= root->root_key.offset)
7460                                continue;
7461                        btrfs_node_key_to_cpu(eb, &key, slot);
7462                        ret = btrfs_comp_cpu_keys(&key,
7463                                                  &wc->update_progress);
7464                        if (ret < 0)
7465                                continue;
7466                } else {
7467                        if (wc->level == 1 &&
7468                            (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7469                                continue;
7470                }
7471reada:
7472                ret = readahead_tree_block(root, bytenr, blocksize,
7473                                           generation);
7474                if (ret)
7475                        break;
7476                nread++;
7477        }
7478        wc->reada_slot = slot;
7479}
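
/*
 * Editor's sketch (pure-function restatement): the adaptive readahead
 * window used above.  When the walk restarts behind the previous
 * readahead position the window shrinks by a third (never below 2);
 * otherwise it grows by half, clamped to the number of node pointers per
 * block.  MODEL_NODEPTRS_PER_BLOCK stands in for
 * BTRFS_NODEPTRS_PER_BLOCK(root).
 */
#define MODEL_NODEPTRS_PER_BLOCK 493	/* illustrative value only */

static int next_reada_count(int count, int behind_last_reada)
{
	if (behind_last_reada) {
		count = count * 2 / 3;
		return count > 2 ? count : 2;
	}
	count = count * 3 / 2;
	return count < MODEL_NODEPTRS_PER_BLOCK ?
	       count : MODEL_NODEPTRS_PER_BLOCK;
}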
7480
7481/*
7482 * helper to process tree block while walking down the tree.
7483 *
7484 * when wc->stage == UPDATE_BACKREF, this function updates
7485 * back refs for pointers in the block.
7486 *
7487 * NOTE: return value 1 means we should stop walking down.
7488 */
7489static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
7490                                   struct btrfs_root *root,
7491                                   struct btrfs_path *path,
7492                                   struct walk_control *wc, int lookup_info)
7493{
7494        int level = wc->level;
7495        struct extent_buffer *eb = path->nodes[level];
7496        u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
7497        int ret;
7498
7499        if (wc->stage == UPDATE_BACKREF &&
7500            btrfs_header_owner(eb) != root->root_key.objectid)
7501                return 1;
7502
7503        /*
7504         * when reference count of tree block is 1, it won't increase
7505         * again. once full backref flag is set, we never clear it.
7506         */
7507        if (lookup_info &&
7508            ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
7509             (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
7510                BUG_ON(!path->locks[level]);
7511                ret = btrfs_lookup_extent_info(trans, root,
7512                                               eb->start, level, 1,
7513                                               &wc->refs[level],
7514                                               &wc->flags[level]);
7515                BUG_ON(ret == -ENOMEM);
7516                if (ret)
7517                        return ret;
7518                BUG_ON(wc->refs[level] == 0);
7519        }
7520
7521        if (wc->stage == DROP_REFERENCE) {
7522                if (wc->refs[level] > 1)
7523                        return 1;
7524
7525                if (path->locks[level] && !wc->keep_locks) {
7526                        btrfs_tree_unlock_rw(eb, path->locks[level]);
7527                        path->locks[level] = 0;
7528                }
7529                return 0;
7530        }
7531
7532        /* wc->stage == UPDATE_BACKREF */
7533        if (!(wc->flags[level] & flag)) {
7534                BUG_ON(!path->locks[level]);
7535                ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
7536                BUG_ON(ret); /* -ENOMEM */
7537                ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
7538                BUG_ON(ret); /* -ENOMEM */
7539                ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
7540                                                  eb->len, flag,
7541                                                  btrfs_header_level(eb), 0);
7542                BUG_ON(ret); /* -ENOMEM */
7543                wc->flags[level] |= flag;
7544        }
7545
7546        /*
7547         * the block is shared by multiple trees, so it's not good to
7548         * keep the tree lock
7549         */
7550        if (path->locks[level] && level > 0) {
7551                btrfs_tree_unlock_rw(eb, path->locks[level]);
7552                path->locks[level] = 0;
7553        }
7554        return 0;
7555}
7556
7557/*
7558 * helper to process tree block pointer.
7559 *
7560 * when wc->stage == DROP_REFERENCE, this function checks
7561 * reference count of the block pointed to. if the block
7562 * is shared and we need update back refs for the subtree
7563 * rooted at the block, this function changes wc->stage to
7564 * UPDATE_BACKREF. if the block is shared and there is no
7565 * need to update back, this function drops the reference
7566 * to the block.
7567 *
7568 * NOTE: return value 1 means we should stop walking down.
7569 */
7570static noinline int do_walk_down(struct btrfs_trans_handle *trans,
7571                                 struct btrfs_root *root,
7572                                 struct btrfs_path *path,
7573                                 struct walk_control *wc, int *lookup_info)
7574{
7575        u64 bytenr;
7576        u64 generation;
7577        u64 parent;
7578        u32 blocksize;
7579        struct btrfs_key key;
7580        struct extent_buffer *next;
7581        int level = wc->level;
7582        int reada = 0;
7583        int ret = 0;
7584
7585        generation = btrfs_node_ptr_generation(path->nodes[level],
7586                                               path->slots[level]);
7587        /*
7588         * if the lower level block was created before the snapshot
7589         * was created, we know there is no need to update back refs
7590         * for the subtree
7591         */
7592        if (wc->stage == UPDATE_BACKREF &&
7593            generation <= root->root_key.offset) {
7594                *lookup_info = 1;
7595                return 1;
7596        }
7597
7598        bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
7599        blocksize = btrfs_level_size(root, level - 1);
7600
7601        next = btrfs_find_tree_block(root, bytenr, blocksize);
7602        if (!next) {
7603                next = btrfs_find_create_tree_block(root, bytenr, blocksize);
7604                if (!next)
7605                        return -ENOMEM;
7606                btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
7607                                               level - 1);
7608                reada = 1;
7609        }
7610        btrfs_tree_lock(next);
7611        btrfs_set_lock_blocking(next);
7612
7613        ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
7614                                       &wc->refs[level - 1],
7615                                       &wc->flags[level - 1]);
7616        if (ret < 0) {
7617                btrfs_tree_unlock(next);
7618                return ret;
7619        }
7620
7621        if (unlikely(wc->refs[level - 1] == 0)) {
7622                btrfs_err(root->fs_info, "Missing references.");
7623                BUG();
7624        }
7625        *lookup_info = 0;
7626
7627        if (wc->stage == DROP_REFERENCE) {
7628                if (wc->refs[level - 1] > 1) {
7629                        if (level == 1 &&
7630                            (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7631                                goto skip;
7632
7633                        if (!wc->update_ref ||
7634                            generation <= root->root_key.offset)
7635                                goto skip;
7636
7637                        btrfs_node_key_to_cpu(path->nodes[level], &key,
7638                                              path->slots[level]);
7639                        ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
7640                        if (ret < 0)
7641                                goto skip;
7642
7643                        wc->stage = UPDATE_BACKREF;
7644                        wc->shared_level = level - 1;
7645                }
7646        } else {
7647                if (level == 1 &&
7648                    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7649                        goto skip;
7650        }
7651
7652        if (!btrfs_buffer_uptodate(next, generation, 0)) {
7653                btrfs_tree_unlock(next);
7654                free_extent_buffer(next);
7655                next = NULL;
7656                *lookup_info = 1;
7657        }
7658
7659        if (!next) {
7660                if (reada && level == 1)
7661                        reada_walk_down(trans, root, wc, path);
7662                next = read_tree_block(root, bytenr, blocksize, generation);
7663                if (!next || !extent_buffer_uptodate(next)) {
7664                        free_extent_buffer(next);
7665                        return -EIO;
7666                }
7667                btrfs_tree_lock(next);
7668                btrfs_set_lock_blocking(next);
7669        }
7670
7671        level--;
7672        BUG_ON(level != btrfs_header_level(next));
7673        path->nodes[level] = next;
7674        path->slots[level] = 0;
7675        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7676        wc->level = level;
7677        if (wc->level == 1)
7678                wc->reada_slot = 0;
7679        return 0;
7680skip:
7681        wc->refs[level - 1] = 0;
7682        wc->flags[level - 1] = 0;
7683        if (wc->stage == DROP_REFERENCE) {
7684                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
7685                        parent = path->nodes[level]->start;
7686                } else {
7687                        BUG_ON(root->root_key.objectid !=
7688                               btrfs_header_owner(path->nodes[level]));
7689                        parent = 0;
7690                }
7691
7692                ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
7693                                root->root_key.objectid, level - 1, 0, 0);
7694                BUG_ON(ret); /* -ENOMEM */
7695        }
7696        btrfs_tree_unlock(next);
7697        free_extent_buffer(next);
7698        *lookup_info = 1;
7699        return 1;
7700}
7701
7702/*
7703 * helper to process tree block while walking up the tree.
7704 *
7705 * when wc->stage == DROP_REFERENCE, this function drops
7706 * reference count on the block.
7707 *
7708 * when wc->stage == UPDATE_BACKREF, this function changes
7709 * wc->stage back to DROP_REFERENCE if we changed wc->stage
7710 * to UPDATE_BACKREF previously while processing the block.
7711 *
7712 * NOTE: return value 1 means we should stop walking up.
7713 */
7714static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
7715                                 struct btrfs_root *root,
7716                                 struct btrfs_path *path,
7717                                 struct walk_control *wc)
7718{
7719        int ret;
7720        int level = wc->level;
7721        struct extent_buffer *eb = path->nodes[level];
7722        u64 parent = 0;
7723
7724        if (wc->stage == UPDATE_BACKREF) {
7725                BUG_ON(wc->shared_level < level);
7726                if (level < wc->shared_level)
7727                        goto out;
7728
7729                ret = find_next_key(path, level + 1, &wc->update_progress);
7730                if (ret > 0)
7731                        wc->update_ref = 0;
7732
7733                wc->stage = DROP_REFERENCE;
7734                wc->shared_level = -1;
7735                path->slots[level] = 0;
7736
7737                /*
7738                 * check reference count again if the block isn't locked.
7739                 * we should start walking down the tree again if reference
7740                 * count is one.
7741                 */
7742                if (!path->locks[level]) {
7743                        BUG_ON(level == 0);
7744                        btrfs_tree_lock(eb);
7745                        btrfs_set_lock_blocking(eb);
7746                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7747
7748                        ret = btrfs_lookup_extent_info(trans, root,
7749                                                       eb->start, level, 1,
7750                                                       &wc->refs[level],
7751                                                       &wc->flags[level]);
7752                        if (ret < 0) {
7753                                btrfs_tree_unlock_rw(eb, path->locks[level]);
7754                                path->locks[level] = 0;
7755                                return ret;
7756                        }
7757                        BUG_ON(wc->refs[level] == 0);
7758                        if (wc->refs[level] == 1) {
7759                                btrfs_tree_unlock_rw(eb, path->locks[level]);
7760                                path->locks[level] = 0;
7761                                return 1;
7762                        }
7763                }
7764        }
7765
7766        /* wc->stage == DROP_REFERENCE */
7767        BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
7768
7769        if (wc->refs[level] == 1) {
7770                if (level == 0) {
7771                        if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7772                                ret = btrfs_dec_ref(trans, root, eb, 1,
7773                                                    wc->for_reloc);
7774                        else
7775                                ret = btrfs_dec_ref(trans, root, eb, 0,
7776                                                    wc->for_reloc);
7777                        BUG_ON(ret); /* -ENOMEM */
7778                }
7779                /* make block locked assertion in clean_tree_block happy */
7780                if (!path->locks[level] &&
7781                    btrfs_header_generation(eb) == trans->transid) {
7782                        btrfs_tree_lock(eb);
7783                        btrfs_set_lock_blocking(eb);
7784                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7785                }
7786                clean_tree_block(trans, root, eb);
7787        }
7788
7789        if (eb == root->node) {
7790                if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7791                        parent = eb->start;
7792                else
7793                        BUG_ON(root->root_key.objectid !=
7794                               btrfs_header_owner(eb));
7795        } else {
7796                if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7797                        parent = path->nodes[level + 1]->start;
7798                else
7799                        BUG_ON(root->root_key.objectid !=
7800                               btrfs_header_owner(path->nodes[level + 1]));
7801        }
7802
7803        btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
7804out:
7805        wc->refs[level] = 0;
7806        wc->flags[level] = 0;
7807        return 0;
7808}
7809
7810static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
7811                                   struct btrfs_root *root,
7812                                   struct btrfs_path *path,
7813                                   struct walk_control *wc)
7814{
7815        int level = wc->level;
7816        int lookup_info = 1;
7817        int ret;
7818
7819        while (level >= 0) {
7820                ret = walk_down_proc(trans, root, path, wc, lookup_info);
7821                if (ret > 0)
7822                        break;
7823
7824                if (level == 0)
7825                        break;
7826
7827                if (path->slots[level] >=
7828                    btrfs_header_nritems(path->nodes[level]))
7829                        break;
7830
7831                ret = do_walk_down(trans, root, path, wc, &lookup_info);
7832                if (ret > 0) {
7833                        path->slots[level]++;
7834                        continue;
7835                } else if (ret < 0)
7836                        return ret;
7837                level = wc->level;
7838        }
7839        return 0;
7840}
7841
7842static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
7843                                 struct btrfs_root *root,
7844                                 struct btrfs_path *path,
7845                                 struct walk_control *wc, int max_level)
7846{
7847        int level = wc->level;
7848        int ret;
7849
7850        path->slots[level] = btrfs_header_nritems(path->nodes[level]);
7851        while (level < max_level && path->nodes[level]) {
7852                wc->level = level;
7853                if (path->slots[level] + 1 <
7854                    btrfs_header_nritems(path->nodes[level])) {
7855                        path->slots[level]++;
7856                        return 0;
7857                } else {
7858                        ret = walk_up_proc(trans, root, path, wc);
7859                        if (ret > 0)
7860                                return 0;
7861
7862                        if (path->locks[level]) {
7863                                btrfs_tree_unlock_rw(path->nodes[level],
7864                                                     path->locks[level]);
7865                                path->locks[level] = 0;
7866                        }
7867                        free_extent_buffer(path->nodes[level]);
7868                        path->nodes[level] = NULL;
7869                        level++;
7870                }
7871        }
7872        return 1;
7873}
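
/*
 * Editor's sketch (hypothetical callbacks): how the two helpers above
 * cooperate.  A caller such as btrfs_drop_snapshot() alternates between
 * them; walk_down_tree() descends as far as it can, walk_up_tree() frees
 * and climbs, and a positive return from the latter means the walk has
 * passed max_level and the whole tree has been processed.
 */
static int model_walk_tree(int (*down)(void), int (*up)(void))
{
	int ret;

	while (1) {
		ret = down();
		if (ret < 0)
			return ret;
		ret = up();
		if (ret < 0)
			return ret;
		if (ret > 0)
			return 0;	/* finished the whole tree */
	}
}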
7874
7875/*
7876 * drop a subvolume tree.
7877 *
7878 * this function traverses the tree freeing any blocks that are only
7879 * referenced by the tree.
7880 *
7881 * when a shared tree block is found, this function decreases its
7882 * reference count by one. if update_ref is true, this function
7883 * also makes sure backrefs for the shared block and all lower level
7884 * blocks are properly updated.
7885 *
7886 * If called with for_reloc == 0, may exit early with -EAGAIN
7887 */
7888int btrfs_drop_snapshot(struct btrfs_root *root,
7889                         struct btrfs_block_rsv *block_rsv, int update_ref,
7890                         int for_reloc)
7891{
7892        struct btrfs_path *path;
7893        struct btrfs_trans_handle *trans;
7894        struct btrfs_root *tree_root = root->fs_info->tree_root;
7895        struct btrfs_root_item *root_item = &root->root_item;
7896        struct walk_control *wc;
7897        struct btrfs_key key;
7898        int err = 0;
7899        int ret;
7900        int level;
7901        bool root_dropped = false;
7902
7903        path = btrfs_alloc_path();
7904        if (!path) {
7905                err = -ENOMEM;
7906                goto out;
7907        }
7908
7909        wc = kzalloc(sizeof(*wc), GFP_NOFS);
7910        if (!wc) {
7911                btrfs_free_path(path);
7912                err = -ENOMEM;
7913                goto out;
7914        }
7915
7916        trans = btrfs_start_transaction(tree_root, 0);
7917        if (IS_ERR(trans)) {
7918                err = PTR_ERR(trans);
7919                goto out_free;
7920        }
7921
7922        if (block_rsv)
7923                trans->block_rsv = block_rsv;
7924
7925        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
7926                level = btrfs_header_level(root->node);
7927                path->nodes[level] = btrfs_lock_root_node(root);
7928                btrfs_set_lock_blocking(path->nodes[level]);
7929                path->slots[level] = 0;
7930                path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7931                memset(&wc->update_progress, 0,
7932                       sizeof(wc->update_progress));
7933        } else {
7934                btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
7935                memcpy(&wc->update_progress, &key,
7936                       sizeof(wc->update_progress));
7937
7938                level = root_item->drop_level;
7939                BUG_ON(level == 0);
7940                path->lowest_level = level;
7941                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7942                path->lowest_level = 0;
7943                if (ret < 0) {
7944                        err = ret;
7945                        goto out_end_trans;
7946                }
7947                WARN_ON(ret > 0);
7948
7949                /*
7950                 * unlock our path, this is safe because only this
7951                 * function is allowed to delete this snapshot
7952                 */
7953                btrfs_unlock_up_safe(path, 0);
7954
7955                level = btrfs_header_level(root->node);
7956                while (1) {
7957                        btrfs_tree_lock(path->nodes[level]);
7958                        btrfs_set_lock_blocking(path->nodes[level]);
7959                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7960
7961                        ret = btrfs_lookup_extent_info(trans, root,
7962                                                path->nodes[level]->start,
7963                                                level, 1, &wc->refs[level],
7964                                                &wc->flags[level]);
7965                        if (ret < 0) {
7966                                err = ret;
7967                                goto out_end_trans;
7968                        }
7969                        BUG_ON(wc->refs[level] == 0);
7970
7971                        if (level == root_item->drop_level)
7972                                break;
7973
7974                        btrfs_tree_unlock(path->nodes[level]);
7975                        path->locks[level] = 0;
7976                        WARN_ON(wc->refs[level] != 1);
7977                        level--;
7978                }
7979        }
7980
7981        wc->level = level;
7982        wc->shared_level = -1;
7983        wc->stage = DROP_REFERENCE;
7984        wc->update_ref = update_ref;
7985        wc->keep_locks = 0;
7986        wc->for_reloc = for_reloc;
7987        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7988
7989        while (1) {
7990
7991                ret = walk_down_tree(trans, root, path, wc);
7992                if (ret < 0) {
7993                        err = ret;
7994                        break;
7995                }
7996
7997                ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
7998                if (ret < 0) {
7999                        err = ret;
8000                        break;
8001                }
8002
8003                if (ret > 0) {
8004                        BUG_ON(wc->stage != DROP_REFERENCE);
8005                        break;
8006                }
8007
8008                if (wc->stage == DROP_REFERENCE) {
8009                        level = wc->level;
8010                        btrfs_node_key(path->nodes[level],
8011                                       &root_item->drop_progress,
8012                                       path->slots[level]);
8013                        root_item->drop_level = level;
8014                }
8015
8016                BUG_ON(wc->level == 0);
8017                if (btrfs_should_end_transaction(trans, tree_root) ||
8018                    (!for_reloc && btrfs_need_cleaner_sleep(root))) {
8019                        ret = btrfs_update_root(trans, tree_root,
8020                                                &root->root_key,
8021                                                root_item);
8022                        if (ret) {
8023                                btrfs_abort_transaction(trans, tree_root, ret);
8024                                err = ret;
8025                                goto out_end_trans;
8026                        }
8027
8028                        btrfs_end_transaction_throttle(trans, tree_root);
8029                        if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
8030                                pr_debug("BTRFS: drop snapshot early exit\n");
8031                                err = -EAGAIN;
8032                                goto out_free;
8033                        }
8034
8035                        trans = btrfs_start_transaction(tree_root, 0);
8036                        if (IS_ERR(trans)) {
8037                                err = PTR_ERR(trans);
8038                                goto out_free;
8039                        }
8040                        if (block_rsv)
8041                                trans->block_rsv = block_rsv;
8042                }
8043        }
8044        btrfs_release_path(path);
8045        if (err)
8046                goto out_end_trans;
8047
8048        ret = btrfs_del_root(trans, tree_root, &root->root_key);
8049        if (ret) {
8050                btrfs_abort_transaction(trans, tree_root, ret);
8051                goto out_end_trans;
8052        }
8053
8054        if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
8055                ret = btrfs_find_root(tree_root, &root->root_key, path,
8056                                      NULL, NULL);
8057                if (ret < 0) {
8058                        btrfs_abort_transaction(trans, tree_root, ret);
8059                        err = ret;
8060                        goto out_end_trans;
8061                } else if (ret > 0) {
8062                        /* if we fail to delete the orphan item this time
8063                         * around, it'll get picked up the next time.
8064                         *
8065                         * The most common failure here is just -ENOENT.
8066                         */
8067                        btrfs_del_orphan_item(trans, tree_root,
8068                                              root->root_key.objectid);
8069                }
8070        }
8071
8072        if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
8073                btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
8074        } else {
8075                free_extent_buffer(root->node);
8076                free_extent_buffer(root->commit_root);
8077                btrfs_put_fs_root(root);
8078        }
8079        root_dropped = true;
8080out_end_trans:
8081        btrfs_end_transaction_throttle(trans, tree_root);
8082out_free:
8083        kfree(wc);
8084        btrfs_free_path(path);
8085out:
8086        /*
8087         * So if we need to stop dropping the snapshot for whatever reason, we
8088         * need to make sure to add it back to the dead root list so that we
8089         * keep trying to do the work later.  This also cleans up roots if we
8090         * don't have it in the radix (like when we recover after a power fail
8091         * or unmount) so we don't leak memory.
8092         */
8093        if (!for_reloc && root_dropped == false)
8094                btrfs_add_dead_root(root);
8095        if (err && err != -EAGAIN)
8096                btrfs_std_error(root->fs_info, err);
8097        return err;
8098}
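
/*
 * Editor's sketch (userspace model, hypothetical struct): the
 * checkpointing scheme used above.  Before each transaction ends, the
 * current node key and level are stored in the root item's drop_progress
 * and drop_level; a later invocation sees a non-zero progress key,
 * searches back down to it and resumes, so an interrupted snapshot drop
 * never repeats work it has already committed.
 */
struct model_drop_progress { unsigned long long objectid; int level; };

static void model_checkpoint(struct model_drop_progress *p,
			     unsigned long long node_key, int level)
{
	p->objectid = node_key;		/* root_item->drop_progress */
	p->level = level;		/* root_item->drop_level */
}

static int model_resume_level(const struct model_drop_progress *p)
{
	/* a zero objectid means the drop never started: walk from the root */
	return p->objectid == 0 ? -1 : p->level;
}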
8099
8100/*
8101 * drop subtree rooted at tree block 'node'.
8102 *
8103 * NOTE: this function will unlock and release tree block 'node'
8104 * only used by relocation code
8105 */
8106int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
8107                        struct btrfs_root *root,
8108                        struct extent_buffer *node,
8109                        struct extent_buffer *parent)
8110{
8111        struct btrfs_path *path;
8112        struct walk_control *wc;
8113        int level;
8114        int parent_level;
8115        int ret = 0;
8116        int wret;
8117
8118        BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
8119
8120        path = btrfs_alloc_path();
8121        if (!path)
8122                return -ENOMEM;
8123
8124        wc = kzalloc(sizeof(*wc), GFP_NOFS);
8125        if (!wc) {
8126                btrfs_free_path(path);
8127                return -ENOMEM;
8128        }
8129
8130        btrfs_assert_tree_locked(parent);
8131        parent_level = btrfs_header_level(parent);
8132        extent_buffer_get(parent);
8133        path->nodes[parent_level] = parent;
8134        path->slots[parent_level] = btrfs_header_nritems(parent);
8135
8136        btrfs_assert_tree_locked(node);
8137        level = btrfs_header_level(node);
8138        path->nodes[level] = node;
8139        path->slots[level] = 0;
8140        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8141
8142        wc->refs[parent_level] = 1;
8143        wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
8144        wc->level = level;
8145        wc->shared_level = -1;
8146        wc->stage = DROP_REFERENCE;
8147        wc->update_ref = 0;
8148        wc->keep_locks = 1;
8149        wc->for_reloc = 1;
8150        wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
8151
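            /*
             * Alternate walking down into the subtree and back up until
             * walk_up_tree() returns nonzero, meaning we either climbed
             * back up to parent_level or hit an error.
             */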
8152        while (1) {
8153                wret = walk_down_tree(trans, root, path, wc);
8154                if (wret < 0) {
8155                        ret = wret;
8156                        break;
8157                }
8158
8159                wret = walk_up_tree(trans, root, path, wc, parent_level);
8160                if (wret < 0)
8161                        ret = wret;
8162                if (wret != 0)
8163                        break;
8164        }
8165
8166        kfree(wc);
8167        btrfs_free_path(path);
8168        return ret;
8169}
8170
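    /*
     * Adjust a block group's profile flags to something the current
     * device count can honor.  For example, with only one usable device
     * RAID1/RAID10 degrade to DUP and RAID0 degrades to single; with
     * more devices, DUP is upgraded to RAID1.
     */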
8171static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
8172{
8173        u64 num_devices;
8174        u64 stripped;
8175
8176        /*
8177         * If a restripe for this chunk type is in progress, pick the
8178         * target profile and return it; otherwise do the usual conversion.
8179         */
8180        stripped = get_restripe_target(root->fs_info, flags);
8181        if (stripped)
8182                return extended_to_chunk(stripped);
8183
8184        /*
8185         * we add in the count of missing devices because we want
8186         * to make sure that any RAID levels on a degraded FS
8187         * continue to be honored.
8188         */
8189        num_devices = root->fs_info->fs_devices->rw_devices +
8190                root->fs_info->fs_devices->missing_devices;
8191
8192        stripped = BTRFS_BLOCK_GROUP_RAID0 |
8193                BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
8194                BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
8195
8196        if (num_devices == 1) {
8197                stripped |= BTRFS_BLOCK_GROUP_DUP;
8198                stripped = flags & ~stripped;
8199
8200                /* turn raid0 into single device chunks */
8201                if (flags & BTRFS_BLOCK_GROUP_RAID0)
8202                        return stripped;
8203
8204                /* turn mirroring into duplication */
8205                if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
8206                             BTRFS_BLOCK_GROUP_RAID10))
8207                        return stripped | BTRFS_BLOCK_GROUP_DUP;
8208        } else {
8209                /* they already had raid on here, just return */
8210                if (flags & stripped)
8211                        return flags;
8212
8213                stripped |= BTRFS_BLOCK_GROUP_DUP;
8214                stripped = flags & ~stripped;
8215
8216                /* switch duplicated blocks with raid1 */
8217                if (flags & BTRFS_BLOCK_GROUP_DUP)
8218                        return stripped | BTRFS_BLOCK_GROUP_RAID1;
8219
8220                /* this is drive concat, leave it alone */
8221        }
8222
8223        return flags;
8224}
8225
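    /*
     * Try to mark a block group read-only.  This only succeeds if the
     * owning space_info can absorb the group's unused bytes, roughly:
     *
     *   used + reserved + pinned + may_use + readonly + num_bytes +
     *   min_allocable_bytes <= total_bytes
     *
     * where num_bytes is the free space of this group that stops being
     * allocatable once the group goes read-only.
     */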
8226static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
8227{
8228        struct btrfs_space_info *sinfo = cache->space_info;
8229        u64 num_bytes;
8230        u64 min_allocable_bytes;
8231        int ret = -ENOSPC;
8232
8234        /*
8235         * We need some metadata space and system metadata space for
8236         * allocating chunks in some corner cases, so unless we are forced,
8237         * require a minimum of allocatable space before going read-only.
8238         */
8239        if ((sinfo->flags &
8240             (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
8241            !force)
8242                min_allocable_bytes = 1 * 1024 * 1024;
8243        else
8244                min_allocable_bytes = 0;
8245
8246        spin_lock(&sinfo->lock);
8247        spin_lock(&cache->lock);
8248
8249        if (cache->ro) {
8250                ret = 0;
8251                goto out;
8252        }
8253
8254        num_bytes = cache->key.offset - cache->reserved - cache->pinned -
8255                    cache->bytes_super - btrfs_block_group_used(&cache->item);
8256
8257        if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
8258            sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
8259            min_allocable_bytes <= sinfo->total_bytes) {
8260                sinfo->bytes_readonly += num_bytes;
8261                cache->ro = 1;
8262                ret = 0;
8263        }
8264out:
8265        spin_unlock(&cache->lock);
8266        spin_unlock(&sinfo->lock);
8267        return ret;
8268}
8269
8270int btrfs_set_block_group_ro(struct btrfs_root *root,
8271                             struct btrfs_block_group_cache *cache)
8273{
8274        struct btrfs_trans_handle *trans;
8275        u64 alloc_flags;
8276        int ret;
8277
8278        BUG_ON(cache->ro);
8279
8280        trans = btrfs_join_transaction(root);
8281        if (IS_ERR(trans))
8282                return PTR_ERR(trans);
8283
8284        alloc_flags = update_block_group_flags(root, cache->flags);
8285        if (alloc_flags != cache->flags) {
8286                ret = do_chunk_alloc(trans, root, alloc_flags,
8287                                     CHUNK_ALLOC_FORCE);
8288                if (ret < 0)
8289                        goto out;
8290        }
8291
8292        ret = set_block_group_ro(cache, 0);
8293        if (!ret)
8294                goto out;
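            /*
             * Couldn't absorb this group's free space: force-allocate a
             * new chunk for this space_info's profile and retry once.
             */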
8295        alloc_flags = get_alloc_profile(root, cache->space_info->flags);
8296        ret = do_chunk_alloc(trans, root, alloc_flags,
8297                             CHUNK_ALLOC_FORCE);
8298        if (ret < 0)
8299                goto out;
8300        ret = set_block_group_ro(cache, 0);
8301out:
8302        btrfs_end_transaction(trans, root);
8303        return ret;
8304}
8305
8306int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
8307                            struct btrfs_root *root, u64 type)
8308{
8309        u64 alloc_flags = get_alloc_profile(root, type);
8310        return do_chunk_alloc(trans, root, alloc_flags,
8311                              CHUNK_ALLOC_FORCE);
8312}
8313
8314/*
8315 * Helper to account the unused space of all the read-only block groups in
8316 * the list.  Takes mirrors into account.
8317 */
8318static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
8319{
8320        struct btrfs_block_group_cache *block_group;
8321        u64 free_bytes = 0;
8322        int factor;
8323
8324        list_for_each_entry(block_group, groups_list, list) {
8325                spin_lock(&block_group->lock);
8326
8327                if (!block_group->ro) {
8328                        spin_unlock(&block_group->lock);
8329                        continue;
8330                }
8331
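                    /*
                     * Mirrored profiles store two copies of every byte,
                     * so their unused space frees twice as many raw bytes.
                     */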
8332                if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
8333                                          BTRFS_BLOCK_GROUP_RAID10 |
8334                                          BTRFS_BLOCK_GROUP_DUP))
8335                        factor = 2;
8336                else
8337                        factor = 1;
8338
8339                free_bytes += (block_group->key.offset -
8340                               btrfs_block_group_used(&block_group->item)) *
8341                               factor;
8342
8343                spin_unlock(&block_group->lock);
8344        }
8345
8346        return free_bytes;
8347}
8348
8349/*
8350 * Helper to account the unused space of all the read-only block groups in
8351 * the space_info.  Takes mirrors into account.
8352 */
8353u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
8354{
8355        int i;
8356        u64 free_bytes = 0;
8357
8358        spin_lock(&sinfo->lock);
8359
8360        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
8361                if (!list_empty(&sinfo->block_groups[i]))
8362                        free_bytes += __btrfs_get_ro_block_group_free_space(
8363                                                &sinfo->block_groups[i]);
8364
8365        spin_unlock(&sinfo->lock);
8366
8367        return free_bytes;
8368}
8369
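    /*
     * Undo set_block_group_ro(): remove the group's unused bytes from the
     * space_info's read-only accounting and clear the ro flag.
     */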
8370void btrfs_set_block_group_rw(struct btrfs_root *root,
8371                              struct btrfs_block_group_cache *cache)
8372{
8373        struct btrfs_space_info *sinfo = cache->space_info;
8374        u64 num_bytes;
8375
8376        BUG_ON(!cache->ro);
8377
8378        spin_lock(&sinfo->lock);
8379        spin_lock(&cache->lock);
8380        num_bytes = cache->key.offset - cache->reserved - cache->pinned -
8381                    cache->bytes_super - btrfs_block_group_used(&cache->item);
8382        sinfo->bytes_readonly -= num_bytes;
8383        cache->ro = 0;
8384        spin_unlock(&cache->lock);
8385        spin_unlock(&sinfo->lock);
8386}
8387
8388/*
8389 * Checks to see if it's even possible to relocate this block group.
8390 *
8391 * @return - -1 if it's not a good idea to relocate this block group, 0 if
8392 * it's ok to go ahead and try.
8393 */
8394int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
8395{
8396        struct btrfs_block_group_cache *block_group;
8397        struct btrfs_space_info *space_info;
8398        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
8399        struct btrfs_device *device;
8400        struct btrfs_trans_handle *trans;
8401        u64 min_free;
8402        u64 dev_min = 1;
8403        u64 dev_nr = 0;
8404        u64 target;
8405        int index;
8406        int full = 0;
8407        int ret = 0;
8408
8409        block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
8410
8411        /* odd, couldn't find the block group, leave it alone */
8412        if (!block_group)
8413                return -1;
8414
8415        min_free = btrfs_block_group_used(&block_group->item);
8416
8417        /* no bytes used, we're good */
8418        if (!min_free)
8419                goto out;
8420
8421        space_info = block_group->space_info;
8422        spin_lock(&space_info->lock);
8423
8424        full = space_info->full;
8425
8426        /*
8427         * If this is the last block group we have in this space, we can't
8428         * relocate it unless we're able to allocate a new chunk below.
8429         *
8430         * Otherwise, make sure the space_info has room to handle all of the
8431         * extents from this block group.  If it does, we're good.
8432         */
8433        if ((space_info->total_bytes != block_group->key.offset) &&
8434            (space_info->bytes_used + space_info->bytes_reserved +
8435             space_info->bytes_pinned + space_info->bytes_readonly +
8436             min_free < space_info->total_bytes)) {
8437                spin_unlock(&space_info->lock);
8438                goto out;
8439        }
8440        spin_unlock(&space_info->lock);
8441
8442        /*
8443         * OK, we don't have enough space, but maybe we have free space on
8444         * our devices to allocate new chunks for relocation, so loop through
8445         * our alloc devices and guess if we have enough space.  If this
8446         * block group is going to be restriped, run the checks against the
8447         * target profile instead of the current one.
8448         */
8449        ret = -1;
8450
8451        /*
8452         * index:
8453         *      0: raid10
8454         *      1: raid1
8455         *      2: dup
8456         *      3: raid0
8457         *      4: single
8458         */
8459        target = get_restripe_target(root->fs_info, block_group->flags);
8460        if (target) {
8461                index = __get_raid_index(extended_to_chunk(target));
8462        } else {
8463                /*
8464                 * this is just a balance, so if we were marked as full
8465                 * we know there is no space for a new chunk
8466                 */
8467                if (full)
8468                        goto out;
8469
8470                index = get_block_group_index(block_group);
8471        }
8472
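            /*
             * Scale the requirement by profile: RAID10 needs at least 4
             * devices but stripes across mirrors, so each device only has
             * to fit half of min_free; DUP puts both copies on a single
             * device, so min_free doubles; RAID0 spreads the data evenly
             * across all rw devices.
             */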
8473        if (index == BTRFS_RAID_RAID10) {
8474                dev_min = 4;
8475                /* Divide by 2 */
8476                min_free >>= 1;
8477        } else if (index == BTRFS_RAID_RAID1) {
8478                dev_min = 2;
8479        } else if (index == BTRFS_RAID_DUP) {
8480                /* Multiply by 2 */
8481                min_free <<= 1;
8482        } else if (index == BTRFS_RAID_RAID0) {
8483                dev_min = fs_devices->rw_devices;
8484                do_div(min_free, dev_min);
8485        }
8486
8487        /* We need to do this so that we can look at pending chunks */
8488        trans = btrfs_join_transaction(root);
8489        if (IS_ERR(trans)) {
8490                ret = PTR_ERR(trans);
8491                goto out;
8492        }
8493
8494        mutex_lock(&root->fs_info->chunk_mutex);
8495        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
8496                u64 dev_offset;
8497
8498                /*
8499                 * check to make sure we can actually find a chunk with enough
8500                 * space to fit our block group in.
8501                 */
8502                if (device->total_bytes > device->bytes_used + min_free &&
8503                    !device->is_tgtdev_for_dev_replace) {
8504                        ret = find_free_dev_extent(trans, device, min_free,
8505                                                   &dev_offset, NULL);
8506                        if (!ret)
8507                                dev_nr++;
8508
8509                        if (dev_nr >= dev_min)
8510                                break;
8511
8512                        ret = -1;
8513                }
8514        }
8515        mutex_unlock(&root->fs_info->chunk_mutex);
8516        btrfs_end_transaction(trans, root);
8517out:
8518        btrfs_put_block_group(block_group);
8519        return ret;
8520}
8521
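    /*
     * Position 'path' at the first BLOCK_GROUP_ITEM at or after
     * key->objectid.  Returns 0 if one is found, a positive value if the
     * tree is exhausted, or a negative errno on error.
     */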
8522static int find_first_block_group(struct btrfs_root *root,
8523                struct btrfs_path *path, struct btrfs_key *key)
8524{
8525        int ret = 0;
8526        struct btrfs_key found_key;
8527        struct extent_buffer *leaf;
8528        int slot;
8529
8530        ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
8531        if (ret < 0)
8532                goto out;
8533
8534        while (1) {
8535                slot = path->slots[0];
8536                leaf = path->nodes[0];
8537                if (slot >= btrfs_header_nritems(leaf)) {
8538                        ret = btrfs_next_leaf(root, path);
8539                        if (ret == 0)
8540                                continue;
8541                        if (ret < 0)
8542                                goto out;
8543                        break;
8544                }
8545                btrfs_item_key_to_cpu(leaf, &found_key, slot);
8546
8547                if (found_key.objectid >= key->objectid &&
8548                    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
8549                        ret = 0;
8550                        goto out;
8551                }
8552                path->slots[0]++;
8553        }
8554out:
8555        return ret;
8556}
8557
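    /*
     * Drop the free space cache inode reference held by each block group.
     * The scan restarts from logical offset 0 until a full pass finds no
     * block group still holding an iref.
     */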
8558void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
8559{
8560        struct btrfs_block_group_cache *block_group;
8561        u64 last = 0;
8562
8563        while (1) {
8564                struct inode *inode;
8565
8566                block_group = btrfs_lookup_first_block_group(info, last);
8567                while (block_group) {
8568                        spin_lock(&block_group->lock);
8569                        if (block_group->iref)
8570                                break;
8571                        spin_unlock(&block_group->lock);
8572                        block_group = next_block_group(info->tree_root,
8573                                                       block_group);
8574                }
8575                if (!block_group) {
8576                        if (last == 0)
8577                                break;
8578                        last = 0;
8579                        continue;
8580                }
8581
8582                inode = block_group->inode;
8583                block_group->iref = 0;
8584                block_group->inode = NULL;
8585                spin_unlock(&block_group->lock);
8586                iput(inode);
8587                last = block_group->key.objectid + block_group->key.offset;
8588                btrfs_put_block_group(block_group);
8589        }
8590}
8591
8592int btrfs_free_block_groups(struct btrfs_fs_info *info)
8593{
8594        struct btrfs_block_group_cache *block_group;
8595        struct btrfs_space_info *space_info;
8596        struct btrfs_caching_control *caching_ctl;
8597        struct rb_node *n;
8598
8599        down_write(&info->commit_root_sem);
8600        while (!list_empty(&info->caching_block_groups)) {
8601                caching_ctl = list_entry(info->caching_block_groups.next,
8602                                         struct btrfs_caching_control, list);
8603                list_del(&caching_ctl->list);
8604                put_caching_control(caching_ctl);
8605        }
8606        up_write(&info->commit_root_sem);
8607
8608        spin_lock(&info->block_group_cache_lock);
8609        while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
8610                block_group = rb_entry(n, struct btrfs_block_group_cache,
8611                                       cache_node);
8612                rb_erase(&block_group->cache_node,
8613                         &info->block_group_cache_tree);
8614                spin_unlock(&info->block_group_cache_lock);
8615
8616                down_write(&block_group->space_info->groups_sem);
8617                list_del(&block_group->list);
8618                up_write(&block_group->space_info->groups_sem);
8619
8620                if (block_group->cached == BTRFS_CACHE_STARTED)
8621                        wait_block_group_cache_done(block_group);
8622
8623                /*
8624                 * We haven't cached this block group, which means we may
8625                 * still have excluded extents marked on this block group.
8626                 */
8627                if (block_group->cached == BTRFS_CACHE_NO ||
8628                    block_group->cached == BTRFS_CACHE_ERROR)
8629                        free_excluded_extents(info->extent_root, block_group);
8630
8631                btrfs_remove_free_space_cache(block_group);
8632                btrfs_put_block_group(block_group);
8633
8634                spin_lock(&info->block_group_cache_lock);
8635        }
8636        spin_unlock(&info->block_group_cache_lock);
8637
8638        /*
8639         * Now that all the block groups are freed, go through and free all
8640         * the space_info structs.  This is only called during the final
8641         * stages of unmount, and so we know nobody is using them.  We call
8642         * synchronize_rcu() once before we start, just to be on the safe side.
8643         */
8644        synchronize_rcu();
8645
8646        release_global_block_rsv(info);
8647
8648        while (!list_empty(&info->space_info)) {
8649                int i;
8650
8651                space_info = list_entry(info->space_info.next,
8652                                        struct btrfs_space_info,
8653                                        list);
8654                if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
8655                        if (WARN_ON(space_info->bytes_pinned > 0 ||
8656                            space_info->bytes_reserved > 0 ||
8657                            space_info->bytes_may_use > 0)) {
8658                                dump_space_info(space_info, 0, 0);
8659                        }
8660                }
8661                list_del(&space_info->list);
8662                for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
8663                        struct kobject *kobj;
8664                        kobj = space_info->block_group_kobjs[i];
8665                        space_info->block_group_kobjs[i] = NULL;
8666                        if (kobj) {
8667                                kobject_del(kobj);
8668                                kobject_put(kobj);
8669                        }
8670                }
8671                kobject_del(&space_info->kobj);
8672                kobject_put(&space_info->kobj);
8673        }
8674        return 0;
8675}
8676
8677static void __link_block_group(struct btrfs_space_info *space_info,
8678                               struct btrfs_block_group_cache *cache)
8679{
8680        int index = get_block_group_index(cache);
8681        bool first = false;
8682
8683        down_write(&space_info->groups_sem);
8684        if (list_empty(&space_info->block_groups[index]))
8685                first = true;
8686        list_add_tail(&cache->list, &space_info->block_groups[index]);
8687        up_write(&space_info->groups_sem);
8688
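            /*
             * The first block group of each raid type lazily creates the
             * raid kobject under the space_info's sysfs directory; later
             * groups of the same type share it.
             */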
8689        if (first) {
8690                struct raid_kobject *rkobj;
8691                int ret;
8692
8693                rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
8694                if (!rkobj)
8695                        goto out_err;
8696                rkobj->raid_type = index;
8697                kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
8698                ret = kobject_add(&rkobj->kobj, &space_info->kobj,
8699                                  "%s", get_raid_name(index));
8700                if (ret) {
8701                        kobject_put(&rkobj->kobj);
8702                        goto out_err;
8703                }
8704                space_info->block_group_kobjs[index] = &rkobj->kobj;
8705        }
8706
8707        return;
8708out_err:
8709        pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
8710}
8711
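    /*
     * Allocate and initialize an in-memory block group descriptor
     * covering [start, start + size).  Returns NULL on allocation
     * failure, which callers map to -ENOMEM.
     */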
8712static struct btrfs_block_group_cache *
8713btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
8714{
8715        struct btrfs_block_group_cache *cache;
8716
8717        cache = kzalloc(sizeof(*cache), GFP_NOFS);
8718        if (!cache)
8719                return NULL;
8720
8721        cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
8722                                        GFP_NOFS);
8723        if (!cache->free_space_ctl) {
8724                kfree(cache);
8725                return NULL;
8726        }
8727
8728        cache->key.objectid = start;
8729        cache->key.offset = size;
8730        cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8731
8732        cache->sectorsize = root->sectorsize;
8733        cache->fs_info = root->fs_info;
8734        cache->full_stripe_len = btrfs_full_stripe_len(root,
8735                                               &root->fs_info->mapping_tree,
8736                                               start);
8737        atomic_set(&cache->count, 1);
8738        spin_lock_init(&cache->lock);
8739        init_rwsem(&cache->data_rwsem);
8740        INIT_LIST_HEAD(&cache->list);
8741        INIT_LIST_HEAD(&cache->cluster_list);
8742        INIT_LIST_HEAD(&cache->new_bg_list);
8743        btrfs_init_free_space_ctl(cache);
8744
8745        return cache;
8746}
8747
8748int btrfs_read_block_groups(struct btrfs_root *root)
8749{
8750        struct btrfs_path *path;
8751        int ret;
8752        struct btrfs_block_group_cache *cache;
8753        struct btrfs_fs_info *info = root->fs_info;
8754        struct btrfs_space_info *space_info;
8755        struct btrfs_key key;
8756        struct btrfs_key found_key;
8757        struct extent_buffer *leaf;
8758        int need_clear = 0;
8759        u64 cache_gen;
8760
8761        root = info->extent_root;
8762        key.objectid = 0;
8763        key.offset = 0;
8764        btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
8765        path = btrfs_alloc_path();
8766        if (!path)
8767                return -ENOMEM;
8768        path->reada = 1;
8769
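            /*
             * A cache generation that doesn't match the superblock
             * generation means the on-disk free space cache is stale and
             * must be rebuilt; mounting with CLEAR_CACHE forces the same.
             */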
8770        cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
8771        if (btrfs_test_opt(root, SPACE_CACHE) &&
8772            btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
8773                need_clear = 1;
8774        if (btrfs_test_opt(root, CLEAR_CACHE))
8775                need_clear = 1;
8776
8777        while (1) {
8778                ret = find_first_block_group(root, path, &key);
8779                if (ret > 0)
8780                        break;
8781                if (ret != 0)
8782                        goto error;
8783
8784                leaf = path->nodes[0];
8785                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
8786
8787                cache = btrfs_create_block_group_cache(root, found_key.objectid,
8788                                                       found_key.offset);
8789                if (!cache) {
8790                        ret = -ENOMEM;
8791                        goto error;
8792                }
8793
8794                if (need_clear) {
8795                        /*
8796                         * When we mount with an old space cache, we need to
8797                         * set BTRFS_DC_CLEAR and set the dirty flag.
8798                         *
8799                         * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
8800                         *    truncate the old free space cache inode and
8801                         *    set up a new one.
8802                         * b) Setting the dirty flag makes sure that we flush
8803                         *    the new space cache info onto disk.
8804                         */
8805                        cache->disk_cache_state = BTRFS_DC_CLEAR;
8806                        if (btrfs_test_opt(root, SPACE_CACHE))
8807                                cache->dirty = 1;
8808                }
8809
8810                read_extent_buffer(leaf, &cache->item,
8811                                   btrfs_item_ptr_offset(leaf, path->slots[0]),
8812                                   sizeof(cache->item));
8813                cache->flags = btrfs_block_group_flags(&cache->item);
8814
8815                key.objectid = found_key.objectid + found_key.offset;
8816                btrfs_release_path(path);
8817
8818                /*
8819                 * We need to exclude the super stripes now so that the space
8820                 * info has super bytes accounted for, otherwise we'll think
8821                 * we have more space than we actually do.
8822                 */
8823                ret = exclude_super_stripes(root, cache);
8824                if (ret) {
8825                        /*
8826                         * We may have excluded something, so call this just in
8827                         * case.
8828                         */
8829                        free_excluded_extents(root, cache);
8830                        btrfs_put_block_group(cache);
8831                        goto error;
8832                }
8833
8834                /*
8835                 * Check for two cases: either we are full, and therefore
8836                 * don't need to bother with the caching work since we won't
8837                 * find any space, or we are empty, and we can just add all
8838                 * the space in and be done with it.  This saves us a lot of
8839                 * time, particularly in the full case.
8840                 */
8841                if (found_key.offset == btrfs_block_group_used(&cache->item)) {
8842                        cache->last_byte_to_unpin = (u64)-1;
8843                        cache->cached = BTRFS_CACHE_FINISHED;
8844                        free_excluded_extents(root, cache);
8845                } else if (btrfs_block_group_used(&cache->item) == 0) {
8846                        cache->last_byte_to_unpin = (u64)-1;
8847                        cache->cached = BTRFS_CACHE_FINISHED;
8848                        add_new_free_space(cache, root->fs_info,
8849                                           found_key.objectid,
8850                                           found_key.objectid +
8851                                           found_key.offset);
8852                        free_excluded_extents(root, cache);
8853                }
8854
8855                ret = btrfs_add_block_group_cache(root->fs_info, cache);
8856                if (ret) {
8857                        btrfs_remove_free_space_cache(cache);
8858                        btrfs_put_block_group(cache);
8859                        goto error;
8860                }
8861
8862                ret = update_space_info(info, cache->flags, found_key.offset,
8863                                        btrfs_block_group_used(&cache->item),
8864                                        &space_info);
8865                if (ret) {
8866                        btrfs_remove_free_space_cache(cache);
8867                        spin_lock(&info->block_group_cache_lock);
8868                        rb_erase(&cache->cache_node,
8869                                 &info->block_group_cache_tree);
8870                        spin_unlock(&info->block_group_cache_lock);
8871                        btrfs_put_block_group(cache);
8872                        goto error;
8873                }
8874
8875                cache->space_info = space_info;
8876                spin_lock(&cache->space_info->lock);
8877                cache->space_info->bytes_readonly += cache->bytes_super;
8878                spin_unlock(&cache->space_info->lock);
8879
8880                __link_block_group(space_info, cache);
8881
8882                set_avail_alloc_bits(root->fs_info, cache->flags);
8883                if (btrfs_chunk_readonly(root, cache->key.objectid))
8884                        set_block_group_ro(cache, 1);
8885        }
8886
8887        list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
8888                if (!(get_alloc_profile(root, space_info->flags) &
8889                      (BTRFS_BLOCK_GROUP_RAID10 |
8890                       BTRFS_BLOCK_GROUP_RAID1 |
8891                       BTRFS_BLOCK_GROUP_RAID5 |
8892                       BTRFS_BLOCK_GROUP_RAID6 |
8893                       BTRFS_BLOCK_GROUP_DUP)))
8894                        continue;
8895                /*
8896                 * Avoid allocating from un-mirrored block groups if there
8897                 * are mirrored block groups.
8898                 */
8899                list_for_each_entry(cache,
8900                                &space_info->block_groups[BTRFS_RAID_RAID0],
8901                                list)
8902                        set_block_group_ro(cache, 1);
8903                list_for_each_entry(cache,
8904                                &space_info->block_groups[BTRFS_RAID_SINGLE],
8905                                list)
8906                        set_block_group_ro(cache, 1);
8907        }
8908
8909        init_global_block_rsv(info);
8910        ret = 0;
8911error:
8912        btrfs_free_path(path);
8913        return ret;
8914}
8915
8916void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
8917                                       struct btrfs_root *root)
8918{
8919        struct btrfs_block_group_cache *block_group, *tmp;
8920        struct btrfs_root *extent_root = root->fs_info->extent_root;
8921        struct btrfs_block_group_item item;
8922        struct btrfs_key key;
8923        int ret = 0;
8924
8925        list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
8926                                 new_bg_list) {
8927                list_del_init(&block_group->new_bg_list);
8928
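                    /*
                     * If inserting an earlier group aborted the
                     * transaction, keep draining the list but skip the
                     * btree updates.
                     */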
8929                if (ret)
8930                        continue;
8931
8932                spin_lock(&block_group->lock);
8933                memcpy(&item, &block_group->item, sizeof(item));
8934                memcpy(&key, &block_group->key, sizeof(key));
8935                spin_unlock(&block_group->lock);
8936
8937                ret = btrfs_insert_item(trans, extent_root, &key, &item,
8938                                        sizeof(item));
8939                if (ret)
8940                        btrfs_abort_transaction(trans, extent_root, ret);
8941                ret = btrfs_finish_chunk_alloc(trans, extent_root,
8942                                               key.objectid, key.offset);
8943                if (ret)
8944                        btrfs_abort_transaction(trans, extent_root, ret);
8945        }
8946}
8947
8948int btrfs_make_block_group(struct btrfs_trans_handle *trans,
8949                           struct btrfs_root *root, u64 bytes_used,
8950                           u64 type, u64 chunk_objectid, u64 chunk_offset,
8951                           u64 size)
8952{
8953        int ret;
8954        struct btrfs_root *extent_root;
8955        struct btrfs_block_group_cache *cache;
8956
8957        extent_root = root->fs_info->extent_root;
8958
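            /*
             * Tree log replay can't deal with a block group created in a
             * transaction that wasn't fully committed, so force any fsync
             * in this transaction to do a full commit.
             */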
8959        btrfs_set_log_full_commit(root->fs_info, trans);
8960
8961        cache = btrfs_create_block_group_cache(root, chunk_offset, size);
8962        if (!cache)
8963                return -ENOMEM;
8964
8965        btrfs_set_block_group_used(&cache->item, bytes_used);
8966        btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
8967        btrfs_set_block_group_flags(&cache->item, type);
8968
8969        cache->flags = type;
8970        cache->last_byte_to_unpin = (u64)-1;
8971        cache->cached = BTRFS_CACHE_FINISHED;
8972        ret = exclude_super_stripes(root, cache);
8973        if (ret) {
8974                /*
8975                 * We may have excluded something, so call this just in
8976                 * case.
8977                 */
8978                free_excluded_extents(root, cache);
8979                btrfs_put_block_group(cache);
8980                return ret;
8981        }
8982
8983        add_new_free_space(cache, root->fs_info, chunk_offset,
8984                           chunk_offset + size);
8985
8986        free_excluded_extents(root, cache);
8987
8988        ret = btrfs_add_block_group_cache(root->fs_info, cache);
8989        if (ret) {
8990                btrfs_remove_free_space_cache(cache);
8991                btrfs_put_block_group(cache);
8992                return ret;
8993        }
8994
8995        ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
8996                                &cache->space_info);
8997        if (ret) {
8998                btrfs_remove_free_space_cache(cache);
8999                spin_lock(&root->fs_info->block_group_cache_lock);
9000                rb_erase(&cache->cache_node,
9001                         &root->fs_info->block_group_cache_tree);
9002                spin_unlock(&root->fs_info->block_group_cache_lock);
9003                btrfs_put_block_group(cache);
9004                return ret;
9005        }
9006        update_global_block_rsv(root->fs_info);
9007
9008        spin_lock(&cache->space_info->lock);
9009        cache->space_info->bytes_readonly += cache->bytes_super;
9010        spin_unlock(&cache->space_info->lock);
9011
9012        __link_block_group(cache->space_info, cache);
9013
9014        list_add_tail(&cache->new_bg_list, &trans->new_bgs);
9015
9016        set_avail_alloc_bits(extent_root->fs_info, type);
9017
9018        return 0;
9019}
9020
9021static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
9022{
9023        u64 extra_flags = chunk_to_extended(flags) &
9024                                BTRFS_EXTENDED_PROFILE_MASK;
9025
9026        write_seqlock(&fs_info->profiles_lock);
9027        if (flags & BTRFS_BLOCK_GROUP_DATA)
9028                fs_info->avail_data_alloc_bits &= ~extra_flags;
9029        if (flags & BTRFS_BLOCK_GROUP_METADATA)
9030                fs_info->avail_metadata_alloc_bits &= ~extra_flags;
9031        if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
9032                fs_info->avail_system_alloc_bits &= ~extra_flags;
9033        write_sequnlock(&fs_info->profiles_lock);
9034}
9035
9036int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9037                             struct btrfs_root *root, u64 group_start)
9038{
9039        struct btrfs_path *path;
9040        struct btrfs_block_group_cache *block_group;
9041        struct btrfs_free_cluster *cluster;
9042        struct btrfs_root *tree_root = root->fs_info->tree_root;
9043        struct btrfs_key key;
9044        struct inode *inode;
9045        struct kobject *kobj = NULL;
9046        int ret;
9047        int index;
9048        int factor;
9049
9050        root = root->fs_info->extent_root;
9051
9052        block_group = btrfs_lookup_block_group(root->fs_info, group_start);
9053        BUG_ON(!block_group);
9054        BUG_ON(!block_group->ro);
9055
9056        /*
9057         * Free the reserved super bytes from this block group before
9058         * removing it.
9059         */
9060        free_excluded_extents(root, block_group);
9061
9062        memcpy(&key, &block_group->key, sizeof(key));
9063        index = get_block_group_index(block_group);
9064        if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
9065                                  BTRFS_BLOCK_GROUP_RAID1 |
9066                                  BTRFS_BLOCK_GROUP_RAID10))
9067                factor = 2;
9068        else
9069                factor = 1;
9070
9071        /* make sure this block group isn't part of an allocation cluster */
9072        cluster = &root->fs_info->data_alloc_cluster;
9073        spin_lock(&cluster->refill_lock);
9074        btrfs_return_cluster_to_free_space(block_group, cluster);
9075        spin_unlock(&cluster->refill_lock);
9076
9077        /*
9078         * make sure this block group isn't part of a metadata
9079         * allocation cluster
9080         */
9081        cluster = &root->fs_info->meta_alloc_cluster;
9082        spin_lock(&cluster->refill_lock);
9083        btrfs_return_cluster_to_free_space(block_group, cluster);
9084        spin_unlock(&cluster->refill_lock);
9085
9086        path = btrfs_alloc_path();
9087        if (!path) {
9088                ret = -ENOMEM;
9089                goto out;
9090        }
9091
9092        inode = lookup_free_space_inode(tree_root, block_group, path);
9093        if (!IS_ERR(inode)) {
9094                ret = btrfs_orphan_add(trans, inode);
9095                if (ret) {
9096                        btrfs_add_delayed_iput(inode);
9097                        goto out;
9098                }
9099                clear_nlink(inode);
9100                /* One for the block group's ref */
9101                spin_lock(&block_group->lock);
9102                if (block_group->iref) {
9103                        block_group->iref = 0;
9104                        block_group->inode = NULL;
9105                        spin_unlock(&block_group->lock);
9106                        iput(inode);
9107                } else {
9108                        spin_unlock(&block_group->lock);
9109                }
9110                /* One for our lookup ref */
9111                btrfs_add_delayed_iput(inode);
9112        }
9113
9114        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
9115        key.offset = block_group->key.objectid;
9116        key.type = 0;
9117
9118        ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
9119        if (ret < 0)
9120                goto out;
9121        if (ret > 0)
9122                btrfs_release_path(path);
9123        if (ret == 0) {
9124                ret = btrfs_del_item(trans, tree_root, path);
9125                if (ret)
9126                        goto out;
9127                btrfs_release_path(path);
9128        }
9129
9130        spin_lock(&root->fs_info->block_group_cache_lock);
9131        rb_erase(&block_group->cache_node,
9132                 &root->fs_info->block_group_cache_tree);
9133
9134        if (root->fs_info->first_logical_byte == block_group->key.objectid)
9135                root->fs_info->first_logical_byte = (u64)-1;
9136        spin_unlock(&root->fs_info->block_group_cache_lock);
9137
9138        down_write(&block_group->space_info->groups_sem);
9139        /*
9140         * We must use list_del_init so people can check to see if they
9141         * are still on the list after taking the semaphore.
9142         */
9143        list_del_init(&block_group->list);
9144        if (list_empty(&block_group->space_info->block_groups[index])) {
9145                kobj = block_group->space_info->block_group_kobjs[index];
9146                block_group->space_info->block_group_kobjs[index] = NULL;
9147                clear_avail_alloc_bits(root->fs_info, block_group->flags);
9148        }
9149        up_write(&block_group->space_info->groups_sem);
9150        if (kobj) {
9151                kobject_del(kobj);
9152                kobject_put(kobj);
9153        }
9154
9155        if (block_group->cached == BTRFS_CACHE_STARTED)
9156                wait_block_group_cache_done(block_group);
9157
9158        btrfs_remove_free_space_cache(block_group);
9159
9160        spin_lock(&block_group->space_info->lock);
9161        block_group->space_info->total_bytes -= block_group->key.offset;
9162        block_group->space_info->bytes_readonly -= block_group->key.offset;
9163        block_group->space_info->disk_total -= block_group->key.offset * factor;
9164        spin_unlock(&block_group->space_info->lock);
9165
9166        memcpy(&key, &block_group->key, sizeof(key));
9167
9168        btrfs_clear_space_info_full(root->fs_info);
9169
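            /*
             * Once for the block group cache tree (erased above), once
             * for our lookup reference.
             */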
9170        btrfs_put_block_group(block_group);
9171        btrfs_put_block_group(block_group);
9172
9173        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
9174        if (ret > 0)
9175                ret = -EIO;
9176        if (ret < 0)
9177                goto out;
9178
9179        ret = btrfs_del_item(trans, root, path);
9180out:
9181        btrfs_free_path(path);
9182        return ret;
9183}
9184
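    /*
     * Create the standard space_info structures at mount time.  With the
     * MIXED_GROUPS incompat feature, data and metadata share a single
     * space_info; otherwise each gets its own, plus one for system chunks.
     */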
9185int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
9186{
9187        struct btrfs_space_info *space_info;
9188        struct btrfs_super_block *disk_super;
9189        u64 features;
9190        u64 flags;
9191        int mixed = 0;
9192        int ret;
9193
9194        disk_super = fs_info->super_copy;
9195        if (!btrfs_super_root(disk_super))
9196                return 1;
9197
9198        features = btrfs_super_incompat_flags(disk_super);
9199        if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
9200                mixed = 1;
9201
9202        flags = BTRFS_BLOCK_GROUP_SYSTEM;
9203        ret = update_space_info(fs_info, flags, 0, 0, &space_info);
9204        if (ret)
9205                goto out;
9206
9207        if (mixed) {
9208                flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
9209                ret = update_space_info(fs_info, flags, 0, 0, &space_info);
9210        } else {
9211                flags = BTRFS_BLOCK_GROUP_METADATA;
9212                ret = update_space_info(fs_info, flags, 0, 0, &space_info);
9213                if (ret)
9214                        goto out;
9215
9216                flags = BTRFS_BLOCK_GROUP_DATA;
9217                ret = update_space_info(fs_info, flags, 0, 0, &space_info);
9218        }
9219out:
9220        return ret;
9221}
9222
9223int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
9224{
9225        return unpin_extent_range(root, start, end);
9226}
9227
9228int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
9229                               u64 num_bytes, u64 *actual_bytes)
9230{
9231        return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
9232}
9233
9234int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
9235{
9236        struct btrfs_fs_info *fs_info = root->fs_info;
9237        struct btrfs_block_group_cache *cache = NULL;
9238        u64 group_trimmed;
9239        u64 start;
9240        u64 end;
9241        u64 trimmed = 0;
9242        u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
9243        int ret = 0;
9244
9245        /*
9246         * Try to trim all FS space; block groups may start at a non-zero offset.
9247         */
9248        if (range->len == total_bytes)
9249                cache = btrfs_lookup_first_block_group(fs_info, range->start);
9250        else
9251                cache = btrfs_lookup_block_group(fs_info, range->start);
9252
9253        while (cache) {
9254                if (cache->key.objectid >= (range->start + range->len)) {
9255                        btrfs_put_block_group(cache);
9256                        break;
9257                }
9258
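                    /* Clamp the trim range to this block group's boundaries. */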
9259                start = max(range->start, cache->key.objectid);
9260                end = min(range->start + range->len,
9261                                cache->key.objectid + cache->key.offset);
9262
9263                if (end - start >= range->minlen) {
9264                        if (!block_group_cache_done(cache)) {
9265                                ret = cache_block_group(cache, 0);
9266                                if (ret) {
9267                                        btrfs_put_block_group(cache);
9268                                        break;
9269                                }
9270                                ret = wait_block_group_cache_done(cache);
9271                                if (ret) {
9272                                        btrfs_put_block_group(cache);
9273                                        break;
9274                                }
9275                        }
9276                        ret = btrfs_trim_block_group(cache,
9277                                                     &group_trimmed,
9278                                                     start,
9279                                                     end,
9280                                                     range->minlen);
9281
9282                        trimmed += group_trimmed;
9283                        if (ret) {
9284                                btrfs_put_block_group(cache);
9285                                break;
9286                        }
9287                }
9288
9289                cache = next_block_group(fs_info->tree_root, cache);
9290        }
9291
9292        range->len = trimmed;
9293        return ret;
9294}
9295
9296/*
9297 * btrfs_{start,end}_nocow_write() are similar to mnt_{want,drop}_write();
9298 * they are used to prevent tasks from writing data into the page cache
9299 * via nocow while a snapshot of the subvolume is being created, so that
9300 * nocow data gets flushed to disk as part of the snapshot creation.
9301 */
9302void btrfs_end_nocow_write(struct btrfs_root *root)
9303{
9304        percpu_counter_dec(&root->subv_writers->counter);
9305        /*
9306         * Make sure counter is updated before we wake up
9307         * waiters.
9308         */
9309        smp_mb();
9310        if (waitqueue_active(&root->subv_writers->wait))
9311                wake_up(&root->subv_writers->wait);
9312}
9313
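    /*
     * Returns 1 if the caller may go ahead with a nocow write, or 0 if a
     * snapshot is pending and the caller must fall back to COW.
     */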
9314int btrfs_start_nocow_write(struct btrfs_root *root)
9315{
9316        if (unlikely(atomic_read(&root->will_be_snapshoted)))
9317                return 0;
9318
9319        percpu_counter_inc(&root->subv_writers->counter);
9320        /*
9321         * Make sure counter is updated before we check for snapshot creation.
9322         */
9323        smp_mb();
9324        if (unlikely(atomic_read(&root->will_be_snapshoted))) {
9325                btrfs_end_nocow_write(root);
9326                return 0;
9327        }
9328        return 1;
9329}
9330